Deploy latest build
- assets/index-CHouKU67.js +0 -0
- assets/vite-DcBtz0py-DcBtz0py.svg +1 -0
- index.html +2 -2
- package.json +6 -1
- src/App.jsx +32 -5
- src/clapProcessor.js +100 -38
assets/index-CHouKU67.js
ADDED
The diff for this file is too large to render. See raw diff

assets/vite-DcBtz0py-DcBtz0py.svg
ADDED
index.html
CHANGED
@@ -2,10 +2,10 @@
 <html lang="en">
   <head>
     <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="./assets/vite-DcBtz0py.svg" />
+    <link rel="icon" type="image/svg+xml" href="./assets/vite-DcBtz0py-DcBtz0py.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>🎵 clip-tagger</title>
-    <script type="module" crossorigin src="./assets/index-…
+    <script type="module" crossorigin src="./assets/index-CHouKU67.js"></script>
     <link rel="stylesheet" crossorigin href="./assets/index-F_aFpJd-.css">
   </head>
   <body>
package.json
CHANGED
@@ -7,7 +7,12 @@
     "dev": "vite",
     "build": "vite build",
     "lint": "eslint .",
-    "preview": "vite preview"
+    "preview": "vite preview",
+    "deploy": "npm run build && cp -r dist/* . && git add . && git commit -m 'Deploy latest build' && git push origin master && git push hf master:main",
+    "deploy-github": "git add . && git commit -m 'Update source code' && git push origin master",
+    "deploy-hf": "npm run build && cp -r dist/* . && git add . && git commit -m 'Deploy to HF Spaces' && git push hf master:main",
+    "clean": "rm -rf dist node_modules",
+    "fresh": "npm run clean && npm install && npm run build"
   },
   "dependencies": {
     "@xenova/transformers": "^2.17.2",
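A note on the new scripts, as a hedged reading rather than anything stated in the diff: deploy builds with Vite, copies dist/* into the repository root so the committed index.html and assets/ track the latest build, then pushes master to the GitHub remote (origin) and master:main to a Hugging Face remote (hf); both remotes are assumed to be configured already. deploy-github and deploy-hf split those two targets, and clean/fresh give a from-scratch rebuild. This copy-into-root flow would also explain the doubled hash in vite-DcBtz0py-DcBtz0py.svg above: an icon whose filename already carried a Vite content hash appears to have been fed through a later build and hashed a second time.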
src/App.jsx
CHANGED
@@ -211,10 +211,31 @@ function App() {
   }

   const handleAddCustomTag = async () => {
-    …
+    const trimmedTag = newTag.trim().toLowerCase()
+
+    // Validation
+    if (!trimmedTag) {
+      setError('Please enter a tag name')
+      return
+    }
+
+    if (trimmedTag.length < 2) {
+      setError('Tag must be at least 2 characters long')
+      return
+    }
+
+    // Check if tag already exists
+    const existingTag = tags.find(tag => tag.label.toLowerCase() === trimmedTag)
+    if (existingTag) {
+      setError(`Tag "${trimmedTag}" already exists`)
+      return
+    }
+
+    // Clear any previous errors
+    setError(null)

     const customTag = {
-      label: …
+      label: trimmedTag,
       confidence: 1.0,
       userFeedback: 'custom',
       isCustom: true,
@@ -224,23 +245,29 @@ function App() {
     setTags(prev => [...prev, customTag])

     try {
-      …
-      …
+      if (feedbackStoreRef.current) {
+        await feedbackStoreRef.current.saveCustomTag(trimmedTag)
+        if (audioHash) {
+          await feedbackStoreRef.current.saveTagFeedback(trimmedTag, 'custom', audioHash)
+        }
+      }

       // Train local classifier on custom tag
       if (localClassifierRef.current && audioFeatures) {
         const simpleFeatures = localClassifierRef.current.extractSimpleFeatures(audioFeatures)
         localClassifierRef.current.trainOnFeedback(
           simpleFeatures,
-          …
+          trimmedTag,
           'custom'
         )
         localClassifierRef.current.saveModel()
       }

       loadCustomTags()
+      console.log(`✅ Added custom tag: "${trimmedTag}"`)
     } catch (error) {
       console.error('Error saving custom tag:', error)
+      setError('Failed to save custom tag')
     }

     setNewTag('')
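As a hedged illustration (not part of this commit), the inline checks above can be read as one pure validation step; the helper name validateCustomTag and its return shape are hypothetical, while the rules and error messages mirror the diff:

// Hypothetical helper, not in the diff: the same validation rules as
// handleAddCustomTag, factored into a pure, unit-testable function.
function validateCustomTag(rawInput, tags) {
  const label = rawInput.trim().toLowerCase()

  if (!label) return { ok: false, error: 'Please enter a tag name' }
  if (label.length < 2) return { ok: false, error: 'Tag must be at least 2 characters long' }
  if (tags.some(t => t.label.toLowerCase() === label)) {
    return { ok: false, error: `Tag "${label}" already exists` }
  }

  return { ok: true, label }
}

Calling this at the top of handleAddCustomTag would leave the handler with only state updates and persistence.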
src/clapProcessor.js
CHANGED
@@ -2,7 +2,8 @@ import { pipeline } from '@xenova/transformers';

 class CLAPProcessor {
   constructor() {
-    this.…
+    this.classifier = null;
+    this.isInitialized = false;
     this.defaultLabels = [
       'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
       'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',
@@ -18,75 +19,136 @@ class CLAPProcessor {
   }

   async initialize() {
-    if (this.…
+    if (this.isInitialized) return;

     try {
-      console.log('Loading CLAP model...');
-      …
-      …
-      …
+      console.log('🚀 Loading CLAP model (this may take a moment)...');
+
+      // Create a zero-shot audio classification pipeline
+      this.classifier = await pipeline(
+        'zero-shot-audio-classification',
+        'Xenova/clap-htsat-unfused',
+        {
+          // Optional: specify device and other configs
+          device: 'webgpu', // fallback to cpu if webgpu not available
+        }
+      );
+
+      this.isInitialized = true;
+      console.log('✅ CLAP model loaded successfully!');
     } catch (error) {
-      console.error('Failed to load CLAP model:', error);
-      throw error;
+      console.error('❌ Failed to load CLAP model:', error);
+      throw new Error(`Failed to initialize CLAP model: ${error.message}`);
     }
   }

   async processAudio(audioBuffer) {
-    …
+    console.log('🎵 Starting audio processing...');
+
+    if (!this.isInitialized) {
       await this.initialize();
     }

     try {
-      // Convert…
-      const…
-      …
+      // Convert AudioBuffer to the format expected by the model
+      const audioData = this.extractAudioData(audioBuffer);
+
+      console.log('🔍 Classifying audio with', this.defaultLabels.length, 'possible labels...');

-      …
-      const results = await this.pipeline(audio, this.defaultLabels);
+      // Run zero-shot classification
+      const results = await this.classifier(audioData, this.defaultLabels);

-      …
+      console.log('🎯 Raw CLAP results:', results);

-      …
-      const tags = results.slice(0, 5).map(result => ({
-        label: result.label,
-        confidence: result.score
-      }));
+      // Process and return top results
+      const processedTags = this.processResults(results);
+      console.log('🏷️ Processed tags:', processedTags);

-      return tags;
+      return processedTags;
+
     } catch (error) {
-      console.error('Error…
-      throw error;
+      console.error('❌ Error during audio processing:', error);
+      throw new Error(`Audio processing failed: ${error.message}`);
     }
   }

-  …
-  …
-  …
-  …
-  …
+  extractAudioData(audioBuffer) {
+    console.log('🔧 Converting audio buffer:', {
+      duration: audioBuffer.duration,
+      sampleRate: audioBuffer.sampleRate,
+      channels: audioBuffer.numberOfChannels
+    });
+
+    // Get audio data - convert to mono if needed
+    let audioArray;
+    if (audioBuffer.numberOfChannels === 1) {
+      audioArray = audioBuffer.getChannelData(0);
+    } else {
+      // Average multiple channels to mono
       const channel1 = audioBuffer.getChannelData(0);
       const channel2 = audioBuffer.getChannelData(1);
-      …
+      audioArray = new Float32Array(channel1.length);
       for (let i = 0; i < channel1.length; i++) {
-        …
+        audioArray[i] = (channel1[i] + channel2[i]) / 2;
       }
-    } else {
-      audioData = audioBuffer.getChannelData(0);
     }
-    …
-    // Return the…
+
+    // Return in the format expected by transformers.js
     return {
-      …
+      raw: audioArray,
       sampling_rate: audioBuffer.sampleRate
     };
   }

+  processResults(results) {
+    // Ensure we have results and they're in the expected format
+    if (!results || !Array.isArray(results)) {
+      console.warn('⚠️ Unexpected results format:', results);
+      return this.getFallbackTags();
+    }
+
+    // Sort by confidence and take top 5
+    const sortedResults = results
+      .sort((a, b) => b.score - a.score)
+      .slice(0, 5);
+
+    // Convert to our tag format
+    const tags = sortedResults.map(result => ({
+      label: result.label,
+      confidence: Math.max(0, Math.min(1, result.score)) // Clamp between 0 and 1
+    }));
+
+    // Ensure we have at least some tags
+    if (tags.length === 0) {
+      return this.getFallbackTags();
+    }
+
+    return tags;
+  }
+
+  getFallbackTags() {
+    return [
+      { label: 'audio', confidence: 0.9 },
+      { label: 'sound', confidence: 0.8 },
+      { label: 'recording', confidence: 0.7 }
+    ];
+  }
+
   // Convert file to AudioBuffer
   async fileToAudioBuffer(file) {
-    …
-    …
-    …
+    console.log('📁 Processing file:', file.name, 'Size:', Math.round(file.size / 1024), 'KB');
+
+    try {
+      const arrayBuffer = await file.arrayBuffer();
+      const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+
+      console.log('✅ Audio file decoded successfully');
+      return audioBuffer;
+    } catch (error) {
+      console.error('❌ Failed to decode audio file:', error);
+      throw new Error(`Failed to decode audio file: ${error.message}`);
+    }
   }
 }
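Taken together, the rewritten class decodes a file, lazy-loads the model on first use, classifies against the default labels, and post-processes to the top five tags. A minimal usage sketch, assuming the module exports the class (the export statement sits outside this hunk):

// Sketch under assumptions: the default export and the calling code
// are illustrative, not taken from the diff.
import CLAPProcessor from './clapProcessor';

const processor = new CLAPProcessor();

async function tagFile(file) {
  const audioBuffer = await processor.fileToAudioBuffer(file); // Web Audio decode
  const tags = await processor.processAudio(audioBuffer);      // loads CLAP on first call
  return tags; // up to 5 of { label, confidence }, scores clamped to [0, 1]
}

Two caveats worth flagging. First, @xenova/transformers 2.x does not act on a device option in pipeline() (WebGPU device selection arrived later, in transformers.js v3), so the device: 'webgpu' line and its fallback comment are likely inert here and inference will run on WASM/CPU. Second, extractAudioData passes the AudioBuffer's native sample rate straight through, while the CLAP feature extractor expects 48 kHz input; if that matters in practice, a resampling step can sit in front of classification. A sketch using OfflineAudioContext (hypothetical, not in the commit):

// Hypothetical pre-processing step: resample decoded audio to 48 kHz,
// the rate the CLAP feature extractor expects, before classification.
async function resampleTo48k(audioBuffer) {
  const targetRate = 48000;
  if (audioBuffer.sampleRate === targetRate) return audioBuffer;

  const ctx = new OfflineAudioContext(1, Math.ceil(audioBuffer.duration * targetRate), targetRate);
  const src = ctx.createBufferSource();
  src.buffer = audioBuffer;    // source keeps its original rate; the context resamples
  src.connect(ctx.destination);
  src.start();
  return ctx.startRendering(); // resolves to a mono 48 kHz AudioBuffer
}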