Fix CLAP model implementation with proper pipeline API
Changed files:
- assets/{index-B-zNz8Yq.js → index-5J96wndd.js} (+0 -0)
- assets/vite-DcBtz0py.svg (+1 -0)
- index.html (+2 -2)
- src/App.jsx (+12 -0)
- src/clapProcessor.js (+31 -96)

assets/{index-B-zNz8Yq.js → index-5J96wndd.js}
RENAMED
The diff for this file is too large to render.

assets/vite-DcBtz0py.svg
ADDED

index.html
CHANGED
@@ -2,10 +2,10 @@
 <html lang="en">
   <head>
     <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="./vite.svg" />
+    <link rel="icon" type="image/svg+xml" href="./assets/vite-DcBtz0py.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>🎵 clip-tagger</title>
-    <script type="module" crossorigin src="./assets/index-B-zNz8Yq.js"></script>
+    <script type="module" crossorigin src="./assets/index-5J96wndd.js"></script>
     <link rel="stylesheet" crossorigin href="./assets/index-F_aFpJd-.css">
   </head>
   <body>

src/App.jsx
CHANGED
@@ -109,8 +109,17 @@ function App() {
       const hash = await feedbackStoreRef.current.hashAudioFile(file)
       setAudioHash(hash)

+      console.log('Converting file to audio buffer...')
       const audioBuffer = await clapProcessorRef.current.fileToAudioBuffer(file)
+      console.log('Audio buffer created:', {
+        duration: audioBuffer.duration,
+        sampleRate: audioBuffer.sampleRate,
+        channels: audioBuffer.numberOfChannels
+      })
+
+      console.log('Processing audio with CLAP...')
       const generatedTags = await clapProcessorRef.current.processAudio(audioBuffer)
+      console.log('Generated tags:', generatedTags)

       // Store basic audio info for later use
       const features = {

@@ -366,6 +375,9 @@ function App() {
         {isLoading && (
           <div className="loading">
             <p>🧠 Analyzing audio with CLAP model...</p>
+            <p style={{fontSize: '0.9em', opacity: 0.8}}>
+              {tags.length === 0 ? 'Loading model (~45MB)...' : 'Processing audio...'}
+            </p>
           </div>
         )}

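App.jsx hands the uploaded file to clapProcessorRef.current.fileToAudioBuffer(file) before calling processAudio, but that helper sits outside this diff. As a rough sketch only (the decoding approach below is an assumption, not the repository's actual implementation), it could be a thin wrapper over the Web Audio API:

// Hypothetical standalone equivalent of fileToAudioBuffer (not part of this commit):
// decode an uploaded File into an AudioBuffer using the Web Audio API.
async function fileToAudioBuffer(file) {
  const arrayBuffer = await file.arrayBuffer();   // read the File into memory
  const audioContext = new AudioContext();        // decoding context
  try {
    // decodeAudioData turns compressed audio (wav/mp3/ogg) into PCM samples
    return await audioContext.decodeAudioData(arrayBuffer);
  } finally {
    await audioContext.close();                   // release audio resources
  }
}
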
src/clapProcessor.js
CHANGED
@@ -1,9 +1,8 @@
-import { pipeline …
+import { pipeline } from '@xenova/transformers';

 class CLAPProcessor {
   constructor() {
-    this.…
-    this.processor = null;
+    this.pipeline = null;
     this.defaultLabels = [
       'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
       'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',

@@ -19,13 +18,12 @@ class CLAPProcessor {
   }

   async initialize() {
-    if (this.…
+    if (this.pipeline) return;

     try {
-      …
-      …
-      this.…
-      …
+      console.log('Loading CLAP model...');
+      // Use the pipeline API which is more stable
+      this.pipeline = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');
       console.log('CLAP model loaded successfully');
     } catch (error) {
       console.error('Failed to load CLAP model:', error);

@@ -34,27 +32,28 @@ class CLAPProcessor {
   }

   async processAudio(audioBuffer) {
-    if (!this.…
+    if (!this.pipeline) {
       await this.initialize();
     }

     try {
-      // Convert audio to the format expected by …
+      // Convert audio to the format expected by the model
       const audio = await this.preprocessAudio(audioBuffer);

-      …
-      …
-      …
-      …
-      const textInputs = await this.processor.text(this.defaultLabels);
-      const textFeatures = await this.model.get_text_features(textInputs);
-
-      // …
-      const …
-
-      …
-      return this.getTopTags(similarities, 5);
+      console.log('Processing audio with CLAP...');
+
+      // Use the pipeline for zero-shot classification
+      const results = await this.pipeline(audio, this.defaultLabels);
+
+      console.log('CLAP results:', results);
+
+      // Transform results to our format
+      const tags = results.slice(0, 5).map(result => ({
+        label: result.label,
+        confidence: result.score
+      }));
+
+      return tags;
     } catch (error) {
       console.error('Error processing audio:', error);
       throw error;

@@ -62,89 +61,25 @@ class CLAPProcessor {
   }

   async preprocessAudio(audioBuffer) {
-    // Convert to mono
+    // Convert to mono and get raw audio data
     let audioData;
     if (audioBuffer.numberOfChannels > 1) {
-      …
-      …
-      …
-      …
-      …
-      …
-      audioData[i] = sum / audioBuffer.numberOfChannels;
+      // Convert stereo to mono by averaging channels
+      const channel1 = audioBuffer.getChannelData(0);
+      const channel2 = audioBuffer.getChannelData(1);
+      audioData = new Float32Array(channel1.length);
+      for (let i = 0; i < channel1.length; i++) {
+        audioData[i] = (channel1[i] + channel2[i]) / 2;
       }
     } else {
       audioData = audioBuffer.getChannelData(0);
     }

-    // …
-    …
-    …
-    …
-    }
-
-    return audioData;
-  }
-
-  async resampleAudio(audioData, originalRate, targetRate) {
-    // Simple linear interpolation resampling
-    const ratio = originalRate / targetRate;
-    const newLength = Math.round(audioData.length / ratio);
-    const resampled = new Float32Array(newLength);
-
-    for (let i = 0; i < newLength; i++) {
-      const originalIndex = i * ratio;
-      const indexFloor = Math.floor(originalIndex);
-      const indexCeil = Math.min(indexFloor + 1, audioData.length - 1);
-      const fraction = originalIndex - indexFloor;
-
-      resampled[i] = audioData[indexFloor] * (1 - fraction) + audioData[indexCeil] * fraction;
-    }
-
-    return resampled;
-  }
-
-  async calculateSimilarities(audioFeatures, textFeatures) {
-    // Calculate cosine similarity between audio and text features
-    const audioVector = audioFeatures.data;
-    const similarities = [];
-
-    for (let i = 0; i < this.defaultLabels.length; i++) {
-      const textVector = textFeatures.data.slice(
-        i * audioVector.length,
-        (i + 1) * audioVector.length
-      );
-
-      const similarity = this.cosineSimilarity(audioVector, textVector);
-      similarities.push(similarity);
-    }
-
-    return similarities;
-  }
-
-  cosineSimilarity(vecA, vecB) {
-    let dotProduct = 0;
-    let normA = 0;
-    let normB = 0;
-
-    for (let i = 0; i < vecA.length; i++) {
-      dotProduct += vecA[i] * vecB[i];
-      normA += vecA[i] * vecA[i];
-      normB += vecB[i] * vecB[i];
-    }
-
-    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
-  }
-
-  getTopTags(similarities, topK = 5) {
-    const tagged = this.defaultLabels.map((label, index) => ({
-      label,
-      confidence: Math.max(0, similarities[index]) // Ensure non-negative
-    }));
-
-    return tagged
-      .sort((a, b) => b.confidence - a.confidence)
-      .slice(0, topK);
   }

   // Convert file to AudioBuffer
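Taken together, the rewritten processor loads a single zero-shot-audio-classification pipeline and feeds it mono samples plus the candidate labels. A minimal usage sketch, assuming CLAPProcessor is the default export of src/clapProcessor.js and that a file input with id "audio-input" exists (both are assumptions, not shown in this diff):

// Hypothetical browser usage of the new pipeline-based CLAPProcessor.
import CLAPProcessor from './clapProcessor.js';   // assumed default export

const processor = new CLAPProcessor();
const file = document.querySelector('#audio-input').files[0];

// Mirrors the calls made from App.jsx above: decode the file, then classify it.
const audioBuffer = await processor.fileToAudioBuffer(file);
const tags = await processor.processAudio(audioBuffer);

// tags is an array of { label, confidence } objects, top pipeline results first.
console.log(tags);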