sohei1l committed
Commit e9a623f · 1 Parent(s): 9d3a5fa

Fix CLAP model implementation with proper pipeline API

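In short, this commit swaps the hand-rolled CLAP path (AutoProcessor + ClapAudioModelWithProjection with manual cosine similarity and resampling) for the zero-shot-audio-classification pipeline in @xenova/transformers. A minimal sketch of that pipeline API, separate from the app code below (the label strings here are placeholders, not the app's tag list):

import { pipeline } from '@xenova/transformers';

// Load once; the weights (~45MB) are downloaded and cached on first use.
const classifier = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');

// `audio` may be a URL to an audio file or a Float32Array of raw samples.
const results = await classifier(audio, ['music', 'speech', 'dog barking']);
// results: an array of { label, score } pairs, one per candidate label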
assets/{index-B-zNz8Yq.js → index-5J96wndd.js} RENAMED
The diff for this file is too large to render. See raw diff
 
assets/vite-DcBtz0py.svg ADDED
index.html CHANGED
@@ -2,10 +2,10 @@
 <html lang="en">
   <head>
     <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="./vite.svg" />
+    <link rel="icon" type="image/svg+xml" href="./assets/vite-DcBtz0py.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>🎵 clip-tagger</title>
-    <script type="module" crossorigin src="./assets/index-B-zNz8Yq.js"></script>
+    <script type="module" crossorigin src="./assets/index-5J96wndd.js"></script>
     <link rel="stylesheet" crossorigin href="./assets/index-F_aFpJd-.css">
   </head>
   <body>
src/App.jsx CHANGED
@@ -109,8 +109,17 @@ function App() {
       const hash = await feedbackStoreRef.current.hashAudioFile(file)
       setAudioHash(hash)

+      console.log('Converting file to audio buffer...')
       const audioBuffer = await clapProcessorRef.current.fileToAudioBuffer(file)
+      console.log('Audio buffer created:', {
+        duration: audioBuffer.duration,
+        sampleRate: audioBuffer.sampleRate,
+        channels: audioBuffer.numberOfChannels
+      })
+
+      console.log('Processing audio with CLAP...')
       const generatedTags = await clapProcessorRef.current.processAudio(audioBuffer)
+      console.log('Generated tags:', generatedTags)

       // Store basic audio info for later use
       const features = {
@@ -366,6 +375,9 @@ function App() {
         {isLoading && (
           <div className="loading">
             <p>🧠 Analyzing audio with CLAP model...</p>
+            <p style={{fontSize: '0.9em', opacity: 0.8}}>
+              {tags.length === 0 ? 'Loading model (~45MB)...' : 'Processing audio...'}
+            </p>
           </div>
         )}
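The fileToAudioBuffer helper that the new logging wraps is defined further down in clapProcessor.js but not shown in this diff; a minimal sketch of what such a decoder typically looks like with the standard Web Audio API (the method name comes from the diff, the body is an assumption):

// Hypothetical body for CLAPProcessor.fileToAudioBuffer (not part of this commit):
async fileToAudioBuffer(file) {
  const arrayBuffer = await file.arrayBuffer();    // raw bytes of the uploaded file
  const ctx = new AudioContext();                  // decodes at the context's sample rate
  try {
    return await ctx.decodeAudioData(arrayBuffer); // PCM AudioBuffer: duration, sampleRate, channels
  } finally {
    await ctx.close();                             // release the audio context
  }
}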
src/clapProcessor.js CHANGED
@@ -1,9 +1,8 @@
-import { pipeline, AutoProcessor, ClapAudioModelWithProjection } from '@xenova/transformers';
+import { pipeline } from '@xenova/transformers';

 class CLAPProcessor {
   constructor() {
-    this.model = null;
-    this.processor = null;
+    this.pipeline = null;
     this.defaultLabels = [
       'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
       'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',
@@ -19,13 +18,12 @@ class CLAPProcessor {
   }

   async initialize() {
-    if (this.model && this.processor) return;
+    if (this.pipeline) return;

     try {
-      // Load the CLAP model and processor
-      this.processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
-      this.model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
-
+      console.log('Loading CLAP model...');
+      // Use the pipeline API which is more stable
+      this.pipeline = await pipeline('zero-shot-audio-classification', 'Xenova/clap-htsat-unfused');
       console.log('CLAP model loaded successfully');
     } catch (error) {
       console.error('Failed to load CLAP model:', error);
@@ -34,27 +32,28 @@ class CLAPProcessor {
   }

   async processAudio(audioBuffer) {
-    if (!this.model || !this.processor) {
+    if (!this.pipeline) {
       await this.initialize();
     }

     try {
-      // Convert audio to the format expected by CLAP
+      // Convert audio to the format expected by the model
       const audio = await this.preprocessAudio(audioBuffer);

-      // Process audio through the model
-      const audioInputs = await this.processor(audio);
-      const audioFeatures = await this.model.get_audio_features(audioInputs);
-
-      // Process text labels
-      const textInputs = await this.processor.text(this.defaultLabels);
-      const textFeatures = await this.model.get_text_features(textInputs);
-
-      // Calculate similarities
-      const similarities = await this.calculateSimilarities(audioFeatures, textFeatures);
-
-      // Return top tags with confidence scores
-      return this.getTopTags(similarities, 5);
+      console.log('Processing audio with CLAP...');
+
+      // Use the pipeline for zero-shot classification
+      const results = await this.pipeline(audio, this.defaultLabels);
+
+      console.log('CLAP results:', results);
+
+      // Transform results to our format
+      const tags = results.slice(0, 5).map(result => ({
+        label: result.label,
+        confidence: result.score
+      }));
+
+      return tags;
     } catch (error) {
       console.error('Error processing audio:', error);
       throw error;
@@ -62,89 +61,25 @@ class CLAPProcessor {
   }

   async preprocessAudio(audioBuffer) {
-    // Convert to mono if stereo
+    // Convert to mono and get raw audio data
     let audioData;
     if (audioBuffer.numberOfChannels > 1) {
-      audioData = new Float32Array(audioBuffer.length);
-      for (let i = 0; i < audioBuffer.length; i++) {
-        let sum = 0;
-        for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
-          sum += audioBuffer.getChannelData(channel)[i];
-        }
-        audioData[i] = sum / audioBuffer.numberOfChannels;
+      // Convert stereo to mono by averaging channels
+      const channel1 = audioBuffer.getChannelData(0);
+      const channel2 = audioBuffer.getChannelData(1);
+      audioData = new Float32Array(channel1.length);
+      for (let i = 0; i < channel1.length; i++) {
+        audioData[i] = (channel1[i] + channel2[i]) / 2;
       }
     } else {
       audioData = audioBuffer.getChannelData(0);
     }

-    // Resample to 48kHz if needed (CLAP expects 48kHz)
-    const targetSampleRate = 48000;
-    if (audioBuffer.sampleRate !== targetSampleRate) {
-      audioData = await this.resampleAudio(audioData, audioBuffer.sampleRate, targetSampleRate);
-    }
-
-    return audioData;
-  }
-
-  async resampleAudio(audioData, originalRate, targetRate) {
-    // Simple linear interpolation resampling
-    const ratio = originalRate / targetRate;
-    const newLength = Math.round(audioData.length / ratio);
-    const resampled = new Float32Array(newLength);
-
-    for (let i = 0; i < newLength; i++) {
-      const originalIndex = i * ratio;
-      const indexFloor = Math.floor(originalIndex);
-      const indexCeil = Math.min(indexFloor + 1, audioData.length - 1);
-      const fraction = originalIndex - indexFloor;
-
-      resampled[i] = audioData[indexFloor] * (1 - fraction) + audioData[indexCeil] * fraction;
-    }
-
-    return resampled;
-  }
-
-  async calculateSimilarities(audioFeatures, textFeatures) {
-    // Calculate cosine similarity between audio and text features
-    const audioVector = audioFeatures.data;
-    const similarities = [];
-
-    for (let i = 0; i < this.defaultLabels.length; i++) {
-      const textVector = textFeatures.data.slice(
-        i * audioVector.length,
-        (i + 1) * audioVector.length
-      );
-
-      const similarity = this.cosineSimilarity(audioVector, textVector);
-      similarities.push(similarity);
-    }
-
-    return similarities;
-  }
-
-  cosineSimilarity(vecA, vecB) {
-    let dotProduct = 0;
-    let normA = 0;
-    let normB = 0;
-
-    for (let i = 0; i < vecA.length; i++) {
-      dotProduct += vecA[i] * vecB[i];
-      normA += vecA[i] * vecA[i];
-      normB += vecB[i] * vecB[i];
-    }
-
-    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
-  }
-
-  getTopTags(similarities, topK = 5) {
-    const tagged = this.defaultLabels.map((label, index) => ({
-      label,
-      confidence: Math.max(0, similarities[index]) // Ensure non-negative
-    }));
-
-    return tagged
-      .sort((a, b) => b.confidence - a.confidence)
-      .slice(0, topK);
-  }
+    // Return the audio data with sample rate info
+    return {
+      data: audioData,
+      sampling_rate: audioBuffer.sampleRate
+    };
   }

   // Convert file to AudioBuffer
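Net effect: resampling and feature extraction are now delegated to the pipeline. preprocessAudio only downmixes to mono and returns the samples tagged with their native sample rate; the old 48 kHz linear-interpolation resampler, the cosine-similarity scoring, and getTopTags are all deleted. Two caveats visible in the diff: the stereo branch averages only the first two channels, so inputs with more than two channels have the extra channels ignored, and correctness now rests on the pipeline accepting the { data, sampling_rate } shape and resampling internally. A usage sketch under those assumptions (method names from the diff, the surrounding app code is assumed):

const processor = new CLAPProcessor();
// Decode an uploaded File, then classify it against the default labels.
const audioBuffer = await processor.fileToAudioBuffer(file);
const tags = await processor.processAudio(audioBuffer);
// tags: up to five { label, confidence } entries, e.g. { label: 'music', confidence: 0.87 }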