SaoYear committed on
Commit
2547be7
·
1 Parent(s): 785ef15

add demo page application

Browse files
Files changed (1) hide show
  1. app.py +51 -15
app.py CHANGED
@@ -23,13 +23,16 @@ import librosa as lb
23
  import yaml
24
  import numpy as np
25
  import matplotlib.pyplot as plt
 
26
  from model.cleanmel import CleanMel
27
  from model.vocos.pretrained import Vocos
28
  from model.stft import InputSTFT, TargetMel
29
 
30
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
31
  def read_audio(file_path):
 
32
  audio, sample_rate = sf.read(file_path)
 
33
  if audio.ndim > 1:
34
  # select the loudest channel if stereo
35
  audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
@@ -176,29 +179,39 @@ def reset_everything():
176
  demo = gr.Blocks()
177
  with gr.Blocks(title="CleanMel Demo") as demo:
178
  gr.Markdown("## CleanMel Demo")
179
- gr.Markdown("This demo showcases the CleanMel model for speech enhancement. \n \
180
- Since the model is running on CPU, it may take a while to process the audio. \n \
181
- Please be patient and wait for the result. \n")
 
 
 
 
182
 
183
  with gr.Row():
184
- audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
185
  with gr.Column():
 
 
 
 
 
 
 
 
186
 
187
- enhance_button_map_S = gr.Button("Enhance Audio (offline CleanMel_S_map), 4 mins for 10-second audio")
188
- enhance_button_mask_S = gr.Button("Enhance Audio (offline CleanMel_S_mask), 4 mins for 10-second audio")
189
-
190
- enhance_button_map_L = gr.Button("Enhance Audio (offline CleanMel_L_map), 10 mins for 10-second audio")
191
- enhance_button_mask_L = gr.Button("Enhance Audio (offline CleanMel_L_mask), 10 mins for 10-second audio")
 
192
  clear_btn = gr.Button(
193
  "🗑️ Clear All",
194
  variant="secondary",
195
  size="lg"
196
- )
197
-
198
  output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
199
  output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
200
  output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
201
-
202
  enhance_button_map_L.click(
203
  enhance_cleanmel_L_map,
204
  inputs=audio_input,
@@ -212,14 +225,37 @@ with gr.Blocks(title="CleanMel Demo") as demo:
212
  )
213
 
214
  enhance_button_map_S.click(
215
- enhance_cleanmel_S_map,
216
- inputs=audio_input,
217
  outputs=[output_audio, output_mel, output_np]
218
  )
219
 
220
  enhance_button_mask_S.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  enhance_cleanmel_S_mask,
222
- inputs=audio_input,
223
  outputs=[output_audio, output_mel, output_np]
224
  )
225
 
 
23
  import yaml
24
  import numpy as np
25
  import matplotlib.pyplot as plt
26
+ from pydub import AudioSegment
27
  from model.cleanmel import CleanMel
28
  from model.vocos.pretrained import Vocos
29
  from model.stft import InputSTFT, TargetMel
30
 
31
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
32
  def read_audio(file_path):
33
+ assert file_path.endswith(('.wav', '.flac')), "Unsupported audio format. Please upload a .wav, .flac file."
34
  audio, sample_rate = sf.read(file_path)
35
+
36
  if audio.ndim > 1:
37
  # select the loudest channel if stereo
38
  audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
 
179
  demo = gr.Blocks()
180
  with gr.Blocks(title="CleanMel Demo") as demo:
181
  gr.Markdown("## CleanMel Demo")
182
+ gr.Markdown("This demo showcases the CleanMel model for speech enhancement. <br> \
183
+ Only **.wav** and **.flac** files are supported. <br> \
184
+ --- <br> \
185
+ The model is running on CPU. Please be patient and wait for the result. <br> \
186
+ Inference time reference: <br> \
187
+ - CleanMel_L: **10 mins** for **10-second** audio <br> \
188
+ - CleanMel_S: **4 mins** for **10-second** audio <br> ")
189
 
190
  with gr.Row():
 
191
  with gr.Column():
192
+ audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
193
+ audio_input_record = gr.Audio(label="Input Audio (Record)", type="filepath", sources="microphone")
194
+ with gr.Row():
195
+ with gr.Column():
196
+ enhance_button_map_S = gr.Button("Enhance File (offline CleanMel_S_map)")
197
+ enhance_button_mask_S = gr.Button("Enhance File (offline CleanMel_S_mask)")
198
+ enhance_button_map_L = gr.Button("Enhance File (offline CleanMel_L_map)")
199
+ enhance_button_mask_L = gr.Button("Enhance File (offline CleanMel_L_mask)")
200
 
201
+ with gr.Column():
202
+ enhance_button_map_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_map)")
203
+ enhance_button_mask_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_mask)")
204
+ enhance_button_map_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_map)")
205
+ enhance_button_mask_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_mask)")
206
+ with gr.Row():
207
  clear_btn = gr.Button(
208
  "🗑️ Clear All",
209
  variant="secondary",
210
  size="lg"
211
+ )
 
212
  output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
213
  output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
214
  output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
 
215
  enhance_button_map_L.click(
216
  enhance_cleanmel_L_map,
217
  inputs=audio_input,
 
225
  )
226
 
227
  enhance_button_map_S.click(
228
+ enhance_cleanmel_S_map,
229
+ inputs=audio_input,
230
  outputs=[output_audio, output_mel, output_np]
231
  )
232
 
233
  enhance_button_mask_S.click(
234
+ enhance_cleanmel_S_mask,
235
+ inputs=audio_input,
236
+ outputs=[output_audio, output_mel, output_np]
237
+ )
238
+ enhance_button_map_Lr.click(
239
+ enhance_cleanmel_L_map,
240
+ inputs=audio_input_record,
241
+ outputs=[output_audio, output_mel, output_np]
242
+ )
243
+
244
+ enhance_button_mask_Lr.click(
245
+ enhance_cleanmel_L_mask,
246
+ inputs=audio_input_record,
247
+ outputs=[output_audio, output_mel, output_np]
248
+ )
249
+
250
+ enhance_button_map_Sr.click(
251
+ enhance_cleanmel_S_map,
252
+ inputs=audio_input_record,
253
+ outputs=[output_audio, output_mel, output_np]
254
+ )
255
+
256
+ enhance_button_mask_Sr.click(
257
  enhance_cleanmel_S_mask,
258
+ inputs=audio_input_record,
259
  outputs=[output_audio, output_mel, output_np]
260
  )
261