SaoYear
committed on
Commit
·
2547be7
1
Parent(s):
785ef15
add demo page application
Browse files
app.py
CHANGED
@@ -23,13 +23,16 @@ import librosa as lb
|
|
23 |
import yaml
|
24 |
import numpy as np
|
25 |
import matplotlib.pyplot as plt
|
|
|
26 |
from model.cleanmel import CleanMel
|
27 |
from model.vocos.pretrained import Vocos
|
28 |
from model.stft import InputSTFT, TargetMel
|
29 |
|
30 |
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
31 |
def read_audio(file_path):
|
|
|
32 |
audio, sample_rate = sf.read(file_path)
|
|
|
33 |
if audio.ndim > 1:
|
34 |
# select the loudest channel if stereo
|
35 |
audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
|
@@ -176,29 +179,39 @@ def reset_everything():
|
|
176 |
demo = gr.Blocks()
|
177 |
with gr.Blocks(title="CleanMel Demo") as demo:
|
178 |
gr.Markdown("## CleanMel Demo")
|
179 |
-
gr.Markdown("This demo showcases the CleanMel model for speech enhancement. \
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
182 |
|
183 |
with gr.Row():
|
184 |
-
audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
|
185 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
|
|
192 |
clear_btn = gr.Button(
|
193 |
"🗑️ Clear All",
|
194 |
variant="secondary",
|
195 |
size="lg"
|
196 |
-
)
|
197 |
-
|
198 |
output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
|
199 |
output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
|
200 |
output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
|
201 |
-
|
202 |
enhance_button_map_L.click(
|
203 |
enhance_cleanmel_L_map,
|
204 |
inputs=audio_input,
|
@@ -212,14 +225,37 @@ with gr.Blocks(title="CleanMel Demo") as demo:
|
|
212 |
)
|
213 |
|
214 |
enhance_button_map_S.click(
|
215 |
-
enhance_cleanmel_S_map,
|
216 |
-
inputs=audio_input,
|
217 |
outputs=[output_audio, output_mel, output_np]
|
218 |
)
|
219 |
|
220 |
enhance_button_mask_S.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
enhance_cleanmel_S_mask,
|
222 |
-
inputs=
|
223 |
outputs=[output_audio, output_mel, output_np]
|
224 |
)
|
225 |
|
|
|
23 |
import yaml
|
24 |
import numpy as np
|
25 |
import matplotlib.pyplot as plt
|
26 |
+
from pydub import AudioSegment
|
27 |
from model.cleanmel import CleanMel
|
28 |
from model.vocos.pretrained import Vocos
|
29 |
from model.stft import InputSTFT, TargetMel
|
30 |
|
31 |
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
32 |
def read_audio(file_path):
|
33 |
+
assert file_path.endswith(('.wav', '.flac')), "Unsupported audio format. Please upload a .wav, .flac file."
|
34 |
audio, sample_rate = sf.read(file_path)
|
35 |
+
|
36 |
if audio.ndim > 1:
|
37 |
# select the loudest channel if stereo
|
38 |
audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
|
|
|
179 |
demo = gr.Blocks()
|
180 |
with gr.Blocks(title="CleanMel Demo") as demo:
|
181 |
gr.Markdown("## CleanMel Demo")
|
182 |
+
gr.Markdown("This demo showcases the CleanMel model for speech enhancement. <br> \
|
183 |
+
Only **.wav** and **.flac** files are supported. <br> \
|
184 |
+
--- <br> \
|
185 |
+
The model is running on CPU. Please be patient and wait for the result. <br> \
|
186 |
+
Inference time reference: <br> \
|
187 |
+
- CleanMel_L: **10 mins** for **10-second** audio <br> \
|
188 |
+
- CleanMel_S: **4 mins** for **10-second** audio <br> ")
|
189 |
|
190 |
with gr.Row():
|
|
|
191 |
with gr.Column():
|
192 |
+
audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
|
193 |
+
audio_input_record = gr.Audio(label="Input Audio (Record)", type="filepath", sources="microphone")
|
194 |
+
with gr.Row():
|
195 |
+
with gr.Column():
|
196 |
+
enhance_button_map_S = gr.Button("Enhance File (offline CleanMel_S_map)")
|
197 |
+
enhance_button_mask_S = gr.Button("Enhance File (offline CleanMel_S_mask)")
|
198 |
+
enhance_button_map_L = gr.Button("Enhance File (offline CleanMel_L_map)")
|
199 |
+
enhance_button_mask_L = gr.Button("Enhance File (offline CleanMel_L_mask)")
|
200 |
|
201 |
+
with gr.Column():
|
202 |
+
enhance_button_map_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_map)")
|
203 |
+
enhance_button_mask_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_mask)")
|
204 |
+
enhance_button_map_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_map)")
|
205 |
+
enhance_button_mask_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_mask)")
|
206 |
+
with gr.Row():
|
207 |
clear_btn = gr.Button(
|
208 |
"🗑️ Clear All",
|
209 |
variant="secondary",
|
210 |
size="lg"
|
211 |
+
)
|
|
|
212 |
output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
|
213 |
output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
|
214 |
output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
|
|
|
215 |
enhance_button_map_L.click(
|
216 |
enhance_cleanmel_L_map,
|
217 |
inputs=audio_input,
|
|
|
225 |
)
|
226 |
|
227 |
enhance_button_map_S.click(
|
228 |
+
enhance_cleanmel_S_map,
|
229 |
+
inputs=audio_input,
|
230 |
outputs=[output_audio, output_mel, output_np]
|
231 |
)
|
232 |
|
233 |
enhance_button_mask_S.click(
|
234 |
+
enhance_cleanmel_S_mask,
|
235 |
+
inputs=audio_input,
|
236 |
+
outputs=[output_audio, output_mel, output_np]
|
237 |
+
)
|
238 |
+
enhance_button_map_Lr.click(
|
239 |
+
enhance_cleanmel_L_map,
|
240 |
+
inputs=audio_input_record,
|
241 |
+
outputs=[output_audio, output_mel, output_np]
|
242 |
+
)
|
243 |
+
|
244 |
+
enhance_button_mask_Lr.click(
|
245 |
+
enhance_cleanmel_L_mask,
|
246 |
+
inputs=audio_input_record,
|
247 |
+
outputs=[output_audio, output_mel, output_np]
|
248 |
+
)
|
249 |
+
|
250 |
+
enhance_button_map_Sr.click(
|
251 |
+
enhance_cleanmel_S_map,
|
252 |
+
inputs=audio_input_record,
|
253 |
+
outputs=[output_audio, output_mel, output_np]
|
254 |
+
)
|
255 |
+
|
256 |
+
enhance_button_mask_Sr.click(
|
257 |
enhance_cleanmel_S_mask,
|
258 |
+
inputs=audio_input_record,
|
259 |
outputs=[output_audio, output_mel, output_np]
|
260 |
)
|
261 |
|