Spaces: Sleeping
speed improvements and documentation
Browse files
- Modules/ToucanTTS/InferenceToucanTTS.py +4 -4
- README.md +5 -5
- app.py +2 -2
Modules/ToucanTTS/InferenceToucanTTS.py
CHANGED
@@ -222,7 +222,7 @@ class ToucanTTS(torch.nn.Module):
|
|
222 |
reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
|
223 |
pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
|
224 |
mask=text_masks.float(),
|
225 |
-
n_timesteps=
|
226 |
temperature=prosody_creativity,
|
227 |
c=utterance_embedding) if gold_pitch is None else gold_pitch
|
228 |
# because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
|
@@ -235,7 +235,7 @@ class ToucanTTS(torch.nn.Module):
|
|
235 |
reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
|
236 |
energy_predictions = self.energy_predictor(mu=reduced_energy_space,
|
237 |
mask=text_masks.float(),
|
238 |
-
n_timesteps=
|
239 |
temperature=prosody_creativity,
|
240 |
c=utterance_embedding) if gold_energy is None else gold_energy
|
241 |
|
@@ -249,7 +249,7 @@ class ToucanTTS(torch.nn.Module):
|
|
249 |
reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
|
250 |
predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
|
251 |
mask=text_masks.float(),
|
252 |
-
n_timesteps=
|
253 |
temperature=prosody_creativity,
|
254 |
c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
|
255 |
|
@@ -277,7 +277,7 @@ class ToucanTTS(torch.nn.Module):
|
|
277 |
|
278 |
refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
|
279 |
mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
|
280 |
-
n_timesteps=
|
281 |
temperature=0.2, # low temperature, so the model follows the specified prosody curves better.
|
282 |
c=None).transpose(1, 2)
|
283 |
|
|
|
222 |
reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
|
223 |
pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
|
224 |
mask=text_masks.float(),
|
225 |
+
n_timesteps=10,
|
226 |
temperature=prosody_creativity,
|
227 |
c=utterance_embedding) if gold_pitch is None else gold_pitch
|
228 |
# because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
|
|
|
235 |
reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
|
236 |
energy_predictions = self.energy_predictor(mu=reduced_energy_space,
|
237 |
mask=text_masks.float(),
|
238 |
+
n_timesteps=10,
|
239 |
temperature=prosody_creativity,
|
240 |
c=utterance_embedding) if gold_energy is None else gold_energy
|
241 |
|
|
|
249 |
reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
|
250 |
predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
|
251 |
mask=text_masks.float(),
|
252 |
+
n_timesteps=10,
|
253 |
temperature=prosody_creativity,
|
254 |
c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
|
255 |
|
|
|
277 |
|
278 |
refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
|
279 |
mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
|
280 |
+
n_timesteps=10,
|
281 |
temperature=0.2, # low temperature, so the model follows the specified prosody curves better.
|
282 |
c=None).transpose(1, 2)
|
283 |
|
README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.29.1
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
license: mit
|
11 |
---
|
|
|
1 |
---
|
2 |
+
title: NeutralToucan
|
3 |
+
emoji: 🎨🦜
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.29.1
|
8 |
app_file: app.py
|
9 |
+
pinned: false
|
10 |
license: mit
|
11 |
---
|
app.py
CHANGED
@@ -14,8 +14,8 @@ class TTSWebUI:
|
|
14 |
|
15 |
def __init__(self,
|
16 |
gpu_id="cpu",
|
17 |
-
title="
|
18 |
-
article="
|
19 |
tts_model_path=None,
|
20 |
vocoder_model_path=None,
|
21 |
embedding_gan_path=None,
|
|
|
14 |
|
15 |
def __init__(self,
|
16 |
gpu_id="cpu",
|
17 |
+
title="Phoneme Synthesis with Neutral Accent and Many Speakers",
|
18 |
+
article="Put in a string of IPA characters and have it pronounced in a way that is averaged across many languages. Use ~ to get a pause and include any punctuation marks you would normally use. If you enable the checkbox, the model will take much longer, but the result will be spoken by 10 artificial voices at the same time. <br>",
|
19 |
tts_model_path=None,
|
20 |
vocoder_model_path=None,
|
21 |
embedding_gan_path=None,
|