Flux9665 committed on
Commit
687c689
·
1 Parent(s): 401875a

speed improvements and documentation

Browse files
Files changed (3) hide show
  1. Modules/ToucanTTS/InferenceToucanTTS.py +4 -4
  2. README.md +5 -5
  3. app.py +2 -2
Modules/ToucanTTS/InferenceToucanTTS.py CHANGED
@@ -222,7 +222,7 @@ class ToucanTTS(torch.nn.Module):
222
  reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
223
  pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
224
  mask=text_masks.float(),
225
- n_timesteps=20,
226
  temperature=prosody_creativity,
227
  c=utterance_embedding) if gold_pitch is None else gold_pitch
228
  # because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
@@ -235,7 +235,7 @@ class ToucanTTS(torch.nn.Module):
235
  reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
236
  energy_predictions = self.energy_predictor(mu=reduced_energy_space,
237
  mask=text_masks.float(),
238
- n_timesteps=20,
239
  temperature=prosody_creativity,
240
  c=utterance_embedding) if gold_energy is None else gold_energy
241
 
@@ -249,7 +249,7 @@ class ToucanTTS(torch.nn.Module):
249
  reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
250
  predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
251
  mask=text_masks.float(),
252
- n_timesteps=20,
253
  temperature=prosody_creativity,
254
  c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
255
 
@@ -277,7 +277,7 @@ class ToucanTTS(torch.nn.Module):
277
 
278
  refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
279
  mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
280
- n_timesteps=30,
281
  temperature=0.2, # low temperature, so the model follows the specified prosody curves better.
282
  c=None).transpose(1, 2)
283
 
 
222
  reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
223
  pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
224
  mask=text_masks.float(),
225
+ n_timesteps=10,
226
  temperature=prosody_creativity,
227
  c=utterance_embedding) if gold_pitch is None else gold_pitch
228
  # because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
 
235
  reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
236
  energy_predictions = self.energy_predictor(mu=reduced_energy_space,
237
  mask=text_masks.float(),
238
+ n_timesteps=10,
239
  temperature=prosody_creativity,
240
  c=utterance_embedding) if gold_energy is None else gold_energy
241
 
 
249
  reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
250
  predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
251
  mask=text_masks.float(),
252
+ n_timesteps=10,
253
  temperature=prosody_creativity,
254
  c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
255
 
 
277
 
278
  refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
279
  mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
280
+ n_timesteps=10,
281
  temperature=0.2, # low temperature, so the model follows the specified prosody curves better.
282
  c=None).transpose(1, 2)
283
 
README.md CHANGED
@@ -1,11 +1,11 @@
1
  ---
2
- title: MassivelyMultilingualTTS
3
- emoji: 🌍🦜
4
- colorFrom: purple
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.29.1
8
  app_file: app.py
9
- pinned: true
10
  license: mit
11
  ---
 
1
  ---
2
+ title: NeutralToucan
3
+ emoji: 🎨🦜
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.29.1
8
  app_file: app.py
9
+ pinned: false
10
  license: mit
11
  ---
app.py CHANGED
@@ -14,8 +14,8 @@ class TTSWebUI:
14
 
15
  def __init__(self,
16
  gpu_id="cpu",
17
- title="Controllable Text-to-Speech for over 7000 Languages",
18
- article="The biggest thank you to Hugging Face🤗 for sponsoring the GPU for this space! <br> To get the code, models, additional features, and more information, check out our toolkit: https://github.com/DigitalPhonetics/IMS-Toucan <br>",
19
  tts_model_path=None,
20
  vocoder_model_path=None,
21
  embedding_gan_path=None,
 
14
 
15
  def __init__(self,
16
  gpu_id="cpu",
17
+ title="Phoneme Synthesis with Neutral Accent and Many Speakers",
18
+ article="Put in a string of IPA characters and have it pronounced in a way that is averaged across many languages. Use ~ to get a pause and include any punctuation marks you would normally use. If you enable the checkbox, the model will take much longer, but the result will be spoken by 10 artificial voices at the same time. <br>",
19
  tts_model_path=None,
20
  vocoder_model_path=None,
21
  embedding_gan_path=None,