NeutralToucan

Sleeping

App Files Files

Flux9665 committed on May 17

Commit

687c689

1 Parent(s): 401875a

speed improvements and documentation

Browse files

Files changed (3) hide show

Modules/ToucanTTS/InferenceToucanTTS.py +4 -4
README.md +5 -5
app.py +2 -2

Modules/ToucanTTS/InferenceToucanTTS.py CHANGED Viewed

@@ -222,7 +222,7 @@ class ToucanTTS(torch.nn.Module):
         reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
         pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
                                                  mask=text_masks.float(),
-                                                 n_timesteps=20,
                                                  temperature=prosody_creativity,
                                                  c=utterance_embedding) if gold_pitch is None else gold_pitch
         # because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
@@ -235,7 +235,7 @@ class ToucanTTS(torch.nn.Module):
         reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
         energy_predictions = self.energy_predictor(mu=reduced_energy_space,
                                                    mask=text_masks.float(),
-                                                   n_timesteps=20,
                                                    temperature=prosody_creativity,
                                                    c=utterance_embedding) if gold_energy is None else gold_energy
@@ -249,7 +249,7 @@ class ToucanTTS(torch.nn.Module):
         reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
         predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
                                                                              mask=text_masks.float(),
-                                                                             n_timesteps=20,
                                                                              temperature=prosody_creativity,
                                                                              c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
@@ -277,7 +277,7 @@ class ToucanTTS(torch.nn.Module):
         refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
                                                           mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
-                                                          n_timesteps=30,
                                                           temperature=0.2,  # low temperature, so the model follows the specified prosody curves better.
                                                           c=None).transpose(1, 2)

         reduced_pitch_space = self.pitch_latent_reduction(encoded_texts).transpose(1, 2)
         pitch_predictions = self.pitch_predictor(mu=reduced_pitch_space,
                                                  mask=text_masks.float(),
+                                                 n_timesteps=10,
                                                  temperature=prosody_creativity,
                                                  c=utterance_embedding) if gold_pitch is None else gold_pitch
         # because of the way we are processing the data, the last few elements of a sequence will always receive an unnaturally low pitch value. To fix this, we just overwrite them here.
         reduced_energy_space = self.energy_latent_reduction(encoded_texts + embedded_pitch_curve).transpose(1, 2)
         energy_predictions = self.energy_predictor(mu=reduced_energy_space,
                                                    mask=text_masks.float(),
+                                                   n_timesteps=10,
                                                    temperature=prosody_creativity,
                                                    c=utterance_embedding) if gold_energy is None else gold_energy
         reduced_duration_space = self.duration_latent_reduction(encoded_texts + embedded_pitch_curve + embedded_energy_curve).transpose(1, 2)
         predicted_durations = torch.clamp(torch.ceil(self.duration_predictor(mu=reduced_duration_space,
                                                                              mask=text_masks.float(),
+                                                                             n_timesteps=10,
                                                                              temperature=prosody_creativity,
                                                                              c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
         refined_codec_frames = self.flow_matching_decoder(mu=preliminary_spectrogram.transpose(1, 2),
                                                           mask=make_non_pad_mask([len(decoded_speech[0])], device=decoded_speech.device).unsqueeze(-2),
+                                                          n_timesteps=10,
                                                           temperature=0.2,  # low temperature, so the model follows the specified prosody curves better.
                                                           c=None).transpose(1, 2)

README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 ---
-title: MassivelyMultilingualTTS
-emoji: 🌍🦜
-colorFrom: purple
-colorTo: pink
 sdk: gradio
 sdk_version: 5.29.1
 app_file: app.py
-pinned: true
 license: mit
 ---

 ---
+title: NeutralToucan
+emoji: 🎨🦜
+colorFrom: green
+colorTo: blue
 sdk: gradio
 sdk_version: 5.29.1
 app_file: app.py
+pinned: false
 license: mit
 ---

app.py CHANGED Viewed

@@ -14,8 +14,8 @@ class TTSWebUI:
     def __init__(self,
                  gpu_id="cpu",
-                 title="Controllable Text-to-Speech for over 7000 Languages",
-                 article="The biggest thank you to Hugging Face🤗 for sponsoring the GPU for this space! <br> To get the code, models, additional features, and more information, check out our toolkit: https://github.com/DigitalPhonetics/IMS-Toucan <br>",
                  tts_model_path=None,
                  vocoder_model_path=None,
                  embedding_gan_path=None,

     def __init__(self,
                  gpu_id="cpu",
+                 title="Phoneme Synthesis with Neutral Accent and Many Speakers",
+                 article="Put in a string of IPA characters and have it pronounced in a way that is averaged across many languages. Use ~ to get a pause and include any punctuation marks you would normally use. If you enable the checkbox, the model will take much longer, but the result will be spoken by 10 artificial voices at the same time. <br>",
                  tts_model_path=None,
                  vocoder_model_path=None,
                  embedding_gan_path=None,