"""Swahili text-to-speech demo: a Gradio app around a fine-tuned MMS-TTS (VITS) model."""

from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import gradio as gr
import numpy as np

# Load model and tokenizer once at startup. device_map="auto" lets
# transformers place the model on GPU when one is available.
model = VitsModel.from_pretrained("Toadoum/swahili-mms-tts-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Toadoum/swahili-mms-tts-finetuned")


def text_to_speech(text):
    """Synthesize speech from Swahili text.

    Args:
        text: Swahili input text to synthesize.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is a 1-D
        float numpy array — the format expected by ``gr.Audio``.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt")

    # FIX: with device_map="auto" the model may live on GPU while the
    # tokenizer returns CPU tensors; move inputs to the model's device to
    # avoid a device-mismatch RuntimeError during the forward pass.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Generate waveform (inference only — no gradients needed)
    with torch.no_grad():
        output = model(**inputs).waveform

    # Convert to a numpy array on CPU for Gradio
    output_np = output.squeeze().cpu().numpy()

    # Sampling rate comes from the model's own config
    sampling_rate = model.config.sampling_rate

    # Return as tuple for the Gradio audio component
    return (sampling_rate, output_np)


# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(
        label="Enter Swahili Text",
        value="""Neurotech Africa ni kampuni kutoka Tanzania inaongoza mapinduzi ya kidigitali nchini na barani Afrika kwa suluhisho za Akili bandia (AI). Tunajenga AI ambayo inasaidia biashara kuboresha uzoefu wa wateja kupitia teknolojia za kisasa za mazungumzo.""",
    ),
    outputs=gr.Audio(label="Generated Speech"),
    title="Swahili Text-to-Speech",
    description="Convert Swahili text to speech using a fine-tuned MMS-TTS model",
    allow_flagging="never",
)

# Launch the app only when run as a script
if __name__ == "__main__":
    demo.launch()