import torch
import gradio as gr
from transformers import VitsModel, VitsTokenizer

# Checkpoint identifier for the Acehnese text-to-speech model.
model_id = "facebook/mms-tts-ace"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the VITS model, then place the model on the chosen
# device (tts_aceh sends its tokenized inputs to the same device).
tokenizer = VitsTokenizer.from_pretrained(model_id)
model = VitsModel.from_pretrained(model_id).to(device)

# TTS function
def tts_aceh(text: str):
    """Synthesize speech for the given Acehnese text.

    Args:
        text: The text to be spoken.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``sample_rate`` is the
        model's configured sampling rate and ``waveform`` is the first
        waveform of the model output converted to a NumPy array. This is the
        pair handed to the ``gr.Audio(type="numpy")`` output component.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace, so the UI
            reports the problem instead of running the model on nothing.
    """
    # Guard against blank submissions: there is nothing to synthesize, and
    # feeding an empty string to the model yields no useful audio.
    if not text or not text.strip():
        raise gr.Error("Please enter some Acehnese text to synthesize.")

    # Tokenize to PyTorch tensors and move them to the same device as the model.
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)

    # Take the first item of the batch and bring it back to the CPU as NumPy.
    waveform = output.waveform[0].cpu().numpy()
    sample_rate = model.config.sampling_rate
    return (sample_rate, waveform)

# Gradio UI
# Text displayed with the interface: what the tool is, how to use it, and the
# reuse notice.
_DESCRIPTION = (
    "This is a text-to-speech tool for the Acehnese language using Meta's MMS model. "
    "To use: 1) Enter text in Acehnese, 2) Click Submit to hear it spoken aloud.\n\n"
    "Note: Reuse, redistribution, or derivative use is not allowed unless you ask for permission. "
    "Enjoy responsibly, and feel free to share feedback or support!"
)

# One text box in, one NumPy-typed audio component out, both wired to tts_aceh.
demo = gr.Interface(
    title="Acehnese TTS (Text-to-Speech)",
    description=_DESCRIPTION,
    fn=tts_aceh,
    inputs=gr.Textbox(label="Enter Acehnese text"),
    outputs=gr.Audio(type="numpy", label="Generated Speech"),
)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not call demo.launch().
if __name__ == "__main__":
    demo.launch()