leenag committed on
Commit
66d0bf1
·
verified ·
1 Parent(s): d0eaaa1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from parler_tts import ParlerTTSForConditionalGeneration
4
+ from transformers import AutoTokenizer
5
+ import soundfile as sf
6
+ import uuid
7
+
8
# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the model and the prompt tokenizer.
model_name = "ai4bharat/indic-parler-tts"

# Load the weights first, then move the model onto the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# `tokenizer` encodes the text to be spoken. `desc_tokenizer` encodes the
# voice description and is loaded from the path recorded in the model's
# text-encoder config rather than from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
desc_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
13
+
14
def synthesize(language, text, gender, emotion, speed, pitch, quality):
    """Generate speech for *text* and return the path of the written WAV file.

    The style selections are folded into an English sentence describing the
    voice. That description conditions the model, while *text* is the prompt
    that is actually spoken.

    Args:
        language: Language name, inserted verbatim into the description.
        text: Text to synthesize; must contain non-whitespace characters.
        gender: Speaker gender; lower-cased into the description.
        emotion: Emotion label; lower-cased into the description.
        speed: Speaking-rate label; lower-cased into the description.
        pitch: Pitch label; lower-cased into the description.
        quality: Voice-quality label; lower-cased into the description.

    Returns:
        Name of a newly written ``<uuid4>.wav`` file, relative to the current
        working directory.

    Raises:
        ValueError: If *text* is blank or any style selection is missing.
    """
    # Fail early with a readable message instead of an AttributeError from
    # `None.lower()` (unselected widget) or a generation run on empty text.
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    selections = {
        "language": language,
        "gender": gender,
        "emotion": emotion,
        "speed": speed,
        "pitch": pitch,
        "quality": quality,
    }
    missing = [name for name, value in selections.items() if not value]
    if missing:
        raise ValueError(f"Please select a value for: {', '.join(missing)}.")

    desc = (
        f"A native {language} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
        f"speaking at a {speed.lower()} rate with {pitch.lower()} pitch and {quality.lower()} voice quality."
    )
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tokenizer(text, return_tensors="pt").to(device)

    gen_audio = model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        # Use the tokenizer's own mask rather than a hand-built all-ones
        # tensor; for a single un-padded prompt the two should be identical.
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # Move the waveform to host memory and drop singleton dimensions before
    # handing it to soundfile.
    audio_np = gen_audio.cpu().numpy().squeeze()
    # A random UUID keeps output files from different requests apart.
    filename = f"{uuid.uuid4()}.wav"
    sf.write(filename, audio_np, model.config.sampling_rate)
    return filename
33
+
34
# Gradio UI: one input widget per `synthesize` parameter, in signature order.
# Every selector has an explicit default (its first listed choice) so that a
# submission made without touching a widget never passes None to `synthesize`.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Dropdown(
            ["Malayalam", "English", "Hindi", "Tamil"],
            value="Malayalam",
            label="Language",
        ),
        gr.Textbox(label="Text to Synthesize", lines=4),
        gr.Radio(["Male", "Female"], value="Male", label="Speaker Gender"),
        gr.Dropdown(
            ["Neutral", "Happy", "Sad", "Angry"],
            value="Neutral",
            label="Emotion",
        ),
        gr.Dropdown(
            ["Slow", "Moderate", "Fast"],
            value="Slow",
            label="Speaking Rate",
        ),
        gr.Dropdown(["Low", "Normal", "High"], value="Low", label="Pitch"),
        gr.Dropdown(["Basic", "Refined"], value="Basic", label="Voice Quality"),
    ],
    # `synthesize` returns a file path, so the audio component reads from disk.
    outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
    allow_flagging="never",
    title="Multilingual TTS using Indic Parler-TTS",
    description="Type text, choose a speaker style, and get synthesized speech for Malayalam, Hindi, Tamil, or English.",
)

iface.launch()