leenag committed on
Commit
5e6c5bb
·
verified ·
1 Parent(s): 89f1a27

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -25
app.py CHANGED
@@ -1,51 +1,100 @@
1
- import gradio as gr
2
  import torch
3
- from parler_tts import ParlerTTSForConditionalGeneration
4
- from transformers import AutoTokenizer
5
  import soundfile as sf
6
  import uuid
 
 
 
 
 
7
 
8
# Checkpoint served by this app.
model_name = "ai4bharat/indic-parler-tts"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# `tokenizer` encodes the text to be spoken; `desc_tokenizer` encodes the
# voice description and is loaded from the checkpoint named by the model's
# text-encoder config.
tokenizer = AutoTokenizer.from_pretrained(model_name)
desc_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def synthesize(language, text, gender, emotion, speed, pitch, quality):
15
- desc = (
16
- f"A native {language} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
17
  f"speaking at a {speed.lower()} rate with {pitch.lower()} pitch and {quality.lower()} voice quality."
18
  )
19
- desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
20
- text_inputs = tokenizer(text, return_tensors="pt").to(device)
21
-
22
- gen_audio = model.generate(
23
- input_ids=desc_inputs.input_ids,
24
- attention_mask=desc_inputs.attention_mask,
25
- prompt_input_ids=text_inputs.input_ids,
26
- prompt_attention_mask=torch.ones_like(text_inputs.input_ids).to(device)
27
- )
28
 
29
- audio_np = gen_audio.cpu().numpy().squeeze()
30
- filename = f"{uuid.uuid4()}.wav"
31
- sf.write(filename, audio_np, model.config.sampling_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  return filename
33
 
 
34
  iface = gr.Interface(
35
  fn=synthesize,
36
  inputs=[
37
- gr.Dropdown(["Malayalam", "English", "Hindi", "Tamil"], label="Language"),
38
- gr.Textbox(label="Text to Synthesize", lines=4),
39
  gr.Radio(["Male", "Female"], label="Speaker Gender"),
40
  gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
41
  gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
42
  gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
43
  gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
44
  ],
45
- outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
46
- allow_flagging="never",
47
- title="Multilingual TTS using Indic Parler-TTS",
48
- description="Type text, choose a speaker style, and get synthesized speech for Malayalam, Hindi, Tamil, or English."
49
  )
50
 
51
  iface.launch()
 
 
1
  import torch
 
 
2
  import soundfile as sf
3
  import uuid
4
+ import gradio as gr
5
+ import numpy as np
6
+ import re
7
+ from parler_tts import ParlerTTSForConditionalGeneration
8
+ from transformers import AutoTokenizer
9
 
10
# Load model and tokenizers
device = "cpu"
model_name = "ai4bharat/indic-parler-tts"

print("Loading model...")
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
model = model.eval()

# `tokenizer` encodes the text to be spoken; `desc_tokenizer` encodes the
# voice description and comes from the checkpoint named by the text-encoder
# config.
tokenizer = AutoTokenizer.from_pretrained(model_name)
desc_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)

print("Applying dynamic quantization...")
# Ask torch to dynamically quantize the model's torch.nn.Linear modules to
# qint8; the result is what the synthesis code calls for generation.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
25
+
26
# Sentence splitter: breaks text after '.', '!', '?' or a Devanagari danda,
# then wraps any sentence that is still too long on word boundaries.
def split_text(text, max_len=150):
    """Split ``text`` into chunks of at most ``max_len`` characters.

    The text is first cut after sentence-ending punctuation followed by
    whitespace (spaces or newlines). A sentence longer than ``max_len`` is
    then wrapped on word boundaries so that no emitted chunk exceeds the
    limit.

    Args:
        text: The text to split. ``None`` or blank text is accepted.
        max_len: Maximum number of characters per chunk.

    Returns:
        A list of non-empty, stripped chunks in their original order. The
        list is empty when ``text`` contains nothing but whitespace. A single
        word longer than ``max_len`` is kept whole as its own chunk rather
        than being cut mid-word.
    """
    if not text or not text.strip():
        return []

    # \u0964 / \u0965 are the danda and double danda that end sentences in
    # Hindi; written as escapes so the pattern is encoding-independent.
    sentences = re.split(r'(?<=[.!?\u0964\u0965])\s+', text.strip())

    chunks = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_len:
            chunks.append(sentence)
            continue

        # Wrap on spaces, flushing *before* a word would push the chunk past
        # max_len.
        buffer = []
        length = 0
        for word in sentence.split():
            # One extra character for the joining space, except for the
            # first word of a chunk.
            added = len(word) + (1 if buffer else 0)
            if buffer and length + added > max_len:
                chunks.append(' '.join(buffer))
                buffer = [word]
                length = len(word)
            else:
                buffer.append(word)
                length += added
        if buffer:
            chunks.append(' '.join(buffer))
    return chunks
51
+
52
# Main synthesis function
def synthesize(language, text, gender, emotion, speed, pitch, quality):
    """Generate speech for ``text`` and return the path of the WAV file written.

    The text is cut into chunks with ``split_text``, each chunk is generated
    separately with the quantized model, and the resulting waveforms are
    concatenated into a single file.

    Args:
        language: Language name (lower-cased into the voice description).
        text: The text to be spoken.
        gender: Speaker gender label (lower-cased into the description).
        emotion: Emotion label (lower-cased into the description).
        speed: Speaking-rate label (lower-cased into the description).
        pitch: Pitch label (lower-cased into the description).
        quality: Voice-quality label (lower-cased into the description).

    Returns:
        The relative filename (``<uuid hex>.wav``) of the audio written with
        ``soundfile``.

    Raises:
        gr.Error: If an option was left unselected or ``text`` is blank.
    """
    # An untouched selector arrives as None; fail with a readable message
    # instead of an AttributeError from ``None.lower()``.
    selections = {
        "Language": language,
        "Speaker Gender": gender,
        "Emotion": emotion,
        "Speaking Rate": speed,
        "Pitch": pitch,
        "Voice Quality": quality,
    }
    missing = [label for label, value in selections.items() if not value]
    if missing:
        raise gr.Error(f"Please choose a value for: {', '.join(missing)}.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    description = (
        f"A native {language.lower()} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
        f"speaking at a {speed.lower()} rate with {pitch.lower()} pitch and {quality.lower()} voice quality."
    )

    # The description is the same for every chunk, so tokenize it once.
    description_input = desc_tokenizer(description, return_tensors="pt").to(device)

    # Drop blank pieces (e.g. from whitespace after the final sentence) so the
    # model is never asked to speak an empty prompt.
    chunks = [chunk for chunk in split_text(text) if chunk.strip()]

    audio_pieces = []
    with torch.no_grad():
        for chunk in chunks:
            prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
            generation = quantized_model.generate(
                input_ids=description_input.input_ids,
                attention_mask=description_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                # Use the mask produced by the tokenizer rather than a
                # hand-built all-ones tensor; for a single unpadded prompt
                # they are equal.
                prompt_attention_mask=prompt_input.attention_mask,
            )
            audio_pieces.append(generation.cpu().numpy().squeeze())

    # Concatenate all audio chunks
    final_audio = np.concatenate(audio_pieces)

    filename = f"{uuid.uuid4().hex}.wav"
    sf.write(filename, final_audio, quantized_model.config.sampling_rate)
    return filename
82
 
83
# Gradio Interface
# Every selector gets an explicit default so the handler never receives None
# for an option the user did not touch.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Dropdown(["Malayalam", "Hindi", "Tamil", "English"], value="Malayalam", label="Language"),
        gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Enter your sentence here..."),
        gr.Radio(["Male", "Female"], value="Male", label="Speaker Gender"),
        gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], value="Neutral", label="Emotion"),
        gr.Dropdown(["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"),
        gr.Dropdown(["Low", "Normal", "High"], value="Normal", label="Pitch"),
        gr.Dropdown(["Basic", "Refined"], value="Basic", label="Voice Quality"),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Multilingual Indic TTS (Quantized + Chunked)",
    description="Fast CPU-based TTS with quantized Parler-TTS and text chunking for Malayalam, Hindi, Tamil, and English.",
)

iface.launch()