"""🎤 Long-Form Text-to-Speech Generator

Convert text of any length into natural, human-like speech using free AI models.
"""
import gradio as gr import torch import numpy as np import re from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan from datasets import load_dataset import soundfile as sf import io import tempfile import os from pydub import AudioSegment from pydub.silence import split_on_silence import nltk from nltk.tokenize import sent_tokenize import warnings warnings.filterwarnings("ignore") # Download required NLTK data try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt') class LongFormTTS: def __init__(self): print("Loading TTS models...") # Load SpeechT5 models (free and high quality) self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") # Load speaker embeddings dataset embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) print("Models loaded successfully!") def preprocess_text(self, text): """Clean and prepare text for TTS""" # Remove extra whitespace text = re.sub(r'\s+', ' ', text.strip()) # Handle common abbreviations abbreviations = { 'Dr.': 'Doctor', 'Mr.': 'Mister', 'Mrs.': 'Missus', 'Ms.': 'Miss', 'Prof.': 'Professor', 'etc.': 'etcetera', 'vs.': 'versus', 'e.g.': 'for example', 'i.e.': 'that is', } for abbr, full in abbreviations.items(): text = text.replace(abbr, full) # Handle numbers (basic) text = re.sub(r'\b(\d+)\b', lambda m: self.number_to_words(int(m.group())), text) return text def number_to_words(self, num): """Convert numbers to words (basic implementation)""" if num == 0: return "zero" ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"] tens = ["", 
"", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] if num < 10: return ones[num] elif num < 20: return teens[num - 10] elif num < 100: return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10]) elif num < 1000: return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100)) else: return str(num) # Fallback for larger numbers def chunk_text(self, text, max_length=500): """Split text into manageable chunks while preserving sentence boundaries""" sentences = sent_tokenize(text) chunks = [] current_chunk = "" for sentence in sentences: # If single sentence is too long, split by clauses if len(sentence) > max_length: clauses = re.split(r'[,;:]', sentence) for clause in clauses: clause = clause.strip() if len(current_chunk + clause) > max_length: if current_chunk: chunks.append(current_chunk.strip()) current_chunk = clause else: # Even single clause is too long, force split words = clause.split() temp_chunk = "" for word in words: if len(temp_chunk + word) > max_length: if temp_chunk: chunks.append(temp_chunk.strip()) temp_chunk = word else: chunks.append(word) # Single word longer than limit else: temp_chunk += " " + word if temp_chunk else word if temp_chunk: current_chunk = temp_chunk else: current_chunk += " " + clause if current_chunk else clause else: if len(current_chunk + sentence) > max_length: if current_chunk: chunks.append(current_chunk.strip()) current_chunk = sentence else: chunks.append(sentence) else: current_chunk += " " + sentence if current_chunk else sentence if current_chunk: chunks.append(current_chunk.strip()) return [chunk for chunk in chunks if chunk.strip()] def generate_speech_chunk(self, text_chunk): """Generate speech for a single text chunk""" try: inputs = self.processor(text=text_chunk, return_tensors="pt") speech = self.model.generate_speech( inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder ) return speech.numpy() except Exception as e: 
print(f"Error generating speech for chunk: {e}") return np.array([]) def generate_long_speech(self, text, progress_callback=None): """Generate speech for long text by processing in chunks""" # Preprocess text text = self.preprocess_text(text) # Split into chunks chunks = self.chunk_text(text) print(f"Split text into {len(chunks)} chunks") if not chunks: return np.array([]), 16000 # Generate speech for each chunk audio_segments = [] total_chunks = len(chunks) for i, chunk in enumerate(chunks): if progress_callback: progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:50]}...") speech_chunk = self.generate_speech_chunk(chunk) if len(speech_chunk) > 0: audio_segments.append(speech_chunk) # Add small pause between chunks (200ms of silence) pause_duration = int(0.2 * 16000) # 200ms at 16kHz silence = np.zeros(pause_duration) audio_segments.append(silence) if not audio_segments: return np.array([]), 16000 # Concatenate all audio segments final_audio = np.concatenate(audio_segments) return final_audio, 16000 # Initialize TTS system tts_system = LongFormTTS() def text_to_speech_interface(text, progress=gr.Progress()): """Main interface function for Gradio""" if not text.strip(): return None, "Please enter some text to convert to speech." def progress_callback(message): progress(0.5, desc=message) try: progress(0.1, desc="Starting text-to-speech conversion...") audio, sample_rate = tts_system.generate_long_speech(text, progress_callback) if len(audio) == 0: return None, "Failed to generate audio. Please try again." progress(0.9, desc="Finalizing audio...") # Save to temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: sf.write(tmp_file.name, audio, sample_rate) audio_path = tmp_file.name progress(1.0, desc="Complete!") return audio_path, f"✅ Successfully generated {len(audio)/sample_rate:.1f} seconds of audio!" 
except Exception as e: error_msg = f"❌ Error: {str(e)}" print(error_msg) return None, error_msg # Create Gradio interface def create_interface(): with gr.Blocks( title="🎤 Long-Form Text-to-Speech Generator", theme=gr.themes.Soft(), css=""" .main-header { text-align: center; margin-bottom: 2rem; } .feature-box { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; margin: 1rem 0; } """ ) as demo: gr.HTML("""
Convert any length of text to natural human-like speech using free AI models
💡 Tip: The system works best with well-formatted text with proper punctuation!