import gradio as gr
import torch
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import io
import tempfile
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import nltk
from nltk.tokenize import sent_tokenize
import warnings
warnings.filterwarnings("ignore")

# Download required NLTK data.
# sent_tokenize relies on the Punkt sentence model. Older NLTK releases look
# it up as 'punkt', newer ones as 'punkt_tab', so make sure both are present;
# otherwise sent_tokenize raises LookupError on the first request.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

class LongFormTTS:
    """Text-to-speech for long text, built on Microsoft's SpeechT5 models.

    The input text is normalised (abbreviations and integers are spelled
    out), split into chunks of bounded length, synthesised chunk by chunk and
    finally concatenated into one waveform with a short pause between chunks.
    """

    # Sample rate (Hz) reported for generated audio and used to size pauses.
    SAMPLE_RATE = 16000
    # Length of the silence inserted between consecutive chunks, in seconds.
    PAUSE_SECONDS = 0.2

    # Abbreviations expanded by preprocess_text (matching is case-sensitive).
    _ABBREVIATIONS = {
        'Dr.': 'Doctor',
        'Mr.': 'Mister',
        'Mrs.': 'Missus',
        'Ms.': 'Miss',
        'Prof.': 'Professor',
        'etc.': 'etcetera',
        'vs.': 'versus',
        'e.g.': 'for example',
        'i.e.': 'that is',
    }

    # Plain integers, or integers written with thousands separators ("1,250").
    _NUMBER_RE = re.compile(r'\b\d{1,3}(?:,\d{3})+\b|\b\d+\b')
    # Digit runs longer than this are read out digit by digit instead of
    # being converted to a (very long) cardinal number.
    _MAX_SPELLED_DIGITS = 15

    _ONES = ("", "one", "two", "three", "four", "five", "six", "seven",
             "eight", "nine")
    _TEENS = ("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
              "sixteen", "seventeen", "eighteen", "nineteen")
    _TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
             "eighty", "ninety")
    # Largest scale first so the first match is the leading group.
    _SCALES = (
        (10 ** 12, "trillion"),
        (10 ** 9, "billion"),
        (10 ** 6, "million"),
        (10 ** 3, "thousand"),
    )

    def __init__(self):
        """Load the SpeechT5 processor, acoustic model, vocoder and speaker embedding."""
        print("Loading TTS models...")

        # Load SpeechT5 models (free and high quality)
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        # Load speaker embeddings dataset and keep one fixed x-vector
        # (entry 7306 of the validation split); unsqueeze adds a leading
        # batch dimension.
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        print("Models loaded successfully!")

    def preprocess_text(self, text):
        """Clean and prepare text for TTS.

        Collapses whitespace, expands the known abbreviations and spells out
        integers (including comma-grouped ones such as "1,250").

        Args:
            text: Raw input string.

        Returns:
            The normalised string.
        """
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Expand abbreviations only where they start a token, so that e.g.
        # "revs." is not rewritten through the "vs." entry.
        for abbr, full in self._ABBREVIATIONS.items():
            text = re.sub(r'(?<!\w)' + re.escape(abbr), full, text)

        # Spell out integers
        return self._NUMBER_RE.sub(self._spell_number_match, text)

    def _spell_number_match(self, match):
        """Return the spoken form of one number matched by _NUMBER_RE."""
        digits = match.group().replace(",", "")
        if len(digits) > self._MAX_SPELLED_DIGITS:
            # Too long to read as a cardinal number: read the digits one by one.
            return " ".join(self._ONES[int(d)] or "zero" for d in digits)
        return self.number_to_words(int(digits))

    def number_to_words(self, num):
        """Convert an integer to English words.

        Args:
            num: Integer to convert; negative values are prefixed with "minus".

        Returns:
            The number in words, e.g. 1234 -> "one thousand two hundred
            thirty four".
        """
        if num < 0:
            return "minus " + self.number_to_words(-num)
        if num == 0:
            return "zero"
        if num < 10:
            return self._ONES[num]
        if num < 20:
            return self._TEENS[num - 10]
        if num < 100:
            tens, ones = divmod(num, 10)
            return self._TENS[tens] + (" " + self._ONES[ones] if ones else "")
        if num < 1000:
            hundreds, rest = divmod(num, 100)
            words = self._ONES[hundreds] + " hundred"
            return words + (" " + self.number_to_words(rest) if rest else "")

        # num >= 1000: peel off the largest scale group and recurse on both
        # the group count and the remainder.
        value, name = next((v, n) for v, n in self._SCALES if num >= v)
        head, rest = divmod(num, value)
        words = self.number_to_words(head) + " " + name
        return words + (" " + self.number_to_words(rest) if rest else "")

    def chunk_text(self, text, max_length=500):
        """Split text into chunks of at most max_length characters.

        Sentences are kept whole where possible. A sentence longer than the
        limit is broken at clause punctuation (which is kept), then at word
        boundaries, and as a last resort inside a single over-long word.
        Consecutive pieces are packed greedily, joined by single spaces.

        Args:
            text: Text to split.
            max_length: Maximum chunk length in characters (values below 1
                are treated as 1).

        Returns:
            List of non-empty chunk strings, in reading order.
        """
        max_length = max(1, max_length)
        chunks = []
        current = ""

        for sentence in self._split_sentences(text):
            for piece in self._split_oversized(sentence, max_length):
                if not current:
                    current = piece
                elif len(current) + 1 + len(piece) <= max_length:
                    # +1 accounts for the joining space.
                    current = current + " " + piece
                else:
                    chunks.append(current)
                    current = piece

        if current:
            chunks.append(current)

        return chunks

    def _split_sentences(self, text):
        """Split text into sentences, falling back to a regex without NLTK data."""
        try:
            return sent_tokenize(text)
        except LookupError:
            # The Punkt data is not installed; a simple split on sentence-final
            # punctuation is better than failing the whole request.
            print("NLTK sentence tokenizer data not found; using regex sentence splitting")
            return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    def _split_oversized(self, text, max_length):
        """Yield pieces of text that are each at most max_length characters long."""
        text = text.strip()
        if not text:
            return
        if len(text) <= max_length:
            yield text
            return

        # Prefer clause boundaries; each clause keeps its trailing punctuation.
        parts = re.findall(r'[^,;:]+[,;:]*', text)
        if len(parts) < 2:
            parts = text.split()
        if len(parts) < 2:
            # A single unbroken token longer than the limit: hard split.
            parts = [text[i:i + max_length] for i in range(0, len(text), max_length)]

        for part in parts:
            yield from self._split_oversized(part, max_length)

    def generate_speech_chunk(self, text_chunk):
        """Generate speech for a single text chunk.

        Args:
            text_chunk: Text to synthesise.

        Returns:
            1-D numpy array with the waveform, or an empty array if synthesis
            failed (the error is printed, not raised).
        """
        try:
            inputs = self.processor(text=text_chunk, return_tensors="pt")
            speech = self.model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )
            return speech.cpu().numpy()
        except Exception as e:
            # Best effort: one bad chunk must not abort the whole document.
            print(f"Error generating speech for chunk: {e}")
            return np.array([])

    def generate_long_speech(self, text, progress_callback=None):
        """Generate speech for long text by processing it in chunks.

        Args:
            text: Text to synthesise.
            progress_callback: Optional callable receiving a status message
                before each chunk is processed.

        Returns:
            Tuple (audio, sample_rate). audio is a float32 numpy array; it is
            empty when there is nothing to say or every chunk failed.
        """
        text = self.preprocess_text(text)

        chunks = self.chunk_text(text)
        print(f"Split text into {len(chunks)} chunks")

        empty = np.array([], dtype=np.float32)
        if not chunks:
            return empty, self.SAMPLE_RATE

        # Same dtype as the speech so concatenation does not upcast to float64.
        silence = np.zeros(int(self.PAUSE_SECONDS * self.SAMPLE_RATE), dtype=np.float32)

        audio_segments = []
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            if progress_callback:
                progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:50]}...")

            speech_chunk = self.generate_speech_chunk(chunk)
            if len(speech_chunk) == 0:
                # Failed chunk: skip it without adding a pause, so a run of
                # failures cannot produce a "successful" file of pure silence.
                continue

            if audio_segments:
                audio_segments.append(silence)
            audio_segments.append(np.asarray(speech_chunk, dtype=np.float32))

        if not audio_segments:
            return empty, self.SAMPLE_RATE

        return np.concatenate(audio_segments), self.SAMPLE_RATE

# Initialize TTS system
# Built once as a module-level object so the models loaded by the constructor
# are shared by every call to text_to_speech_interface.
tts_system = LongFormTTS()

def text_to_speech_interface(text, progress=gr.Progress()):
    """Main interface function for Gradio.

    Converts the given text to speech with the module-level tts_system and
    writes the result to a temporary WAV file.

    Args:
        text: Text entered by the user; None or blank input is rejected.
        progress: Gradio progress tracker used to report status.

    Returns:
        Tuple (audio_path, status_message). audio_path is None when nothing
        was generated or an error occurred; errors are reported through the
        status message instead of being raised.
    """
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."

    def progress_callback(message):
        # generate_long_speech announces "Processing chunk i/total: ..." just
        # before working on chunk i; map that onto the 0.1-0.9 span of the
        # bar so it actually advances. Unknown messages keep the midpoint.
        fraction = 0.5
        match = re.match(r'Processing chunk (\d+)/(\d+)', message)
        if match:
            index, total = int(match.group(1)), int(match.group(2))
            if total > 0:
                fraction = 0.1 + 0.8 * (index - 1) / total
        progress(fraction, desc=message)

    try:
        progress(0.1, desc="Starting text-to-speech conversion...")

        audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)

        if len(audio) == 0:
            return None, "Failed to generate audio. Please try again."

        progress(0.9, desc="Finalizing audio...")

        # Reserve a temporary file name, then close the handle before writing:
        # reopening a still-open NamedTemporaryFile by name fails on Windows.
        # The file is kept (delete=False) because the returned path is handed
        # to the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            audio_path = tmp_file.name
        try:
            sf.write(audio_path, audio, sample_rate)
        except Exception:
            # Do not leave an empty or partial file behind.
            os.remove(audio_path)
            raise

        progress(1.0, desc="Complete!")

        return audio_path, f"✅ Successfully generated {len(audio)/sample_rate:.1f} seconds of audio!"

    except Exception as e:
        # Top-level boundary for the UI: report the failure in the status box.
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the text-to-speech app.

    Creates a text input with a generate button, a static feature list, a
    read-only status box and an audio output, and connects the button to
    text_to_speech_interface.

    Returns:
        The constructed gr.Blocks object; launching it is left to the caller.
    """
    with gr.Blocks(
        title="🎤 Long-Form Text-to-Speech Generator",
        theme=gr.themes.Soft(),
        css="""
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        .feature-box {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 1rem;
            border-radius: 10px;
            margin: 1rem 0;
        }
        """
    ) as demo:

        # Page header; uses the .main-header class from the css argument above
        gr.HTML("""
        <div class="main-header">
            <h1>🎤 Long-Form Text-to-Speech Generator</h1>
            <p>Convert any length of text to natural human-like speech using free AI models</p>
        </div>
        """)

        with gr.Row():
            # First column (scale=2): text input and the generate button
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="📝 Enter your text",
                    placeholder="Type or paste any text here... No length limit!",
                    lines=10,
                    max_lines=20
                )

                generate_btn = gr.Button(
                    "🎯 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            # Second column (scale=1): static feature list
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-box">
                    <h3>✨ Features</h3>
                    <ul>
                        <li>🚀 Unlimited text length</li>
                        <li>🤖 Human-like voice quality</li>
                        <li>⚡ Smart text chunking</li>
                        <li>🆓 Completely free to use</li>
                        <li>🔧 Automatic text preprocessing</li>
                    </ul>
                </div>
                """)

        # Read-only box that receives the status message returned by
        # text_to_speech_interface
        status_text = gr.Textbox(
            label="📊 Status",
            interactive=False,
            value="Ready to generate speech!"
        )

        # Audio output; type="filepath" matches the file path returned by
        # text_to_speech_interface
        audio_output = gr.Audio(
            label="🔊 Generated Speech",
            type="filepath"
        )

        # Event handlers: the output order matches the (audio_path, message)
        # tuple returned by text_to_speech_interface
        generate_btn.click(
            fn=text_to_speech_interface,
            inputs=[text_input],
            outputs=[audio_output, status_text]
        )

        # Example texts offered for the text_input box
        gr.Examples(
            examples=[
                ["Hello! This is a test of the long-form text-to-speech system. It can handle texts of any length by intelligently splitting them into smaller chunks while maintaining natural flow and pronunciation."],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet and is commonly used for testing text-to-speech systems."],
                ["In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort."]
            ],
            inputs=[text_input]
        )

        # Static "how it works" footer
        gr.HTML("""
        <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 5px;">
            <h4>🔧 How it works:</h4>
            <ol>
                <li><strong>Text Preprocessing:</strong> Cleans and normalizes your input text</li>
                <li><strong>Smart Chunking:</strong> Splits long text at sentence boundaries</li>
                <li><strong>Speech Generation:</strong> Uses Microsoft's SpeechT5 model for each chunk</li>
                <li><strong>Audio Merging:</strong> Combines all chunks with natural pauses</li>
            </ol>
            <p><em>💡 Tip: The system works best with well-formatted text with proper punctuation!</em></p>
        </div>
        """)

    return demo

# Launch the app when this file is executed as a script
if __name__ == "__main__":
    # Server settings: listen on all network interfaces at port 7860 and
    # also request a public Gradio share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)