"""Gradio app: detect the English accent in a video, transcribe it, and have
an LLM rate how interview-ready the speaker sounds.

Pipeline: direct-MP4 URL -> download -> audio extraction -> SpeechBrain accent
classification -> Whisper transcription -> Together LLM analysis.
"""

import os
import tempfile

import gradio as gr
import requests
import whisper
from moviepy import VideoFileClip
from speechbrain.inference.interfaces import foreign_class
from together import Together

# Load the heavyweight models once at import time so every request reuses them.
_whisper_model = whisper.load_model("base")

_classifier = foreign_class(
    source="warisqr7/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)


def download_video(url: str) -> str:
    """Download a direct MP4 URL to a temp file and return the file's path.

    The file is created with ``delete=False`` so it survives this function;
    the caller is responsible for removing it.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    # Context manager guarantees the handle is closed even if a write fails
    # (the original left the file open on error).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
        for chunk in resp.iter_content(8192):
            tmp.write(chunk)
        return tmp.name


def extract_audio(video_path: str) -> str:
    """Extract the audio track of *video_path* to a temp MP3; return its path.

    The caller is responsible for removing the returned file.
    """
    tmp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
    clip = VideoFileClip(video_path)
    try:
        clip.audio.write_audiofile(tmp_audio, logger=None)
    finally:
        # Release the ffmpeg reader even if the audio write fails.
        clip.close()
    return tmp_audio


def analyze_url(video_url):
    """Run the full pipeline on a direct MP4 URL.

    Returns:
        A 4-tuple ``(accent, confidence, transcript, analysis)``.  On any
        failure returns ``("Error", "", "", message)`` so the Gradio UI can
        display the problem instead of crashing.
    """
    vid = aud = None
    try:
        # 1. Download & extract
        vid = download_video(video_url)
        aud = extract_audio(vid)

        # 2. Accent classification
        out_prob, score, idx, lab = _classifier.classify_file(aud)
        accent = lab[0]
        conf_pct = round(float(score) * 100, 2)

        # 3. Transcription
        result = _whisper_model.transcribe(aud)
        transcript = result["text"]

        # 4. LLM analysis
        api_key = os.getenv('API_KEY')
        client = Together(api_key=api_key)
        prompt = f"""
You are an English-speaking coach.
Given this transcript of a spoken English audio with an {accent} accent and classification confidence {conf_pct}%:
\"\"\"{transcript}\"\"\"
Evaluate how confident the speaker sounds for a job interview based on fluency, clarity, filler usage, professional English, and pacing.
Provide:
- A proficiency score between 0 and 100
- A brief explanation
- Give Bullet points, but nothing in bold.
"""
        llm_resp = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            messages=[{"role": "user", "content": prompt}],
        )
        analysis = llm_resp.choices[0].message.content.strip()

        return accent, f"{conf_pct}%", transcript, analysis
    except Exception as e:
        # Top-level UI boundary: surface the failure message in the outputs.
        return "Error", "", "", str(e)
    finally:
        # Clean up temp files on success *and* failure — the original only
        # removed them on the success path, leaking files on every error.
        for path in (vid, aud):
            if path and os.path.exists(path):
                os.remove(path)


# Build Gradio interface
with gr.Blocks(title="English Accent & Proficiency Analyzer") as demo:
    gr.Markdown("## 🎙️ English Accent Detection & Proficiency Analysis")
    with gr.Row():
        inp = gr.Textbox(label="Direct MP4 Video URL", placeholder="https://...")
        run = gr.Button("Analyze")
    with gr.Row():
        out1 = gr.Textbox(label="Detected Accent")
        out2 = gr.Textbox(label="Accent Classification Confidence Score")
        out3 = gr.Textbox(label="Transcript", lines=5)
        out4 = gr.Textbox(label="Proficiency Analysis", lines=10)
    run.click(
        fn=analyze_url,
        inputs=inp,
        outputs=[out1, out2, out3, out4],
        api_name="analyze",
    )

if __name__ == "__main__":
    demo.launch()