import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces


#### Functions

@spaces.GPU
def process_transcript(language: str, audio_path: str) -> str:
    """Process the audio file to return its transcription.

    Args:
        language: The language of the audio.
        audio_path: The path to the audio file.

    Returns:
        The transcribed text of the audio.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    id_language = dict_languages[language]
    # Note: "apply_transcrition_request" (sic) is the method name exposed by transformers.
    inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Decode only the newly generated tokens, skipping the prompt.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]

###

@spaces.GPU
def process_translate(language: str, audio_path: str) -> str:
    """Translate the audio file into the selected language.

    Args:
        language: The target language of the translation.
        audio_path: The path to the audio file.

    Returns:
        The translated text of the audio.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "path": audio_path},
                {"type": "text", "text": "Translate this into " + language},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]

###

@spaces.GPU
def process_chat(question: str, audio_path: str) -> str:
    """Answer a free-form question about the audio file.

    Args:
        question: The question to ask about the audio.
        audio_path: The path to the audio file.

    Returns:
        The model's answer.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "path": audio_path},
                {"type": "text", "text": question},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    # Chat answers are capped at 500 new tokens (shorter than the transcription/translation cap).
    outputs = model.generate(**inputs, max_new_tokens=500)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]

###

def disable_buttons():
    """Disable the three task buttons while a request is running."""
    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    """Re-enable the three task buttons once a request has finished."""
    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)

###

### Initializations

MAX_TOKENS = 32000

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")

model_name = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=device)

# Supported languages
dict_languages = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Hindi": "hi",
}
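
# The bfloat16 load above assumes a CUDA device. On a CPU-only machine,
# bfloat16 generation can be very slow; a possible fallback (an untested
# sketch, not part of the original app) is to load the model in float32 and
# cast the inputs to torch.float32 in the handlers accordingly:
#
#   model = VoxtralForConditionalGeneration.from_pretrained(
#       model_name, torch_dtype=torch.float32, device_map="cpu"
#   )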

#### Gradio interface

with gr.Blocks(title="Voxtral") as voxtral:
    gr.Markdown("# **Voxtral Mini Evaluation**")
    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
capabilities while retaining best-in-class text performance.
#### It excels at speech transcription, translation and audio understanding.""")

    with gr.Accordion("🔎 More on Voxtral", open=False):
        gr.Markdown("""## **Key Features:**
#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech-transcription mode to maximize performance. By default, it automatically predicts the source audio language and transcribes the text accordingly.
##### - **Long-form context**: With a 32k-token context length, Voxtral handles audio up to 30 minutes for transcription, or 40 minutes for understanding.
##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio, and can analyze audio and generate structured summaries without separate ASR and language models.
##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world's most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian).
##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents.
##### - **Highly capable at text**: Retains the text-understanding capabilities of its language-model backbone, Ministral-3B.""")

    gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
    gr.Markdown("### *(Voxtral handles audio up to 30 minutes for transcription)*")

    with gr.Row():
        sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio file to process:")
        example = [["mapo_tofu.mp3"]]
        gr.Examples(
            examples=example,
            inputs=sel_audio,
            outputs=None,
            fn=None,
            cache_examples=False,
            run_on_click=False,
        )

    with gr.Row():
        gr.Markdown("### **2. Choose one of these tasks:**")

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📝 Transcription", open=True):
                sel_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language of the audio file:",
                )
                submit_transcript = gr.Button("Extract transcription", variant="primary")
                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)

        with gr.Column():
            with gr.Accordion("🔁 Translation", open=True):
                sel_translate_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language for translation:",
                )
                submit_translate = gr.Button("Translate audio file", variant="primary")
                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)

        with gr.Column():
            with gr.Accordion("🤖 Ask audio file", open=True):
                question_chat = gr.Textbox(
                    label="Enter your question about the audio file:",
                    placeholder="Enter your question about the audio file",
                )
                submit_chat = gr.Button("Ask audio file", variant="primary")
                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
                gr.Examples(
                    examples=example_chat,
                    inputs=question_chat,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False,
                )
                text_chat = gr.Textbox(label="💬 Model answer", lines=10)

    ### Processing
    # Each button first disables all three buttons, then runs its task, then
    # re-enables the buttons.

    # Transcription
    submit_transcript.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_transcript,
        inputs=[sel_language, sel_audio],
        outputs=text_transcript,
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

    # Translation
    submit_translate.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_translate,
        inputs=[sel_translate_language, sel_audio],
        outputs=text_translate,
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
    # Chat
    submit_chat.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_chat,
        inputs=[question_chat, sel_audio],
        outputs=text_chat,
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

### Launch the app
if __name__ == "__main__":
    voxtral.queue().launch()
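
# Headless smoke test (a sketch, not wired into the app): the handlers can be
# called directly, bypassing the UI. "sample.wav" is a hypothetical local file.
#
#   print(process_transcript("English", "sample.wav"))
#   print(process_chat("What is the subject of this audio file?", "sample.wav"))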