import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces
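# `spaces` is the Hugging Face Spaces helper package (ZeroGPU): the
# @spaces.GPU decorator below requests a GPU worker for each decorated call
# when the app runs on Spaces, and should be a harmless no-op elsewhere.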
#### Functions
@spaces.GPU
def process_transcript(language: str, audio_path: str) -> str:
    """Transcribe the audio file and return the text.

    Args:
        language: The language of the audio.
        audio_path: The path to the audio file.

    Returns:
        The transcribed text of the audio.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    # Build a dedicated transcription request for the selected language
    # (`apply_transcrition_request` is the method name as shipped in
    # transformers' Voxtral support, typo included).
    id_language = dict_languages[language]
    inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Decode only the newly generated tokens, skipping the prompt portion.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
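# Example call outside the UI (assuming the bundled demo file and a GPU):
# print(process_transcript("English", "mapo_tofu.mp3"))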
###
@spaces.GPU
def process_translate(language: str, audio_path: str) -> str:
    """Translate the audio file into the selected language."""
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    # Request the translation through the chat template, attaching the audio.
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": audio_path,
                },
                {"type": "text", "text": "Translate this into " + language},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
###
@spaces.GPU
def process_chat(question: str, audio_path: str) -> str:
    """Answer a free-form question about the audio file."""
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": audio_path,
                },
                {"type": "text", "text": question},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    # A smaller budget than MAX_TOKENS: chat answers are expected to be short.
    outputs = model.generate(**inputs, max_new_tokens=500)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
###
def disable_buttons():
    """Lock the three task buttons while a request is running."""
    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    """Unlock the three task buttons once a request has finished."""
    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
###
### Initializations
MAX_TOKENS = 32000  # matches Voxtral's 32k-token context length
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")
model_name = 'mistralai/Voxtral-Mini-3B-2507'
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
# Supported languages
dict_languages = {"English": "en",
                  "French": "fr",
                  "German": "de",
                  "Spanish": "es",
                  "Italian": "it",
                  "Portuguese": "pt",
                  "Dutch": "nl",
                  "Hindi": "hi"}
#### Gradio interface
with gr.Blocks(title="Voxtral") as voxtral:
    gr.Markdown("# **Voxtral Mini Evaluation**")
    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
capabilities while retaining best-in-class text performance.
#### It excels at speech transcription, translation and audio understanding.""")
    with gr.Accordion("🔎 More on Voxtral", open=False):
        gr.Markdown("""## **Key Features:**
#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
##### - **Long-form context**: With a 32k-token context length, Voxtral handles audio up to 30 minutes for transcription, or 40 minutes for understanding
##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
    gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
    gr.Markdown("### *(Voxtral handles audio up to 30 minutes for transcription)*")
    with gr.Row():
        sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath",
                             label="Provide an audio file to process:")
        example = [["mapo_tofu.mp3"]]
        gr.Examples(
            examples=example,
            inputs=sel_audio,
            outputs=None,
            fn=None,
            cache_examples=False,
            run_on_click=False
        )
    with gr.Row():
        gr.Markdown("### **2. Choose one of these tasks:**")
    with gr.Row():
        with gr.Column():
            with gr.Accordion("📝 Transcription", open=True):
                sel_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language of the audio file:"
                )
                submit_transcript = gr.Button("Extract transcription", variant="primary")
                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
        with gr.Column():
            with gr.Accordion("🔁 Translation", open=True):
                sel_translate_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language for translation:"
                )
                submit_translate = gr.Button("Translate audio file", variant="primary")
                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
        with gr.Column():
            with gr.Accordion("🤖 Ask the audio file", open=True):
                question_chat = gr.Textbox(label="Enter your question about the audio file:",
                                           placeholder="Enter your question about the audio file")
                submit_chat = gr.Button("Ask the audio file", variant="primary")
                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
                gr.Examples(
                    examples=example_chat,
                    inputs=question_chat,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
    ### Processing
    # Transcription: lock all three buttons, run the task, then unlock them.
    submit_transcript.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_transcript,
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
    # Translation
    submit_translate.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_translate,
        inputs=[sel_translate_language, sel_audio],
        outputs=text_translate
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
    # Chat
    submit_chat.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_chat,
        inputs=[question_chat, sel_audio],
        outputs=text_chat
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
### Launch the app
if __name__ == "__main__":
    voxtral.queue().launch()
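# Local usage sketch (assumption: running outside Hugging Face Spaces, where
# @spaces.GPU should be a no-op): start the app with `python app.py`, or pass
# share=True to launch() for a temporary public link:
# voxtral.queue().launch(share=True)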