import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load both models and their processors
voxtral_mini_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers")
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
}
@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
"""Process audio with selected Voxtral model and return the generated response.
This function takes an audio file and processes it using the selected Voxtral model
to generate a transcription in the specified language.
Args:
audio_path: Path to the audio file to be transcribed.
model_name: Name of the Voxtral model to use ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
lang_name: Name of the language for transcription (e.g., "English", "French", etc.).
max_tokens: Maximum number of tokens to generate in the output (default: 500).
Returns:
String containing the transcribed text from the audio file, or an error message
if the audio file is missing or an invalid model is selected.
"""
if not audio_path:
return "Please upload an audio file."
if model_name == "Voxtral Mini (3B)":
model = voxtral_mini_model
processor = voxtral_mini_processor
repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
elif model_name == "Voxtral Small (24B)":
model = voxtral_small_model
processor = voxtral_small_processor
repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
else:
return "Invalid model selected."
language = LANGUAGES[lang_name]
inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)
outputs = model.generate(**inputs, max_new_tokens=max_tokens)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
return decoded_outputs[0]
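# Quick local check (sketch): the handler can be called directly with one of the bundled
# example clips; any audio path on disk would work as well.
#   print(process_audio("examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", max_tokens=200))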
# Define Gradio interface
with gr.Blocks(title="Voxtral Demo") as demo:
gr.Markdown("# Voxtral Transcription Demo")
gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")
with gr.Row():
with gr.Column():
audio_input = gr.Audio(type="filepath", label="Upload Audio")
model_selector = gr.Dropdown(
choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
value="Voxtral Mini (3B)",
label="Select Model"
)
language = gr.Dropdown(
choices=list(LANGUAGES.keys()),
value="English",
label="Language"
)
max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
submit_btn = gr.Button("Extract Transcription", variant="primary")
with gr.Column():
output_text = gr.Textbox(label="Generated Response", lines=10)
submit_btn.click(
fn=process_audio,
inputs=[audio_input, model_selector, language, max_tokens],
outputs=output_text
)
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentary (Arabic, 11s)",
        ]
    )
# Launch the app; mcp_server=True also exposes the app's API endpoints as MCP tools
# via Gradio's built-in MCP server.
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False, mcp_server=True)