# Spaces: Running on Zero
import gradio as gr | |
import spaces | |
import torch | |
from transformers import AutoProcessor, VoxtralForConditionalGeneration | |
# Pick the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


def _load_voxtral(repo_id):
    """Load the processor and bfloat16 model for one Voxtral checkpoint.

    Args:
        repo_id: Hugging Face Hub repository ID of the checkpoint.

    Returns:
        A ``(processor, model)`` tuple; the model is placed on ``device``.
    """
    processor = AutoProcessor.from_pretrained(repo_id)
    model = VoxtralForConditionalGeneration.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
    return processor, model


# Load model and processor (both checkpoints are loaded eagerly at import time)
voxtral_mini_processor, voxtral_mini_model = _load_voxtral(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
)
voxtral_small_processor, voxtral_small_model = _load_voxtral(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers"
)
# Maps the language name shown in the UI to the two-letter code that is
# handed to the processor when building a transcription request.
LANGUAGES = dict(
    English="en",
    French="fr",
    German="de",
    Spanish="es",
    Italian="it",
    Portuguese="pt",
    Dutch="nl",
    Russian="ru",
    Chinese="zh",
    Japanese="ja",
    Arabic="ar",
)
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Process audio with selected Voxtral model and return the generated response.

    This function takes an audio file and processes it using the selected Voxtral model
    to generate a transcription in the specified language.

    Args:
        audio_path: Path to the audio file to be transcribed.
        model_name: Name of the Voxtral model to use ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
        lang_name: Name of the language for transcription (e.g., "English", "French", etc.).
            Must be one of the keys of ``LANGUAGES``.
        max_tokens: Maximum number of tokens to generate in the output (default: 500).

    Returns:
        String containing the transcribed text from the audio file, or an error message
        if the audio file is missing or an invalid model or language is selected.
    """
    # NOTE(review): `spaces` is imported by this file but `spaces.GPU` is not
    # applied to this function; on ZeroGPU hardware GPU work is normally
    # expected to run inside a decorated function -- confirm.
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Report an unknown language the same way as the other invalid inputs
    # instead of letting a KeyError escape to the caller.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    # NOTE(review): the method is called under the misspelled name
    # `apply_transcrition_request` in the original code. Prefer the correctly
    # spelled name when the installed processor provides it and fall back to
    # the misspelled one otherwise -- confirm against the transformers version
    # in use.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # UI sliders and API callers may deliver the limit as a float; generation
    # expects an integer token count.
    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Drop the prompt tokens so that only newly generated tokens are decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
    return decoded_outputs[0]
# Define Gradio interface
# NOTE(review): the nesting below is reconstructed from syntax; the click
# handler and the examples are assumed to sit directly under the Blocks
# context, after the two-column row -- confirm against the deployed layout.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        # Left column: audio upload and transcription settings.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model",
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")

        # Right column: transcription result.
        with gr.Column():
            output_text = gr.Textbox(label="Generated Response", lines=10)

    # The input order matches the positional parameters of process_audio.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text,
    )

    # Each example row is ordered like `inputs`; `example_labels` is given in
    # the same order as `examples`. No `fn`/`outputs` are passed here.
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentary (Arabic, 11s)",
        ],
    )
# Launch the app
if __name__ == "__main__":
    # Turn on the request queue first, then start the server with no public
    # share link, SSR mode off, and the MCP server enabled.
    queued_demo = demo.queue()
    queued_demo.launch(share=False, ssr_mode=False, mcp_server=True)