# Hugging Face Space: Voxtral transcription demo (running on ZeroGPU hardware).
# (Original page header listed the Space status, file size ~5.4 KB, and commit history.)
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
# Pick the accelerator once at import time; every model and input tensor below
# is placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and processor
# Both Voxtral checkpoints are loaded eagerly at module import in bfloat16 so
# the first user request does not pay the download/initialization cost.
# NOTE(review): loading the 24B model too presumably requires a large-memory
# GPU — confirm the Space hardware can hold both models at once.
voxtral_mini_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers")
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)

voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
# Display name -> ISO 639-1 code mapping; keys populate the language dropdown
# and values are passed to the processor's transcription request.
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
}
@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Transcribe an audio file with the selected Voxtral model.

    Args:
        audio_path: Path to the audio file to be transcribed.
        model_name: Name of the Voxtral model to use
            ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
        lang_name: Display name of the transcription language
            (a key of ``LANGUAGES``, e.g. "English", "French").
        max_tokens: Maximum number of new tokens to generate (default: 500).

    Returns:
        The transcribed text, or a human-readable error message when the
        audio file is missing or an invalid model/language is selected.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Mirror the friendly-error style used for the model check instead of
    # letting an unknown dropdown value raise a bare KeyError.
    if lang_name not in LANGUAGES:
        return "Invalid language selected."
    language = LANGUAGES[lang_name]

    # Older transformers releases ship this method under a typo'd name
    # ("apply_transcrition_request"); prefer the corrected spelling when the
    # installed version provides it, falling back to the legacy one.
    build_request = getattr(
        processor,
        "apply_transcription_request",
        getattr(processor, "apply_transcrition_request", None),
    )
    if build_request is None:
        return "Installed transformers version does not support transcription requests."

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    # Move inputs to the model's device in bfloat16 to match the model weights.
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    # Decode only the newly generated tokens, skipping the prompt portion.
    decoded_outputs = processor.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    return decoded_outputs[0]
# Define Gradio interface
# NOTE: component creation order inside the nested `with` contexts determines
# the page layout, so statements here must not be reordered.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            # Left column: inputs — audio upload, model/language pickers,
            # generation token budget, and the submit button.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model"
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language"
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")
        with gr.Column():
            # Right column: transcription output.
            output_text = gr.Textbox(label="Generated Response", lines=10)

    # Wire the submit button to the transcription function.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text
    )

    # Clickable example rows; audio paths are relative to the Space repo
    # (presumably an `examples/` directory checked into it — verify).
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentry (Arabic, 11s)",
        ]
    )
# Launch the app
if __name__ == "__main__":
    # Removed a stray trailing "|" (web-scrape artifact) that made this line a
    # syntax error. Queueing is enabled for ZeroGPU; the app is also exposed as
    # an MCP server.
    demo.queue().launch(share=False, ssr_mode=False, mcp_server=True)