# Hugging Face Space: Voxtral transcription demo (running on ZeroGPU hardware).
# (Original page header listed the Space status, file size ~5.4 KB, and commit history.)
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
# Pick the accelerator once at import time; every model and input tensor below
# is placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and processor
# Both Voxtral checkpoints are loaded eagerly at module import in bfloat16 so
# the first user request does not pay the download/initialization cost.
# NOTE(review): loading the 24B model too presumably requires a large-memory
# GPU — confirm the Space hardware can hold both models at once.
voxtral_mini_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers")
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)

voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
# Display name -> ISO 639-1 code mapping; keys populate the language dropdown
# and values are passed to the processor's transcription request.
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
}
@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Transcribe an audio file with the selected Voxtral model.

    Args:
        audio_path: Path to the audio file to be transcribed.
        model_name: Name of the Voxtral model to use
            ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
        lang_name: Display name of the transcription language
            (a key of ``LANGUAGES``, e.g. "English", "French").
        max_tokens: Maximum number of new tokens to generate (default: 500).

    Returns:
        The transcribed text, or a human-readable error message when the
        audio file is missing or an invalid model/language is selected.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Mirror the friendly-error style used for the model check instead of
    # letting an unknown dropdown value raise a bare KeyError.
    if lang_name not in LANGUAGES:
        return "Invalid language selected."
    language = LANGUAGES[lang_name]

    # Older transformers releases ship this method under a typo'd name
    # ("apply_transcrition_request"); prefer the corrected spelling when the
    # installed version provides it, falling back to the legacy one.
    build_request = getattr(
        processor,
        "apply_transcription_request",
        getattr(processor, "apply_transcrition_request", None),
    )
    if build_request is None:
        return "Installed transformers version does not support transcription requests."

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    # Move inputs to the model's device in bfloat16 to match the model weights.
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    # Decode only the newly generated tokens, skipping the prompt portion.
    decoded_outputs = processor.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    return decoded_outputs[0]
# Define Gradio interface
# NOTE: component creation order inside the nested `with` contexts determines
# the page layout, so statements here must not be reordered.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            # Left column: inputs — audio upload, model/language pickers,
            # generation token budget, and the submit button.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model"
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language"
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")
        with gr.Column():
            # Right column: transcription output.
            output_text = gr.Textbox(label="Generated Response", lines=10)

    # Wire the submit button to the transcription function.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text
    )

    # Clickable example rows; audio paths are relative to the Space repo
    # (presumably an `examples/` directory checked into it — verify).
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentry (Arabic, 11s)",
        ]
    )
# Launch the app
if __name__ == "__main__":
    # Removed a stray trailing "|" (web-scrape artifact) that made this line a
    # syntax error. Queueing is enabled for ZeroGPU; the app is also exposed as
    # an MCP server.
    demo.queue().launch(share=False, ssr_mode=False, mcp_server=True)