""" Wav2Vec2 XLS-R 1B Portuguese - Hugging Face Space """ import gradio as gr import torch import librosa import numpy as np from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor import warnings warnings.filterwarnings("ignore") # Initialize model and processor device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_name = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese" print(f"Loading model {model_name}...") processor = Wav2Vec2Processor.from_pretrained(model_name) model = Wav2Vec2ForCTC.from_pretrained(model_name) model.to(device) model.eval() print(f"Model loaded on device: {device}") def transcribe_audio(audio_path): """Transcribe audio using Wav2Vec2""" try: # Load and preprocess audio speech_array, sampling_rate = librosa.load(audio_path, sr=16000, mono=True) # Process with model inputs = processor( speech_array, sampling_rate=16000, return_tensors="pt", padding=True ) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): logits = model(**inputs).logits # Decode predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.decode(predicted_ids[0]) # Calculate confidence probs = torch.softmax(logits, dim=-1) confidence = torch.max(probs).item() return transcription, confidence except Exception as e: return f"Error: {str(e)}", 0.0 def process_audio(audio): """Process audio input from Gradio""" if audio is None: return "Please provide an audio file.", "" transcription, confidence = transcribe_audio(audio) # Format output output = f"**Transcription:** {transcription}\n\n" output += f"**Confidence:** {confidence:.2%}" return output, transcription # Create Gradio interface with gr.Blocks(title="Wav2Vec2 XLS-R 1B Portuguese") as demo: gr.Markdown("# 🎙️ Wav2Vec2 XLS-R 1B - Portuguese ASR") gr.Markdown("Speech recognition for Portuguese using jonatasgrosman/wav2vec2-xls-r-1b-portuguese") with gr.Row(): with gr.Column(): audio_input = gr.Audio( sources=["upload", "microphone"], type="filepath", label="Audio Input" ) submit_btn = gr.Button("Transcribe", variant="primary") with gr.Column(): output_text = gr.Markdown(label="Results") transcription_output = gr.Textbox( label="Transcription Text", lines=3, interactive=False ) submit_btn.click( fn=process_audio, inputs=[audio_input], outputs=[output_text, transcription_output] ) # Examples section removed - was causing FileNotFoundError # Launch the app - let Hugging Face Spaces handle the configuration if __name__ == "__main__": demo.launch() # Remove server_name and server_port for HF Spaces compatibility