"""
Wav2Vec2 XLS-R 1B Portuguese - Hugging Face Space
"""

import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import warnings

warnings.filterwarnings("ignore")

# Initialize model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese"

print(f"Loading model {model_name}...")
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to(device)
model.eval()
print(f"Model loaded on device: {device}")
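# A hedged alternative, not used by this Space: transformers' high-level
# pipeline can drive the same checkpoint and handles resampling and CTC
# decoding internally (file inputs require ffmpeg):
#
#   from transformers import pipeline
#   asr = pipeline("automatic-speech-recognition", model=model_name,
#                  device=0 if torch.cuda.is_available() else -1)
#   text = asr("audio.wav")["text"]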

def transcribe_audio(audio_path):
    """Transcribe audio using Wav2Vec2"""
    try:
        # Load the clip as mono and resample to 16 kHz, the rate
        # Wav2Vec2 models expect
        speech_array, _ = librosa.load(audio_path, sr=16000, mono=True)
        
        # Prepare model input tensors (a batch of one waveform)
        inputs = processor(
            speech_array,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        
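        # Move the input tensors onto the same device as the model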
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            logits = model(**inputs).logits
            
        # Greedy CTC decode: take the most likely token per frame; the
        # tokenizer then collapses repeats and blank tokens into text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        
        # Rough confidence: mean of the per-frame maximum probabilities.
        # (A single global max would report only the most certain frame,
        # which is nearly always ~100%.)
        probs = torch.softmax(logits, dim=-1)
        confidence = probs.max(dim=-1).values.mean().item()
        
        return transcription, confidence
        
    except Exception as e:
        return f"Error: {str(e)}", 0.0
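
# Note (an assumption about usage, not from the original code): each clip is
# transcribed in one forward pass, so multi-minute recordings can exhaust
# memory. A common mitigation is to window the waveform first, e.g.:
#
#   CHUNK = 16000 * 30  # 30 s at 16 kHz
#   texts = [transcribe_chunk(speech[i:i + CHUNK])  # hypothetical helper
#            for i in range(0, len(speech), CHUNK)]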

def process_audio(audio):
    """Process audio input from Gradio"""
    if audio is None:
        return "Please provide an audio file.", ""
    
    transcription, confidence = transcribe_audio(audio)
    
    # Format output
    output = f"**Transcription:** {transcription}\n\n"
    output += f"**Confidence:** {confidence:.2%}"
    
    return output, transcription

# Create Gradio interface
with gr.Blocks(title="Wav2Vec2 XLS-R 1B Portuguese") as demo:
    gr.Markdown("# 🎙️ Wav2Vec2 XLS-R 1B - Portuguese ASR")
    gr.Markdown("Speech recognition for Portuguese using jonatasgrosman/wav2vec2-xls-r-1b-portuguese")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )
            
            submit_btn = gr.Button("Transcribe", variant="primary")
            
        with gr.Column():
            output_text = gr.Markdown(label="Results")
            transcription_output = gr.Textbox(
                label="Transcription Text",
                lines=3,
                interactive=False
            )
    
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, transcription_output]
    )
    
    # No examples section: bundled example clips caused FileNotFoundError on Spaces

# Launch the app - let Hugging Face Spaces handle the configuration
if __name__ == "__main__":
    demo.launch()  # no explicit server_name/server_port; HF Spaces supplies its own
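
# Local testing outside Spaces (assumption): `python app.py` serves the UI on
# Gradio's defaults, http://127.0.0.1:7860.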