"""Gradio demo that transcribes microphone recordings with OpenAI Whisper."""

import os

import gradio as gr
import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

MODEL_ID = "openai/whisper-large-v2"
# The audio is resampled to this rate on load and the same rate is declared
# to the processor, so the two must stay in sync.
SAMPLE_RATE = 16000

# Load the model and the processor once at import time so that every request
# reuses them. Whisper is an encoder-decoder model, so it must be loaded with
# the Whisper classes; the Wav2Vec2 CTC classes cannot load this checkpoint.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID)


def asr(audio_file_path):
    """Transcribe a recorded audio clip to text.

    Args:
        audio_file_path: Path to the audio file, or a file object exposing
            the path through a ``name`` attribute (the audio input below is
            configured with ``type="file"``). ``None`` means nothing was
            recorded.

    Returns:
        The transcription as a string; an empty string when no audio was
        provided.
    """
    if audio_file_path is None:
        return ""

    # Resolve file objects to their on-disk path. Real paths are left alone:
    # ``Path.name`` would otherwise drop the directory part.
    if not isinstance(audio_file_path, (str, os.PathLike)):
        audio_file_path = getattr(audio_file_path, "name", audio_file_path)

    # Load the audio file, resampled to the rate declared to the processor.
    input_audio, _ = librosa.load(audio_file_path, sr=SAMPLE_RATE)

    # Preprocess the waveform into the model's input features.
    # NOTE(review): Whisper works on fixed-length windows, so long recordings
    # are presumably truncated by the processor; confirm if long clips matter.
    input_features = processor(
        input_audio,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
    ).input_features

    # Run inference without tracking gradients: nothing is trained here.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode the generated token ids to text, dropping special tokens.
    transcription = processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription


# Build the Gradio interface.
# NOTE(review): `gr.inputs` is the legacy component namespace; newer Gradio
# releases expose `gr.Audio` directly. Confirm against the installed version.
iface = gr.Interface(
    fn=asr,
    inputs=gr.inputs.Audio(source="microphone", type="file"),
    outputs="text",
)
iface.launch()