"""Gradio demo: Vietnamese speech-to-text using a pretrained Wav2Vec2 CTC model."""

import gradio as gr
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Hugging Face Hub id shared by the processor and the acoustic model.
MODEL_NAME = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
# The only sample rate (in Hz) that `transcribe` accepts.
TARGET_SAMPLE_RATE = 16000

# Load model
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)


def transcribe(audio):
    """Transcribe an audio file to text.

    Args:
        audio: Path to an audio file readable by ``soundfile``. May be
            ``None`` or empty when the UI is submitted without a file.

    Returns:
        The decoded transcription, or a user-facing message (in Vietnamese)
        when no file was provided or the file is not sampled at 16 kHz.
        An empty string is returned when the file contains no samples.
    """
    # The UI can invoke the callback without any file selected; reading a
    # missing path would raise instead of giving the user a useful hint.
    if not audio:
        return "Vui lòng tải lên một file audio."

    # Load audio
    speech, rate = sf.read(audio)
    if rate != TARGET_SAMPLE_RATE:
        return "Vui lòng cung cấp file audio 16kHz."

    # soundfile returns a (frames, channels) array for multi-channel files;
    # average the channels so the processor receives one 1-D waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Nothing to transcribe; skip the model call on a zero-length signal.
    if speech.size == 0:
        return ""

    # Preprocess and predict
    inputs = processor(
        speech,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token id at every frame and
    # let the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


# Gradio UI
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload audio (16kHz, mono)"),
    outputs="text",
    title="Wav2Vec2 Vietnamese STT",
    description="Nhận dạng giọng nói tiếng Việt bằng mô hình wav2vec2-base-vietnamese-250h từ VLSP.",
)
demo.launch()