Spaces:
Running
Running
import gradio as gr | |
import torch | |
import soundfile as sf | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
# Pretrained checkpoint identifier shared by the processor and the CTC model.
MODEL_ID = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"

# Both objects are created once at import time; transcribe() reads them as globals.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
def transcribe(audio):
    """Transcribe a 16 kHz speech recording to text.

    Args:
        audio: Path to an audio file readable by ``soundfile``. ``None`` or
            an empty string means no file was supplied.

    Returns:
        The decoded transcription string, or a Vietnamese user-facing
        message when no audio was supplied or the file is not sampled at
        16 kHz.
    """
    # No upload: answer with a message instead of letting sf.read() fail.
    if not audio:
        return "Vui lòng tải lên file audio."

    speech, rate = sf.read(audio)
    if rate != 16000:
        return "Vui lòng cung cấp file audio 16kHz."

    # Multi-channel files are read as (frames, channels); the model takes a
    # single waveform, so average the channels down to mono.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Preprocess and predict.
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
# Gradio UI: one audio-upload input wired to transcribe(), plain-text output.
audio_input = gr.Audio(type="filepath", label="Upload audio (16kHz, mono)")

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
    title="Wav2Vec2 Vietnamese STT",
    description="Nhận dạng giọng nói tiếng Việt bằng mô hình wav2vec2-base-vietnamese-250h từ VLSP.",
)

# Start the web server as soon as the module runs, as the original did.
demo.launch()