vankienemk's picture
Update app.py
d0b7fd5 verified
raw
history blame
1.08 kB
import gradio as gr
import torch
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Checkpoint identifier shared by the processor and the acoustic model.
MODEL_ID = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"

# Both objects are created once at module import so every request reuses them.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
def transcribe(audio):
    """Transcribe a 16 kHz audio file to text with the module-level model.

    Args:
        audio: Path to an audio file readable by ``soundfile``, or ``None``
            when no file was supplied.

    Returns:
        The decoded transcription string. A Vietnamese user-facing message
        is returned instead when no file was supplied or when the file's
        sample rate is not 16000 Hz. An empty string is returned for a file
        that contains no samples.
    """
    # Nothing was supplied: answer with a prompt instead of failing in sf.read.
    if audio is None:
        return "Vui lòng tải lên một file audio."

    # Load audio
    speech, rate = sf.read(audio)
    if rate != 16000:
        return "Vui lòng cung cấp file audio 16kHz."

    # soundfile returns a (frames, channels) array for multi-channel files;
    # average the channels so a single 1-D waveform reaches the processor.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # No samples means there is nothing to recognise.
    if speech.size == 0:
        return ""

    # Preprocess and predict
    inputs = processor(speech, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    # Greedy CTC decoding: pick the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
# Gradio UI: one audio input (passed to `transcribe` as a file path) and a
# plain-text output for the transcription.
audio_input = gr.Audio(type="filepath", label="Upload audio (16kHz, mono)")

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
    title="Wav2Vec2 Vietnamese STT",
    description="Nhận dạng giọng nói tiếng Việt bằng mô hình wav2vec2-base-vietnamese-250h từ VLSP.",
)

# Start the web app as soon as the module is executed.
demo.launch()