import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from scipy.signal import resample
# Load model
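# Note: Wav2Vec2Processor and Wav2Vec2ForCTC expect a wav2vec2 checkpoint trained with a CTC head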
processor = Wav2Vec2Processor.from_pretrained("Menlo/Ichigo-whisper-v0.1")
model = Wav2Vec2ForCTC.from_pretrained("Menlo/Ichigo-whisper-v0.1")
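# Optional speed-up: torch.compile (PyTorch 2.x) compiles the forward pass on its first call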
model = torch.compile(model)
def transcribe(audio):
    if audio is None:
        return "No audio recorded."
    # Gradio's numpy audio component returns (sample_rate, int16 ndarray)
    sample_rate, audio_data = audio
    # Convert 16-bit PCM to float32 in [-1, 1], which the processor expects
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    else:
        audio_data = audio_data.astype(np.float32)
    # Collapse stereo recordings to mono
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    target_rate = 16000
    # Resample to 16 kHz if the recording uses a different rate
    if sample_rate != target_rate:
        duration = len(audio_data) / sample_rate
        new_length = int(duration * target_rate)
        audio_data = resample(audio_data, new_length)
    # Inference: CTC logits -> greedy argmax -> decoded text
    inputs = processor(audio_data, sampling_rate=target_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
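
# Quick offline sanity check (illustrative only; the silent clip below is synthetic and
# not part of the app). When this file is run directly, it transcribes one second of
# int16 silence before the UI launches and should print an empty or near-empty string.
if __name__ == "__main__":
    silent_clip = (16000, np.zeros(16000, dtype=np.int16))  # mimics a Gradio microphone payload
    print("Sanity check transcription:", repr(transcribe(silent_clip)))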
# Gradio UI
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="numpy", label="Record from the microphone (16 kHz mono)"),
    outputs="text",
    title="Vietnamese Speech-to-Text with Wav2Vec2",
    description="Record audio and transcribe Vietnamese speech with the Menlo/Ichigo-whisper-v0.1 checkpoint"
).launch()