import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from scipy.signal import resample
# Load model and processor
model_id = "facebook/wav2vec2-large-960h-lv60-self"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
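# This checkpoint is fine-tuned on 960 h of 16 kHz LibriSpeech audio and emits
# character-level CTC logits, so it expects mono float waveforms at 16 kHz.
model.eval()  # inference only (from_pretrained already returns eval mode)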
# Transcription function
def transcribe(audio_tuple):
    if audio_tuple is None:
        return "⚠️ No audio received."
    # Gradio's numpy Audio component yields (sample_rate, data), in that order
    sample_rate, audio = audio_tuple
    if sample_rate is None or audio is None:
        return "⚠️ Audio or sample rate missing."
    # Downmix stereo (samples, channels) to mono
    if len(audio.shape) == 2:
        audio = np.mean(audio, axis=1)
    # Resample if not 16 kHz
    if sample_rate != 16000:
        number_of_samples = round(len(audio) * 16000 / sample_rate)
        audio = resample(audio, number_of_samples)
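    # Note: scipy.signal.resample is FFT-based, which is fine for short clips;
    # torchaudio.transforms.Resample is a common alternative for longer audio.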
    # Convert to float32 and peak-normalize to [-1, 1]
    # (microphone input typically arrives as int16 PCM, so raw values are large)
    audio = audio.astype(np.float32)
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    # Extract model inputs from the raw waveform
    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
    # Inference
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy (argmax) CTC decoding; batch_decode collapses repeats and blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()
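
# Optional local smoke test (not part of the original Space; the 440 Hz tone
# is an assumed stand-in input that exercises the resampling path):
#
#   sr = 48000
#   tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
#   print(transcribe((sr, tone)))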
# Gradio interface
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="numpy", label="🎤 Speak a word or letter"),
    outputs=gr.Textbox(label="📝 Transcription"),
    title="🔤 Wav2Vec2 Speech Transcriber",
    description="Speak into the mic and get a transcription from Hugging Face Wav2Vec2.",
)

demo.launch()