# app.py
import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import scipy.signal
from transformers import pipeline
# Load the ASR model once at startup (Wav2Vec2 large, trained on 960 h of LibriSpeech)
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
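# A minimal sketch (an assumption, not part of the original app) of explicit
# resampling: facebook/wav2vec2-large-960h expects 16 kHz input, so audio at
# other rates should be resampled before inference. `resample_for_asr` is a
# hypothetical helper name.
def resample_for_asr(y, sr, target_sr=16000):
    if sr == target_sr:
        return y, sr
    # librosa.resample handles arbitrary rate conversion on float arrays
    return librosa.resample(y, orig_sr=sr, target_sr=target_sr), target_sr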
# === DSP & feature extraction functions ===
def lowpass_filter(y, sr, cutoff=7000):
    nyq = 0.5 * sr
    # Clamp below Nyquist so butter() stays valid at low sample rates (e.g. 8 kHz)
    norm_cutoff = min(cutoff / nyq, 0.99)
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    y_filt = scipy.signal.lfilter(b, a, y)
    return y_filt
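# A minimal sketch of how to inspect the filter's magnitude response with
# scipy.signal.freqz; `plot_filter_response` is a hypothetical helper, not
# used by the demo, assuming the same Butterworth design as above.
def plot_filter_response(sr, cutoff=7000, order=6):
    b, a = scipy.signal.butter(order, min(cutoff / (0.5 * sr), 0.99), btype='low')
    w, h = scipy.signal.freqz(b, a, worN=2048, fs=sr)  # fs=sr puts w in Hz
    fig, ax = plt.subplots()
    ax.plot(w, 20 * np.log10(np.maximum(np.abs(h), 1e-12)))  # gain in dB
    ax.set(title="Low-pass filter response", xlabel="Frequency (Hz)", ylabel="Gain (dB)")
    return fig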
def compute_mfcc(y, sr, n_mfcc=13):
    # Pre-emphasis to flatten the spectral tilt
    y = scipy.signal.lfilter([1, -0.97], [1], y)
    # Standard speech framing: 25 ms window, 10 ms hop
    hop_length = int(0.010 * sr)
    win_length = int(0.025 * sr)
    # FFT size: next power of two that covers the window
    n_fft = 2 ** int(np.ceil(np.log2(win_length)))
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                hop_length=hop_length, win_length=win_length,
                                n_fft=n_fft)
    # Δ and ΔΔ (first- and second-order temporal derivatives)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Per-coefficient mean & variance normalization (CMVN)
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
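# A minimal sketch of how the three streams are commonly combined into one
# feature matrix for a downstream model (not used by the demo itself);
# `stack_features` is a hypothetical helper name.
def stack_features(mfcc, delta, delta2):
    # Shape (3 * n_mfcc, n_frames): static, Δ, and ΔΔ rows stacked per frame
    return np.vstack([mfcc, delta, delta2])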
def plot_features(y, sr, mfcc, delta, delta2):
    fig, axs = plt.subplots(5, 1, figsize=(10, 12))
    # Waveform
    axs[0].set(title="Waveform")
    librosa.display.waveshow(y, sr=sr, ax=axs[0])
    # Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, hop_length=256, x_axis='time', y_axis='mel', ax=axs[1])
    axs[1].set(title="Mel Spectrogram")
    # MFCC (same 10 ms hop as compute_mfcc, so the time axis lines up)
    hop_length = int(0.010 * sr)
    librosa.display.specshow(mfcc, sr=sr, hop_length=hop_length, x_axis='time', ax=axs[2])
    axs[2].set(title="MFCC (normalized)")
    # Δ
    librosa.display.specshow(delta, sr=sr, hop_length=hop_length, x_axis='time', ax=axs[3])
    axs[3].set(title="Delta features")
    # ΔΔ
    librosa.display.specshow(delta2, sr=sr, hop_length=hop_length, x_axis='time', ax=axs[4])
    axs[4].set(title="Delta-delta features")
    plt.tight_layout()
    return fig
def process(audio):
    if audio is None:
        return "No audio provided.", plt.figure()
    sr, y = audio
    if y is None or len(y) == 0:
        return "Audio data is empty.", plt.figure()
    # Gradio delivers integer PCM, possibly stereo; convert to mono float in [-1, 1]
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    # Low-pass filter
    y_filt = lowpass_filter(y, sr)
    # Compute features
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)
    # Run ASR (Wav2Vec2); passing sampling_rate lets the pipeline resample
    # to the model's 16 kHz rate
    text = asr({"array": y_filt, "sampling_rate": sr})["text"]
    # Plot
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)
    return text, fig
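# A minimal sketch (an assumption, not part of the original app) for long
# recordings: the transformers ASR pipeline can transcribe in overlapping
# chunks when chunk_length_s / stride_length_s are passed at call time.
# `transcribe_long` is a hypothetical helper name.
def transcribe_long(y, sr, chunk_s=30):
    return asr({"array": y, "sampling_rate": sr},
               chunk_length_s=chunk_s,
               stride_length_s=chunk_s / 6)["text"]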
# === Gradio UI ===
demo = gr.Interface(
    fn=process,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Features Visualization")
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → low-pass filter → MFCC + Δ + ΔΔ → Wav2Vec2 transcription → feature plots"
)
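# Optional (an assumption, not in the original app): Wav2Vec2 inference can be
# slow on CPU, so Gradio's built-in request queue helps avoid timeouts:
# demo = demo.queue()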
if __name__ == "__main__":
    demo.launch()