# app.py
import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import scipy.signal
from transformers import pipeline

# Load the ASR model once at startup (Wav2Vec2 large, trained on 960 h of
# 16 kHz LibriSpeech audio).
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")

ASR_SAMPLE_RATE = 16000  # Wav2Vec2 expects 16 kHz input


# === DSP & feature extraction functions ===

def lowpass_filter(y, sr, cutoff=7000):
    """6th-order Butterworth low-pass filter."""
    nyq = 0.5 * sr
    # Keep the normalized cutoff strictly below 1.0 so butter() stays valid
    # even at low sample rates.
    norm_cutoff = min(cutoff / nyq, 0.99)
    b, a = scipy.signal.butter(6, norm_cutoff, btype="low", analog=False)
    return scipy.signal.lfilter(b, a, y)


def compute_mfcc(y, sr, n_mfcc=13):
    # Pre-emphasis to boost high frequencies before analysis.
    y = scipy.signal.lfilter([1, -0.97], 1, y)

    # 25 ms windows with a 10 ms hop. n_fft is the next power of two that
    # fits the window: the original hard-coded n_fft=512, which is shorter
    # than a 25 ms window at common sample rates (e.g. 1200 samples at
    # 48 kHz) and would make librosa raise an error.
    hop_length = int(0.010 * sr)
    win_length = int(0.025 * sr)
    n_fft = int(2 ** np.ceil(np.log2(win_length)))

    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=n_mfcc,
        hop_length=hop_length, win_length=win_length, n_fft=n_fft,
    )

    # Δ and ΔΔ (first- and second-order temporal derivatives).
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)

    # Per-coefficient mean & variance normalization (CMVN).
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)

    return mfcc_norm, delta, delta2


def plot_features(y, sr, mfcc, delta, delta2):
    fig, axs = plt.subplots(5, 1, figsize=(10, 12))

    # Waveform
    librosa.display.waveshow(y, sr=sr, ax=axs[0])
    axs[0].set(title="Waveform")

    # Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, hop_length=256,
                             x_axis="time", y_axis="mel", ax=axs[1])
    axs[1].set(title="Mel Spectrogram")

    # MFCCs
    librosa.display.specshow(mfcc, x_axis="time", ax=axs[2])
    axs[2].set(title="MFCC (normalized)")

    # Δ
    librosa.display.specshow(delta, x_axis="time", ax=axs[3])
    axs[3].set(title="Delta features")

    # ΔΔ (computed but never plotted in the original)
    librosa.display.specshow(delta2, x_axis="time", ax=axs[4])
    axs[4].set(title="Delta-delta features")

    plt.tight_layout()
    return fig


def process(audio):
    if audio is None:
        return "No audio provided.", None
    sr, y = audio
    if y is None or len(y) == 0:
        return "Audio data is empty.", None

    # Gradio delivers raw PCM (typically int16, possibly stereo). Convert to
    # mono float32 in [-1, 1] before any DSP; a bare astype(np.float32) would
    # leave int16 values in [-32768, 32767] and break the feature extraction.
    if y.ndim > 1:
        y = y.mean(axis=1)
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)

    # Low-pass filter (also serves as anti-aliasing before the resample below).
    y_filt = lowpass_filter(y, sr)

    # Feature extraction
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Wav2Vec2 was trained on 16 kHz audio, so resample explicitly rather
    # than relying on the pipeline to do it.
    y_asr = librosa.resample(y_filt, orig_sr=sr, target_sr=ASR_SAMPLE_RATE)
    text = asr({"array": y_asr, "sampling_rate": ASR_SAMPLE_RATE})["text"]

    # Plots
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)

    return text, fig


# === Gradio UI ===
demo = gr.Interface(
    fn=process,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Feature Visualization"),
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description=(
        "Upload or record audio → low-pass filter → MFCC + Δ + ΔΔ features → "
        "Wav2Vec2 transcription → visual plots"
    ),
)

if __name__ == "__main__":
    demo.launch()
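
# --- Usage note (a sketch; the package names are standard, but versions are
# assumptions since the original pins none) ---
# Install dependencies, then run `python app.py` and open the printed URL:
#   pip install gradio numpy librosa matplotlib scipy transformers torch
#
# Minimal programmatic check of `process` without the UI (e.g. from a REPL).
# The 16 kHz rate and 440 Hz tone are illustrative assumptions: a pure tone
# exercises the DSP and plotting paths but will not yield a meaningful
# transcription.
#
#   import numpy as np
#   from app import process
#   sr = 16000
#   t = np.linspace(0, 1.0, sr, endpoint=False)
#   tone = (0.2 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
#   text, fig = process((sr, tone))
#   print("Transcription:", text)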