import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import scipy.signal
from transformers import pipeline


# Load the ASR model once at startup so every request reuses the same pipeline.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
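# Note: wav2vec2-large-960h is an English model trained on 16 kHz LibriSpeech
# audio; process() below resamples inputs to 16 kHz before transcription.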


def lowpass_filter(y, sr, cutoff=7000):
    # 6th-order Butterworth low-pass. Clamping the cutoff below Nyquist keeps
    # the filter design valid even at low sample rates (e.g. 8 kHz input).
    nyq = 0.5 * sr
    norm_cutoff = min(cutoff, 0.95 * nyq) / nyq
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    return scipy.signal.lfilter(b, a, y)
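
# lfilter is causal and introduces phase delay; scipy.signal.filtfilt(b, a, y)
# would give a zero-phase result instead. Either is adequate for this demo.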


def compute_mfcc(y, sr, n_mfcc=13):
    # Pre-emphasis: boost high frequencies to offset the spectral tilt of speech.
    y = scipy.signal.lfilter([1, -0.97], [1], y)

    # Standard speech framing: 25 ms analysis windows with a 10 ms hop.
    hop_length = int(0.010 * sr)
    win_length = int(0.025 * sr)
    n_fft = int(2 ** np.ceil(np.log2(win_length)))  # next power of two >= window

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length,
                                win_length=win_length, n_fft=n_fft)

    # First- and second-order temporal derivatives of the cepstral trajectories.
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)

    # Per-utterance cepstral mean and variance normalization (CMVN).
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
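
# Each returned matrix has shape (n_mfcc, n_frames); with the 10 ms hop that
# is roughly 100 feature frames per second of audio.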


def plot_features(y, sr, mfcc, delta, delta2):
    # delta2 is accepted for interface symmetry but not plotted here.
    fig, axs = plt.subplots(4, 1, figsize=(10, 10))

    # Raw waveform.
    librosa.display.waveshow(y, sr=sr, ax=axs[0])
    axs[0].set(title="Waveform")

    # Mel spectrogram in decibels, referenced to the peak power.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, hop_length=256, x_axis='time', y_axis='mel', ax=axs[1])
    axs[1].set(title="Mel Spectrogram")

    # Normalized MFCCs and their first-order deltas.
    librosa.display.specshow(mfcc, x_axis='time', ax=axs[2])
    axs[2].set(title="MFCC (normalized)")

    librosa.display.specshow(delta, x_axis='time', ax=axs[3])
    axs[3].set(title="Delta features")

    fig.tight_layout()
    return fig
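
# gr.Plot renders the returned Matplotlib figure directly, so no plt.show()
# call is needed on the server side.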


def process(audio):
    if audio is None:
        return "No audio provided.", plt.figure()

    sr, y = audio
    if y is None:
        return "Audio data is empty.", plt.figure()

    # Gradio delivers integer PCM for recorded/uploaded audio; normalize to
    # float32 in [-1, 1] and downmix stereo to mono before any DSP.
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Low-pass filter, then extract MFCC features from the filtered signal.
    y_filt = lowpass_filter(y, sr)
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Wav2Vec2 expects 16 kHz input, so resample before transcription.
    if sr != 16000:
        y_asr = librosa.resample(y_filt, orig_sr=sr, target_sr=16000)
    else:
        y_asr = y_filt
    text = asr({"array": y_asr, "sampling_rate": 16000})["text"]

    fig = plot_features(y_filt, sr, mfcc, delta, delta2)
    return text, fig
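

# A minimal sketch of non-UI usage (_smoke_test is an illustrative helper, not
# part of the app): push one second of a 440 Hz tone through the same pipeline.
# A pure tone should transcribe to little or nothing, which is expected.
def _smoke_test():
    sr = 16000
    t = np.linspace(0, 1, sr, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    text, fig = process((sr, tone))
    print("transcription:", repr(text))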


demo = gr.Interface(
    fn=process,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Feature Visualization"),
    ],
    title="🎧 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → filtering + MFCC+Δ+ΔΔ → transcription with Wav2Vec2 → visual plots",
)
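
# demo.launch(share=True) would additionally expose a temporary public URL;
# by default the app serves on localhost only.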


if __name__ == "__main__":
    demo.launch()