# app.py
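#
# Gradio demo: low-pass filtering, MFCC/Δ/ΔΔ feature extraction, and
# Wav2Vec2 transcription, with matplotlib visualizations.
#
# Assumed (unpinned) dependencies: gradio, numpy, scipy, librosa,
# matplotlib, transformers, torch.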

import gradio as gr
import numpy as np
import librosa
import librosa.display
import matplotlib
matplotlib.use("Agg")  # headless backend; figures are rendered server-side
import matplotlib.pyplot as plt
import scipy.signal
from transformers import pipeline

# Load the ASR model once at startup. Wav2Vec2 large (trained on 960 h of
# LibriSpeech) expects mono 16 kHz input.
TARGET_SR = 16000
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
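# For long recordings, the transformers ASR pipeline also supports chunked
# inference (e.g. pipeline(..., chunk_length_s=30)); defaults are used here.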

# === DSP & feature extraction functions ===

def lowpass_filter(y, sr, cutoff=7000):
    # 6th-order Butterworth low-pass. Keep the normalized cutoff strictly
    # below 1.0 so scipy.signal.butter does not raise at low sample rates
    # (e.g. 8 kHz, where the Nyquist frequency is only 4 kHz).
    nyq = 0.5 * sr
    norm_cutoff = min(cutoff / nyq, 0.99)
    b, a = scipy.signal.butter(6, norm_cutoff, btype='low', analog=False)
    return scipy.signal.lfilter(b, a, y)
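
# Note: lfilter is causal and introduces phase delay; scipy.signal.filtfilt(b, a, y)
# would give zero-phase output at roughly twice the cost, if phase alignment
# matters for the visualizations.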

def compute_mfcc(y, sr, n_mfcc=13):
    # Pre-emphasis to boost high frequencies
    y = scipy.signal.lfilter([1.0, -0.97], [1.0], y)
    # 25 ms windows with a 10 ms hop; n_fft is the next power of two at or
    # above the window length (a fixed n_fft=512 breaks for sr > 20 kHz,
    # since librosa requires win_length <= n_fft)
    hop_length = int(0.010 * sr)
    win_length = int(0.025 * sr)
    n_fft = int(2 ** np.ceil(np.log2(win_length)))
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                hop_length=hop_length, win_length=win_length,
                                n_fft=n_fft)
    # Δ and ΔΔ (first- and second-order frame-to-frame derivatives)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Per-coefficient mean & variance normalization (CMVN)
    mean = np.mean(mfcc, axis=1, keepdims=True)
    std = np.std(mfcc, axis=1, keepdims=True)
    mfcc_norm = (mfcc - mean) / (std + 1e-6)
    return mfcc_norm, delta, delta2
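
# The three matrices are commonly stacked into a single (3 * n_mfcc, T)
# array, the classic 39-dimensional MFCC front end. A minimal helper in
# case a downstream model needs one matrix (not used by the demo UI):
def stack_features(mfcc, delta, delta2):
    return np.vstack([mfcc, delta, delta2])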

def plot_features(y, sr, mfcc, delta, delta2):
    fig, axs = plt.subplots(5, 1, figsize=(10, 12))
    # Waveform
    axs[0].set(title="Waveform")
    librosa.display.waveshow(y, sr=sr, ax=axs[0])
    # Mel spectrogram in dB
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512, hop_length=256)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, hop_length=256, x_axis='time', y_axis='mel', ax=axs[1])
    axs[1].set(title="Mel Spectrogram")
    # MFCC
    librosa.display.specshow(mfcc, x_axis='time', ax=axs[2])
    axs[2].set(title="MFCC (normalized)")
    # Δ
    librosa.display.specshow(delta, x_axis='time', ax=axs[3])
    axs[3].set(title="Delta (Δ) features")
    # ΔΔ
    librosa.display.specshow(delta2, x_axis='time', ax=axs[4])
    axs[4].set(title="Delta-delta (ΔΔ) features")
    plt.tight_layout()
    return fig

def process(audio):
    if audio is None:
        return "No audio provided.", plt.figure()

    sr, y = audio
    if y is None or y.size == 0:
        return "Audio data is empty.", plt.figure()

    # Gradio delivers integer PCM (e.g. int16) for recordings and may give
    # stereo for uploads; convert to mono float32 in [-1, 1].
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Low-pass filter
    y_filt = lowpass_filter(y, sr)

    # Compute features
    mfcc, delta, delta2 = compute_mfcc(y_filt, sr)

    # Run ASR; Wav2Vec2 expects 16 kHz input, so resample if needed
    if sr != TARGET_SR:
        y_asr = librosa.resample(y_filt, orig_sr=sr, target_sr=TARGET_SR)
    else:
        y_asr = y_filt
    text = asr({"array": y_asr, "sampling_rate": TARGET_SR})["text"]

    # Plot
    fig = plot_features(y_filt, sr, mfcc, delta, delta2)

    return text, fig

# === Gradio UI ===

demo = gr.Interface(
    fn=process,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Plot(label="Features Visualization")
    ],
    title="🧠 Advanced Speech AI Demo with Wav2Vec2",
    description="Upload or record audio → low-pass filter + MFCC+Δ+ΔΔ → transcription with Wav2Vec2 → visual plots"
)
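
# Headless sanity check (no UI), e.g. from a Python REPL; a sketch with a
# synthetic 440 Hz tone (real speech is needed for a meaningful transcript):
#   sr = 16000
#   t = np.linspace(0, 1, sr, endpoint=False)
#   text, fig = process((sr, (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)))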

if __name__ == "__main__":
    demo.launch()