File size: 3,720 Bytes
4b71fbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be60578
4b71fbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be60578
4b71fbc
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import numpy as np
import librosa

# NOTE(review): hard-coded secret, compared against user input in
# extract_audio_features — consider loading from an environment variable
# instead of committing it to source.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
# Analysis window size in samples; the hop of frame_length // 4 gives the
# standard 75% overlap between consecutive frames.
FRAME_LENGTH = 1024
HOP_LENGTH = FRAME_LENGTH//4
# Target sample rate — librosa.load resamples every input to this rate.
sr = 22050

# Function to calculate amplitude envelope for each frame
def amplitude_envelope(audio, frame_length, hop_length):
    return np.array([max(audio[i:i+frame_length]) for i in range(0, len(audio), hop_length)])

# Function to extract Short-Time Fourier Transform
def st_fourier_transform(audio, n_fft, hop_length, type="power"):
    """Compute an STFT spectrogram of *audio*, converted to decibels.

    Parameters
    ----------
    audio : np.ndarray
        Mono audio signal.
    n_fft : int
        FFT window size in samples.
    hop_length : int
        Number of samples between successive frames.
    type : str
        Either "power" (default, |STFT|**2) or "amplitude" (|STFT|).
        Kept as ``type`` (shadowing the builtin) for caller compatibility.

    Returns
    -------
    np.ndarray
        dB-scaled spectrogram, referenced to the maximum value.

    Raises
    ------
    ValueError
        If ``type`` is neither "power" nor "amplitude".
    """
    stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    # Magnitude is already non-negative, so no further np.abs() is needed
    # before the dB conversion (the original applied a redundant np.abs()).
    magnitude = np.abs(stft)
    if type == "power":
        return librosa.power_to_db(magnitude ** 2, ref=np.max)
    if type == "amplitude":
        return librosa.amplitude_to_db(magnitude, ref=np.max)
    # Plain string: the original used an f-string with no placeholders.
    raise ValueError("Error: type should be 'power' or 'amplitude'")

def extract_audio_features(file, key):
    """Load an audio file and return a dict of time/frequency-domain features.

    Parameters
    ----------
    file : str
        Path to the audio file (Gradio passes a filepath).
    key : str
        API key; must match the module-level API_KEY.

    Returns
    -------
    dict | str
        Feature name -> list of values on success (lists, not numpy arrays,
        so the gr.JSON output can serialize them), or an error string if
        anything fails during extraction (best-effort contract preserved).

    Raises
    ------
    gr.Error
        If the supplied key does not match API_KEY.
    """
    if key != API_KEY:
        raise gr.Error("Invalid API key.")

    try:
        output = {}
        # Load audio with librosa and resample to the module-level rate.
        # Use a separate name instead of shadowing the `file` path argument.
        y, _ = librosa.load(file, sr=sr)

        # Per-frame peak amplitude.
        output['amplitude_envelope'] = amplitude_envelope(y, FRAME_LENGTH, HOP_LENGTH)

        # Root-mean-square energy per frame.
        output['root_mean_square_energy'] = librosa.feature.rms(
            y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

        # Rate of sign changes per frame.
        output['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(
            y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

        # Short-time Fourier transform (dB power spectrogram).
        # Previously this was computed and then discarded; include it.
        output['short_time_fourier_transform'] = st_fourier_transform(
            y, FRAME_LENGTH, HOP_LENGTH)

        # First 13 MFCCs.
        mfccs = librosa.feature.mfcc(
            y=y, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH, n_mfcc=13)
        output['mfccs'] = mfccs

        # First-order MFCC deltas (frame-to-frame change over time).
        output['delta_mfccs'] = librosa.feature.delta(mfccs)

        # Spectral centroid ("center of mass" of the spectrum per frame).
        output['spectral_centroid'] = librosa.feature.spectral_centroid(
            y=y, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

        # Spectral bandwidth per frame.
        output['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(
            y=y, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

        # Convert numpy arrays to plain Python lists so the gr.JSON
        # component can serialize the result reliably.
        output = {name: feature.tolist() for name, feature in output.items()}
    except Exception as e:
        # Best-effort: surface the error as the result rather than crashing
        # the UI (preserves the original behavior and message text).
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"

    return output

# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    """Build a Blocks UI for one audio source ("microphone" or "upload").

    Stacks an audio input, a JSON results panel, and a password-masked API
    key box; any change to the audio input triggers feature extraction.
    """
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use microphone, upload .wav file.
        """)
        with gr.Column():
            audio_in = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            results_view = gr.JSON(label="Results")
            key_box = gr.Textbox(label="API Key", type="password")
        # Re-run extraction whenever the audio input changes.
        audio_in.change(
            fn=extract_audio_features,
            inputs=[audio_in, key_box],
            outputs=results_view,
        )
    return interface

# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

# Bundle both interfaces into a single tabbed app.
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Audio Feature Extraction",
)

if __name__ == "__main__":
    # Launch only when executed as a script, so importing this module
    # (e.g. from tests or another app) does not start a web server.
    demo.launch()