# Hugging Face Space: audio feature extraction demo (Gradio app).
import gradio as gr
import numpy as np
import librosa

# NOTE(review): hardcoded secret committed to source and compared against
# user input in extract_audio_features — consider loading it from an
# environment variable instead.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"

# Analysis window size in samples, and hop between successive frames
# (frame_length // 4 gives 75% overlap between frames).
FRAME_LENGTH = 1024
HOP_LENGTH = FRAME_LENGTH//4

# Target sample rate all audio is resampled to on load.
sr = 22050
# Function to calculate amplitude envelope for each frame | |
def amplitude_envelope(audio, frame_length, hop_length):
    """Return the amplitude envelope: the peak sample value of each frame.

    Frames start every ``hop_length`` samples; trailing frames may be
    shorter than ``frame_length`` when the signal runs out.
    """
    frame_starts = range(0, len(audio), hop_length)
    peaks = [max(audio[start:start + frame_length]) for start in frame_starts]
    return np.array(peaks)
# Function to extract Short-Time Fourier Transform | |
def st_fourier_transform(audio, n_fft, hop_length, type="power"):
    """Compute a Short-Time Fourier Transform and convert it to decibels.

    Args:
        audio: 1-D signal array.
        n_fft: FFT window size.
        hop_length: hop (in samples) between successive frames.
        type: "power" (|STFT|^2 via power_to_db) or "amplitude"
            (|STFT| via amplitude_to_db). Name kept for backward
            compatibility even though it shadows the builtin.

    Returns:
        2-D array of dB values, referenced to the spectrogram maximum.

    Raises:
        ValueError: if ``type`` is neither "power" nor "amplitude".
    """
    # Validate up front so a bad mode fails before the expensive STFT.
    if type not in ("power", "amplitude"):
        raise ValueError("Error: type should be 'power' or 'amplitude'")
    stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    # Magnitudes are already non-negative — the original wrapped them in a
    # second, redundant np.abs() before the dB conversion.
    magnitude = np.abs(stft)
    if type == "power":
        return librosa.power_to_db(magnitude ** 2, ref=np.max)
    return librosa.amplitude_to_db(magnitude, ref=np.max)
def extract_audio_features(file, key):
    """Extract a bundle of audio features from an uploaded audio file.

    Args:
        file: path to the audio file (Gradio passes a filepath string).
        key: caller-supplied API key; must equal the module-level API_KEY.

    Returns:
        dict mapping feature name -> nested list (JSON-serializable, so
        gr.JSON can render it), or an error string if extraction fails.

    Raises:
        gr.Error: when the API key is invalid.
    """
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        output = {}
        # Load the audio and resample to the module-level rate.
        signal, _ = librosa.load(file, sr=sr)
        # Amplitude envelope (per-frame peak value).
        ae = amplitude_envelope(signal, FRAME_LENGTH, HOP_LENGTH)
        output['amplitude_envelope'] = ae.tolist()
        # Root-mean-square energy per frame.
        rms = librosa.feature.rms(y=signal, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        output['root_mean_square_energy'] = rms.tolist()
        # Zero-crossing rate per frame.
        zcr = librosa.feature.zero_crossing_rate(y=signal, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        output['zero_crossing_rate'] = zcr.tolist()
        # Short-time Fourier transform in dB. The original computed this
        # and then discarded it — it is now included in the result.
        stft = st_fourier_transform(signal, FRAME_LENGTH, HOP_LENGTH)
        output['stft_db'] = stft.tolist()
        # First 13 MFCCs.
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH, n_mfcc=13)
        output['mfccs'] = mfccs.tolist()
        # First-order MFCC deltas (frame-to-frame change over time).
        delta_mfccs = librosa.feature.delta(mfccs)
        output['delta_mfccs'] = delta_mfccs.tolist()
        # Spectral centroid per frame.
        sc = librosa.feature.spectral_centroid(y=signal, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        output['spectral_centroid'] = sc.tolist()
        # Spectral bandwidth per frame.
        sban = librosa.feature.spectral_bandwidth(y=signal, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        output['spectral_bandwidth'] = sban.tolist()
    except Exception as e:
        # Best-effort boundary: surface the failure to the UI as a string
        # instead of crashing the Gradio event handler.
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output
# Create a function to generate a vertically stacked interface | |
def create_transcription_interface(source):
    """Build a vertically stacked Blocks UI for one audio input source.

    Args:
        source: Gradio audio source, e.g. "microphone" or "upload".

    Returns:
        A gr.Blocks interface whose audio widget is wired to
        extract_audio_features.
    """
    with gr.Blocks() as blocks_ui:
        gr.Markdown("""
Use microphone, upload .wav file.
""")
        with gr.Column():
            audio_widget = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            results_view = gr.JSON(label="Results")
            key_box = gr.Textbox(label="API Key", type="password")
        # Re-run feature extraction whenever the audio input changes.
        audio_widget.change(
            fn=extract_audio_features,
            inputs=[audio_widget, key_box],
            outputs=results_view,
        )
    return blocks_ui
# Build one interface per audio source: live microphone capture and
# file upload, each shown as its own tab.
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

# Combine the two interfaces into a single tabbed app and start it.
demo = gr.TabbedInterface(
    interface_list=[mic_transcribe, file_transcribe],
    tab_names=["Microphone Input", "Upload .wav file"],
    title="Audio Feature Extraction",
)
demo.launch()