Spaces:
Sleeping
Sleeping
File size: 3,720 Bytes
4b71fbc be60578 4b71fbc be60578 4b71fbc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import gradio as gr
import numpy as np
import librosa
# NOTE(review): hard-coded secret committed to source — this should be read
# from an environment variable / Space secret, and the key rotated.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
# Analysis frame size in samples, shared by all frame-based features below.
FRAME_LENGTH = 1024
# Hop between successive frames (75% frame overlap).
HOP_LENGTH = FRAME_LENGTH//4
# Target sampling rate (Hz) used when loading audio with librosa.load.
sr = 22050
# Function to calculate amplitude envelope for each frame
def amplitude_envelope(audio, frame_length, hop_length):
    """Return the amplitude envelope: the maximum sample value of each frame.

    Parameters
    ----------
    audio : np.ndarray
        1-D audio signal.
    frame_length : int
        Samples per analysis frame; the final frame may be shorter when the
        signal length is not a multiple of ``hop_length``.
    hop_length : int
        Step in samples between consecutive frame starts.

    Returns
    -------
    np.ndarray
        One maximum per frame, ``ceil(len(audio) / hop_length)`` values.
    """
    # np.max reduces each frame in C instead of the Python builtin max
    # iterating element-by-element over a numpy slice. Slices are never
    # empty because every start index is < len(audio).
    return np.array([np.max(audio[start:start + frame_length])
                     for start in range(0, len(audio), hop_length)])
# Function to extract Short-Time Fourier Transform
def st_fourier_transform(audio, n_fft, hop_length, type="power"):
    """Compute a decibel-scaled spectrogram via the short-time Fourier transform.

    Parameters
    ----------
    audio : np.ndarray
        1-D audio signal.
    n_fft : int
        FFT window size passed to ``librosa.stft``.
    hop_length : int
        Hop between successive STFT frames.
    type : str
        ``"power"`` (squared magnitude, default) or ``"amplitude"``.

    Returns
    -------
    np.ndarray
        Spectrogram in dB, referenced to the spectrogram's own peak.

    Raises
    ------
    ValueError
        If ``type`` is neither ``"power"`` nor ``"amplitude"``.
    """
    stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(stft)
    if type == "power":
        # Squared magnitude -> power spectrogram. The original wrapped this
        # in a second np.abs, which is a no-op on non-negative values.
        return librosa.power_to_db(magnitude ** 2, ref=np.max)
    if type == "amplitude":
        return librosa.amplitude_to_db(magnitude, ref=np.max)
    # No placeholders, so a plain string (the original used a needless f-string
    # producing the identical message).
    raise ValueError("Error: type should be 'power' or 'amplitude'")
def extract_audio_features(file, key):
    """Extract a bundle of classic audio features from an audio file.

    Parameters
    ----------
    file : str
        Path to the audio file (the gradio Audio component supplies a filepath).
    key : str
        Caller-supplied API key; must equal the module-level ``API_KEY``.

    Returns
    -------
    dict | str
        Mapping of feature name to a JSON-serializable list of values, or an
        error string if feature extraction failed.

    Raises
    ------
    gr.Error
        If the API key does not match (raised before any processing).
    """
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        output = {}
        # Load and resample to the module-wide sampling rate. Bind to a new
        # name instead of rebinding the `file` path parameter.
        signal, _ = librosa.load(file, sr=sr)
        # .tolist() everywhere: numpy arrays are not JSON-serializable, and
        # this dict feeds a gr.JSON component.
        output['amplitude_envelope'] = amplitude_envelope(
            signal, FRAME_LENGTH, HOP_LENGTH).tolist()
        output['root_mean_square_energy'] = librosa.feature.rms(
            y=signal, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0].tolist()
        output['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(
            y=signal, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0].tolist()
        # dB power spectrogram. The original computed this and then silently
        # discarded it; include it in the results.
        output['stft'] = st_fourier_transform(
            signal, FRAME_LENGTH, HOP_LENGTH).tolist()
        # First 13 MFCCs, kept as an array so the deltas reuse it below.
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_fft=FRAME_LENGTH,
                                     hop_length=HOP_LENGTH, n_mfcc=13)
        output['mfccs'] = mfccs.tolist()
        # First-order MFCC derivatives (frame-to-frame rate of change).
        output['delta_mfccs'] = librosa.feature.delta(mfccs).tolist()
        output['spectral_centroid'] = librosa.feature.spectral_centroid(
            y=signal, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0].tolist()
        output['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(
            y=signal, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0].tolist()
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing
        # the gradio event handler.
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    """Assemble a vertically stacked Blocks UI for one audio input source.

    Parameters
    ----------
    source : str
        Audio source for the gr.Audio component, e.g. "microphone" or "upload".

    Returns
    -------
    gr.Blocks
        The assembled interface, ready to be placed in a TabbedInterface.
    """
    with gr.Blocks() as ui:
        gr.Markdown("""
Use microphone, upload .wav file.
""")
        with gr.Column():
            audio_in = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            results = gr.JSON(label="Results")
            key_box = gr.Textbox(label="API Key", type="password")
        # Re-run feature extraction whenever a new recording/file arrives.
        audio_in.change(fn=extract_audio_features,
                        inputs=[audio_in, key_box],
                        outputs=results)
    return ui
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
# Wrap both in a tabbed layout: one tab per input source.
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Audio Feature Extraction",
)
# NOTE(review): `demo` looks like the app entry point expected by the hosting
# platform (Hugging Face Spaces convention) — confirm before renaming.
demo.launch()