"""Gradio app that extracts time- and frequency-domain audio features with librosa."""

import gradio as gr
import librosa
import numpy as np

# NOTE(review): hard-coded secret committed in source — move to an environment
# variable / secrets store before deploying.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
FRAME_LENGTH = 1024
HOP_LENGTH = FRAME_LENGTH // 4
sr = 22050  # target sampling rate passed to librosa.load


def amplitude_envelope(audio, frame_length, hop_length):
    """Return the amplitude envelope: the maximum sample value of each frame.

    Args:
        audio: 1-D waveform array.
        frame_length: samples per analysis frame.
        hop_length: samples advanced between consecutive frames.
    """
    # The final frames may be shorter than frame_length; max() handles that.
    return np.array(
        [max(audio[i:i + frame_length]) for i in range(0, len(audio), hop_length)]
    )


def st_fourier_transform(audio, n_fft, hop_length, type="power"):
    """Return the Short-Time Fourier Transform magnitude in decibels.

    Args:
        audio: 1-D waveform array.
        n_fft: FFT window size.
        hop_length: samples between successive frames.
        type: 'power' for a power spectrogram, 'amplitude' for an
            amplitude spectrogram.

    Raises:
        ValueError: if ``type`` is neither 'power' nor 'amplitude'.
    """
    stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(stft)
    if type == "power":
        # |STFT|^2 is already non-negative — no second abs() needed.
        return librosa.power_to_db(magnitude ** 2, ref=np.max)
    if type == "amplitude":
        return librosa.amplitude_to_db(magnitude, ref=np.max)
    raise ValueError("Error: type should be 'power' or 'amplitude'")


def extract_audio_features(file, key):
    """Extract a dictionary of audio features from an audio file.

    Args:
        file: path to the audio file (Gradio passes a filepath).
        key: API key; must match ``API_KEY``.

    Returns:
        dict mapping feature names to arrays, or an error string if
        feature extraction fails (best-effort for the UI).

    Raises:
        gr.Error: if the supplied API key is invalid.
    """
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        output = {}
        # Load audio and resample to the module-level rate.
        audio, _ = librosa.load(file, sr=sr)
        # Amplitude envelope (per-frame peak).
        output['amplitude_envelope'] = amplitude_envelope(
            audio, FRAME_LENGTH, HOP_LENGTH)
        # Root-mean-square energy per frame.
        output['root_mean_square_energy'] = librosa.feature.rms(
            y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # Zero-crossing rate per frame.
        output['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(
            y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # NOTE(review): the STFT is computed but never added to `output` —
        # confirm whether a 'spectrogram' entry was intended here.
        st_fourier_transform(audio, FRAME_LENGTH, HOP_LENGTH)
        # First 13 MFCCs.
        mfccs = librosa.feature.mfcc(
            y=audio, sr=sr, n_fft=FRAME_LENGTH,
            hop_length=HOP_LENGTH, n_mfcc=13)
        output['mfccs'] = mfccs
        # First-order MFCC deltas (frame-to-frame change over time).
        output['delta_mfccs'] = librosa.feature.delta(mfccs)
        # Spectral centroid per frame.
        output['spectral_centroid'] = librosa.feature.spectral_centroid(
            y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # Spectral bandwidth per frame.
        output['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(
            y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    except Exception as e:
        # Best-effort: surface the error in the JSON panel instead of crashing.
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output


def create_transcription_interface(source):
    """Build a vertically stacked Blocks UI for one audio input source.

    Args:
        source: Gradio Audio source, e.g. "microphone" or "upload".
    """
    with gr.Blocks() as interface:
        gr.Markdown("""
    Use microphone, upload .wav file.
    """)
        with gr.Column():
            audio_input = gr.Audio(
                sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        # Re-run feature extraction whenever the audio input changes.
        audio_input.change(
            fn=extract_audio_features,
            inputs=[audio_input, api_key_input],
            outputs=output)
    return interface


# One tab per input source: live microphone and file upload.
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Audio Feature Extraction",
)

if __name__ == "__main__":
    demo.launch()