import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List

API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"

FRAME_LENGTH = 25  # milliseconds
FRAME_SHIFT = 10   # milliseconds

# EnergyVAD accepts different sampling rates but seems to work best and avoid
# drift with 16000 rather than the Librosa default of 22050
sr = 16000

# Generally a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000  # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10   # in milliseconds

# Initialize the VAD; calling it returns an array of 0's and 1's, where 0 = frame with silence
# and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)


# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum length of silence in ms that separates one group of sound from the next
# Returns periods of voice activity separated by silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None
    silence_counter = 0
    for i, frame in enumerate(vad_output):
        if frame == 1:
            if start_idx is None:
                start_idx = i
            silence_counter = 0
        else:
            if start_idx is not None:
                silence_counter += 1
                if silence_counter >= min_silence_frames:
                    # Silence is long enough, so close the current voice group
                    end_idx = i - silence_counter
                    start_time = start_idx * frame_shift / 1000
                    end_time = (end_idx + 1) * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 4),
                        'end': round(end_time, 4)
                    })
                    start_idx = None
                    silence_counter = 0
    # Handle case where the audio ends with voice activity
    if start_idx is not None:
        end_time = len(vad_output) * frame_shift / 1000
        groups.append({
            'start': round(start_idx * frame_shift / 1000, 4),
            'end': round(end_time, 4)
        })
    return groups


# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum length of silence in ms that will be included
# Returns timestamps for silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None
    for i, frame in enumerate(vad_output):
        if frame == 0:
            if start_idx is None:
                start_idx = i
        else:
            if start_idx is not None:
                end_idx = i
                duration = end_idx - start_idx
                if duration >= min_silence_frames:
                    start_time = start_idx * frame_shift / 1000
                    end_time = end_idx * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 2),
                        'end': round(end_time, 2)
                    })
                start_idx = None
    # Handle case where the last silence runs to the end of the audio
    if start_idx is not None:
        end_idx = len(vad_output)
        duration = end_idx - start_idx
        if duration >= min_silence_frames:
            start_time = start_idx * frame_shift / 1000
            end_time = end_idx * frame_shift / 1000
            groups.append({
                'start': round(start_time, 2),
                'end': round(end_time, 2)
            })
    return groups
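
# Worked example (illustrative only; not called by the app): with a 10 ms frame shift
# and a 30 ms minimum silence, a toy frame sequence
#   get_voice_activity_timestamps(vad_output=[1, 1, 0, 0, 0, 0, 1, 1, 1],
#                                 frame_shift=10, min_silence_duration=30)
# groups the first two active frames and the last three into
#   [{'start': 0.0, 'end': 0.02}, {'start': 0.06, 'end': 0.09}]
# while get_timestamps_silences with the same arguments returns the gap between them:
#   [{'start': 0.02, 'end': 0.06}]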

# Function to split an audio signal into an array of individual audio segments by timestamp.
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
def splitAudioByTimestamps(audio, timestamps, sr):
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr)  # convert start time into a sample index
        end_sample = int(float(ts['end']) * sr)      # convert end time into a sample index
        segment = audio[start_sample:end_sample]     # extract the segment between the start and end samples
        audio_array.append(segment)                  # append the segment to the array
    return audio_array


# Convert timestamps in format {'start': time in seconds, 'end': time in seconds}
# into a list of durations in seconds
def convert_timestamps_to_durations(timestamps):
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations
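
# Worked example (illustrative only; not called by the app): with sr = 16000,
#   splitAudioByTimestamps(audio, [{'start': 0.02, 'end': 0.06}], 16000)
# returns a single segment covering samples 320..960 of `audio`, and
#   convert_timestamps_to_durations([{'start': 0.02, 'end': 0.06}])
# returns [0.04].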

# Function to extract features from sperm whale codas and identify 1+1+3 codas
def transcribe_whalish(file, key):
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        # Load the audio with librosa.load and resample it
        file, _ = librosa.load(file, sr=sr)

        # Get the VAD output for the file
        vad_output = vad(file)

        # Get timestamps for codas
        codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # Split the original audio into an array of segments, one per coda, trimming the silence at the beginning and end
        coda_audios = splitAudioByTimestamps(file, codas, sr)

        # Get timestamps for individual clicks throughout the file
        clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)

        # Get timestamps for silences between codas
        inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # For each coda, look at the audio in coda_audios at the same position in the array,
        # extract features about it, and save that info back onto the coda entry
        for i, coda_audio in enumerate(coda_audios):
            # Get the VAD output for this coda
            vad_output_for_coda = vad(coda_audio)
            codas[i]['vad'] = vad_output_for_coda

            # Get the timestamps for clicks inside the coda
            coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['click_timestamps'] = coda_clicks

            # Use the click timestamps to find the total number of clicks
            number_of_clicks = len(coda_clicks)
            codas[i]['number_of_clicks'] = number_of_clicks

            # Use the click timestamps to find the total duration of the coda
            # (time from the beginning of the first click to the end of the last click)
            duration = 0
            if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
            codas[i]['duration'] = duration

            # Use the VAD output to extract timestamps of the silences between clicks
            coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['inter_click_intervals'] = coda_inter_click_intervals

            # Get the inter-click-interval durations in seconds
            inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
            codas[i]['inter_click_interval_durations'] = inter_click_interval_durations

            # Check whether the coda conforms to 1+1+3 using a simple formula
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
                codas[i]['content'] = '1+1+3'
            else:
                codas[i]['content'] = ''

        # Calculate timestamps for the inter-click intervals inside the codas,
        # but expressed relative to the entire file
        inter_click_intervals = []
        for coda in codas:
            for inter_click_interval in coda['inter_click_intervals']:
                new_interval = {'start': coda['start'] + inter_click_interval['start'],
                                'end': coda['start'] + inter_click_interval['end']}
                inter_click_intervals.append(new_interval)

        output = {
            'vad': vad_output,
            'codas': codas,
            'clicks': clicks,
            'inter_coda_intervals': inter_coda_intervals,
            'inter_click_intervals': inter_click_intervals
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output


examples = [['spermwhale_dominica.wav']]


# Function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below.
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface


# Create two interfaces (one for the microphone, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Transcribe Sperm Whalish 1+1+3 Coda",
)

demo.launch()
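
# Quick manual check without the UI (a sketch; assumes spermwhale_dominica.wav from
# the Examples above sits next to this script):
#   result = transcribe_whalish("spermwhale_dominica.wav", API_KEY)
# The returned dict has the keys 'vad', 'codas', 'clicks', 'inter_coda_intervals',
# and 'inter_click_intervals'; each entry in 'codas' carries 'content' == '1+1+3'
# when its click pattern matches.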