import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
FRAME_LENGTH = 25 # milliseconds
FRAME_SHIFT = 10 # milliseconds
# EnergyVAD accepts various sampling rates, but it seems to work best
# (and avoid drift) at 16000 rather than librosa's default of 22050
sr = 16000
# Generally, a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000 # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10 # in milliseconds
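# With a 10 ms frame shift these thresholds translate to frames as
# duration / FRAME_SHIFT: 1000 ms -> 100 silent frames needed to separate
# codas, and 10 ms -> 1 silent frame needed to separate clicks.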
# Initialize VAD; calling it on audio returns an array of 0's and 1's, where
# 0 = frame with silence and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)
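# A minimal sanity-check sketch (synthetic audio, not part of the app flow;
# assumes numpy, which librosa already depends on): a second of full-scale
# tone followed by a second of silence should yield a run of 1's then 0's.
def _demo_vad_output():
    import numpy as np
    tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s of 440 Hz
    silence = np.zeros(sr, dtype=np.float32)                                # 1 s of silence
    print(vad(np.concatenate([tone, silence])))                             # ~1's, then ~0's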
# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum silence in ms that separates one group of sound from the next
# Returns periods of time with voice activity separated by silences longer than the min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
min_silence_frames = int(min_silence_duration / frame_shift)
groups = []
start_idx = None
silence_counter = 0
for i, frame in enumerate(vad_output):
if frame == 1:
if start_idx is None:
start_idx = i
silence_counter = 0
else:
if start_idx is not None:
silence_counter += 1
if silence_counter >= min_silence_frames:
# Silence is long enough, so close the current voice group
end_idx = i - silence_counter
start_time = start_idx * frame_shift / 1000
end_time = (end_idx + 1) * frame_shift / 1000
groups.append({
'start': round(start_time, 4),
'end': round(end_time, 4)
})
start_idx = None
silence_counter = 0
# Handle case where audio ends with voice activity
if start_idx is not None:
        end_time = len(vad_output) * frame_shift / 1000  # last voiced frame ends len(vad_output) frames in
groups.append({
'start': round(start_idx * frame_shift / 1000, 4),
'end': round(end_time, 4)
})
return groups
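# A tiny worked example (toy frame flags, not whale data): with a 10 ms shift
# and a 20 ms minimum silence, two voiced runs separated by >= 2 silent frames
# come back as two groups.
def _demo_voice_activity_timestamps():
    flags = [1, 1, 0, 0, 0, 1, 1, 0, 0, 0]
    groups = get_voice_activity_timestamps(vad_output=flags, frame_shift=10, min_silence_duration=20)
    assert groups == [{'start': 0.0, 'end': 0.02}, {'start': 0.05, 'end': 0.07}]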
# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, which is the minimum time of silence in ms that will be included
# Returns timestamps for silences longer than the min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
min_silence_frames = int(min_silence_duration / frame_shift)
groups = []
start_idx = None
for i, frame in enumerate(vad_output):
        if frame == 0:
if start_idx is None:
start_idx = i
else:
if start_idx is not None:
end_idx = i
duration = end_idx - start_idx
if duration >= min_silence_frames:
start_time = start_idx * frame_shift / 1000
end_time = end_idx * frame_shift / 1000
groups.append({
'start': round(start_time, 2),
'end': round(end_time, 2)
})
start_idx = None
# Handle case where the last segment goes to the end
if start_idx is not None:
end_idx = len(vad_output)
duration = end_idx - start_idx
if duration >= min_silence_frames:
start_time = start_idx * frame_shift / 1000
end_time = end_idx * frame_shift / 1000
groups.append({
'start': round(start_time, 2),
'end': round(end_time, 2)
})
return groups
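# Worked example (toy frame flags): the three-frame gap between the two voiced
# frames meets the 2-frame minimum (20 ms / 10 ms), so it is reported as a silence.
def _demo_timestamps_silences():
    silences = get_timestamps_silences(vad_output=[1, 0, 0, 0, 1], frame_shift=10, min_silence_duration=20)
    assert silences == [{'start': 0.01, 'end': 0.04}]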
# Function to split an audio into an array of individual audios by timestamp.
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
def split_audio_by_timestamps(audio, timestamps, sr):
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr)  # convert start time into a sample index
        end_sample = int(float(ts['end']) * sr)      # convert end time into a sample index
        segment = audio[start_sample:end_sample]     # extract the segment by sample index
        audio_array.append(segment)
    return audio_array
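# Sanity-check sketch (synthetic array; numpy comes with librosa): at
# sr = 16000, the timestamp {'start': 0.5, 'end': 1.0} maps to samples
# 8000:16000, i.e. an 8000-sample segment.
def _demo_split_audio():
    import numpy as np
    two_seconds = np.zeros(2 * sr, dtype=np.float32)
    segments = split_audio_by_timestamps(two_seconds, [{'start': 0.5, 'end': 1.0}], sr)
    assert len(segments) == 1 and len(segments[0]) == 8000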
# Convert timestamps into durations
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
# Returns a list of durations in seconds
def convert_timestamps_to_durations(timestamps):
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations
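# Worked example (toy values): each timestamp pair collapses to end - start,
# rounded to 4 decimal places.
def _demo_durations():
    ts = [{'start': 0.05, 'end': 0.06}, {'start': 0.2, 'end': 0.45}]
    assert convert_timestamps_to_durations(ts) == [0.01, 0.25]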
# Function to extract features from sperm whale codas and identify 1+1+3 codas
def transcribe_whalish(file, key):
if key != API_KEY:
raise gr.Error("Invalid API key.")
try:
        # Load the audio with librosa and resample to 16 kHz
        audio, _ = librosa.load(file, sr=sr)
        # Run VAD over the whole file
        vad_output = vad(audio)
# Get timestamps for codas
codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # Split the original audio into an array of clips, one per coda,
        # trimming the silence at the beginning and end of each
        coda_audios = split_audio_by_timestamps(audio, codas, sr)
# Get timestamps for individual clicks throughout file
clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
# Get timestamps for silences between codas
inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # For each coda, take the clip at the same position in coda_audios,
        # extract features from it, then save them onto the same entry in codas
for i, coda_audio in enumerate(coda_audios):
# get vad_output for each coda
vad_output_for_coda = vad(coda_audio)
codas[i]['vad'] = vad_output_for_coda
# Get the timestamps for clicks inside each coda
coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
codas[i]['click_timestamps'] = coda_clicks
# Use timestamps of the clicks to find the total number of clicks
number_of_clicks = len(coda_clicks)
codas[i]['number_of_clicks'] = number_of_clicks
# Use timestamps of the clicks to find total duration of the coda (time from beginning of first click
# to end of last click)
duration = 0
if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
codas[i]['duration'] = duration
# Use VAD output to extract timestamps of the silences
coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
codas[i]['inter_click_intervals'] = coda_inter_click_intervals
# Get the inter-click-interval durations in seconds
inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
codas[i]['inter_click_interval_durations'] = inter_click_interval_durations
            # Check whether the coda matches the 1+1+3 rhythm: two long
            # inter-click intervals, then three rapid clicks
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
codas[i]['content'] = '1+1+3'
else:
codas[i]['content'] = ''
        # Convert each coda's inter-click-interval timestamps, which are relative
        # to the start of the coda, into timestamps relative to the whole file
        inter_click_intervals = []
        for coda in codas:
            for interval in coda['inter_click_intervals']:
                inter_click_intervals.append({
                    'start': coda['start'] + interval['start'],
                    'end': coda['start'] + interval['end']
                })
output = {
'vad': vad_output,
'codas': codas,
'clicks': clicks,
'inter_coda_intervals': inter_coda_intervals,
'inter_click_intervals': inter_click_intervals
}
except Exception as e:
print(f"An error occurred: {e}")
output = f"An error occurred: {e}"
return output
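# Shape of a successful result (illustrative, with hypothetical values):
# {
#     'vad': [1, 1, 0, ...],                                  # frame flags for the whole file
#     'codas': [{'start': 0.5, 'end': 1.6, 'number_of_clicks': 5,
#                'duration': 1.1, 'content': '1+1+3', ...}],
#     'clicks': [{'start': 0.5, 'end': 0.52}, ...],
#     'inter_coda_intervals': [{'start': 1.6, 'end': 3.1}, ...],
#     'inter_click_intervals': [{'start': 0.52, 'end': 0.8}, ...]
# }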
examples = [['spermwhale_dominica.wav']]
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
with gr.Blocks() as interface:
gr.Markdown("""
Use microphone, upload .wav file, or choose an example below.
""")
with gr.Column():
audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
output = gr.JSON(label="Results")
api_key_input = gr.Textbox(label="API Key", type="password")
audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
gr.Examples(examples=examples, inputs=[audio_input])
return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
demo = gr.TabbedInterface(
[mic_transcribe, file_transcribe],
["Microphone Input", "Upload .wav file"],
title="Transcribe Sperm Whalish 1+1+3 Coda",
)
demo.launch()