import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
FRAME_LENGTH = 25 # milliseconds
FRAME_SHIFT = 10 # milliseconds
# EnergyVAD accepts various sampling rates, but it seems to work best
# (and avoid drift) at 16000 rather than librosa's default of 22050
sr = 16000
# Generally, a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000 # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10 # in milliseconds
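# With a 10 ms frame shift these thresholds translate to frames as
# duration / FRAME_SHIFT: 1000 ms -> 100 silent frames needed to separate
# codas, and 10 ms -> 1 silent frame needed to separate clicks.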
# Initialize VAD; calling it on audio returns an array of 0's and 1's, where
# 0 = frame with silence and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)
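# A minimal sanity-check sketch (synthetic audio, not part of the app flow;
# assumes numpy, which librosa already depends on): a second of full-scale
# tone followed by a second of silence should yield a run of 1's then 0's.
def _demo_vad_output():
    import numpy as np
    tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s of 440 Hz
    silence = np.zeros(sr, dtype=np.float32)                                # 1 s of silence
    print(vad(np.concatenate([tone, silence])))                             # ~1's, then ~0's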
# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum silence in ms that separates one group of sound from the next
# Returns periods of time with voice activity separated by silences longer than the min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
min_silence_frames = int(min_silence_duration / frame_shift)
groups = []
start_idx = None
silence_counter = 0
for i, frame in enumerate(vad_output):
if frame == 1:
if start_idx is None:
start_idx = i
silence_counter = 0
else:
if start_idx is not None:
silence_counter += 1
if silence_counter >= min_silence_frames:
# Silence is long enough, so close the current voice group
end_idx = i - silence_counter
start_time = start_idx * frame_shift / 1000
end_time = (end_idx + 1) * frame_shift / 1000
groups.append({
'start': round(start_time, 4),
'end': round(end_time, 4)
})
start_idx = None
silence_counter = 0
# Handle case where audio ends with voice activity
if start_idx is not None:
        end_time = len(vad_output) * frame_shift / 1000  # last voiced frame ends len(vad_output) frames in
groups.append({
'start': round(start_idx * frame_shift / 1000, 4),
'end': round(end_time, 4)
})
return groups
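# A tiny worked example (toy frame flags, not whale data): with a 10 ms shift
# and a 20 ms minimum silence, two voiced runs separated by >= 2 silent frames
# come back as two groups.
def _demo_voice_activity_timestamps():
    flags = [1, 1, 0, 0, 0, 1, 1, 0, 0, 0]
    groups = get_voice_activity_timestamps(vad_output=flags, frame_shift=10, min_silence_duration=20)
    assert groups == [{'start': 0.0, 'end': 0.02}, {'start': 0.05, 'end': 0.07}]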
# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, which is the minimum time of silence in ms that will be included
# Returns timestamps for silences longer than the min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
min_silence_frames = int(min_silence_duration / frame_shift)
groups = []
start_idx = None
for i, frame in enumerate(vad_output):
        if frame == 0:
if start_idx is None:
start_idx = i
else:
if start_idx is not None:
end_idx = i
duration = end_idx - start_idx
if duration >= min_silence_frames:
start_time = start_idx * frame_shift / 1000
end_time = end_idx * frame_shift / 1000
groups.append({
'start': round(start_time, 2),
'end': round(end_time, 2)
})
start_idx = None
# Handle case where the last segment goes to the end
if start_idx is not None:
end_idx = len(vad_output)
duration = end_idx - start_idx
if duration >= min_silence_frames:
start_time = start_idx * frame_shift / 1000
end_time = end_idx * frame_shift / 1000
groups.append({
'start': round(start_time, 2),
'end': round(end_time, 2)
})
return groups
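# Worked example (toy frame flags): the three-frame gap between the two voiced
# frames meets the 2-frame minimum (20 ms / 10 ms), so it is reported as a silence.
def _demo_timestamps_silences():
    silences = get_timestamps_silences(vad_output=[1, 0, 0, 0, 1], frame_shift=10, min_silence_duration=20)
    assert silences == [{'start': 0.01, 'end': 0.04}]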
# Function to split an audio into an array of individual audios by timestamp.
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
def split_audio_by_timestamps(audio, timestamps, sr):
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr)  # convert start time into a sample index
        end_sample = int(float(ts['end']) * sr)      # convert end time into a sample index
        segment = audio[start_sample:end_sample]     # extract the segment by sample index
        audio_array.append(segment)
    return audio_array
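# Sanity-check sketch (synthetic array; numpy comes with librosa): at
# sr = 16000, the timestamp {'start': 0.5, 'end': 1.0} maps to samples
# 8000:16000, i.e. an 8000-sample segment.
def _demo_split_audio():
    import numpy as np
    two_seconds = np.zeros(2 * sr, dtype=np.float32)
    segments = split_audio_by_timestamps(two_seconds, [{'start': 0.5, 'end': 1.0}], sr)
    assert len(segments) == 1 and len(segments[0]) == 8000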
# Convert timestamps into durations
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
# Returns a list of durations in seconds
def convert_timestamps_to_durations(timestamps):
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations
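# Worked example (toy values): each timestamp pair collapses to end - start,
# rounded to 4 decimal places.
def _demo_durations():
    ts = [{'start': 0.05, 'end': 0.06}, {'start': 0.2, 'end': 0.45}]
    assert convert_timestamps_to_durations(ts) == [0.01, 0.25]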
# Function to extract features from sperm whale codas and identify 1+1+3 codas
def transcribe_whalish(file, key):
if key != API_KEY:
raise gr.Error("Invalid API key.")
try:
        # Load the audio with librosa and resample to 16 kHz
        audio, _ = librosa.load(file, sr=sr)
        # Run VAD over the whole file
        vad_output = vad(audio)
# Get timestamps for codas
codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # Split the original audio into an array of clips, one per coda,
        # trimming the silence at the beginning and end of each
        coda_audios = split_audio_by_timestamps(audio, codas, sr)
# Get timestamps for individual clicks throughout file
clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
# Get timestamps for silences between codas
inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # For each coda, take the clip at the same position in coda_audios,
        # extract features from it, then save them onto the same entry in codas
for i, coda_audio in enumerate(coda_audios):
# get vad_output for each coda
vad_output_for_coda = vad(coda_audio)
codas[i]['vad'] = vad_output_for_coda
# Get the timestamps for clicks inside each coda
coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
codas[i]['click_timestamps'] = coda_clicks
# Use timestamps of the clicks to find the total number of clicks
number_of_clicks = len(coda_clicks)
codas[i]['number_of_clicks'] = number_of_clicks
# Use timestamps of the clicks to find total duration of the coda (time from beginning of first click
# to end of last click)
duration = 0
if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
codas[i]['duration'] = duration
# Use VAD output to extract timestamps of the silences
coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
codas[i]['inter_click_intervals'] = coda_inter_click_intervals
# Get the inter-click-interval durations in seconds
inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
codas[i]['inter_click_interval_durations'] = inter_click_interval_durations
            # Check whether the coda matches the 1+1+3 rhythm: two long
            # inter-click intervals, then three rapid clicks
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
codas[i]['content'] = '1+1+3'
else:
codas[i]['content'] = ''
        # Convert each coda's inter-click-interval timestamps, which are relative
        # to the start of the coda, into timestamps relative to the whole file
        inter_click_intervals = []
        for coda in codas:
            for interval in coda['inter_click_intervals']:
                inter_click_intervals.append({
                    'start': coda['start'] + interval['start'],
                    'end': coda['start'] + interval['end']
                })
output = {
'vad': vad_output,
'codas': codas,
'clicks': clicks,
'inter_coda_intervals': inter_coda_intervals,
'inter_click_intervals': inter_click_intervals
}
except Exception as e:
print(f"An error occurred: {e}")
output = f"An error occurred: {e}"
return output
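# Shape of a successful result (illustrative, with hypothetical values):
# {
#     'vad': [1, 1, 0, ...],                                  # frame flags for the whole file
#     'codas': [{'start': 0.5, 'end': 1.6, 'number_of_clicks': 5,
#                'duration': 1.1, 'content': '1+1+3', ...}],
#     'clicks': [{'start': 0.5, 'end': 0.52}, ...],
#     'inter_coda_intervals': [{'start': 1.6, 'end': 3.1}, ...],
#     'inter_click_intervals': [{'start': 0.52, 'end': 0.8}, ...]
# }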
examples = [['spermwhale_dominica.wav']]
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
with gr.Blocks() as interface:
gr.Markdown("""
Use microphone, upload .wav file, or choose an example below.
""")
with gr.Column():
audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
output = gr.JSON(label="Results")
api_key_input = gr.Textbox(label="API Key", type="password")
audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
gr.Examples(examples=examples, inputs=[audio_input])
return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
demo = gr.TabbedInterface(
[mic_transcribe, file_transcribe],
["Microphone Input", "Upload .wav file"],
title="Transcribe Sperm Whalish 1+1+3 Coda",
)
demo.launch()