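"""Gradio demo that segments a sperm whale recording into codas and clicks
with an energy-based VAD and labels codas matching the 1+1+3 rhythm."""
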
import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List

# NOTE: hardcoded key for this demo; a deployment should load it from an
# environment variable or secrets store instead of committing it to source.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"

# VAD analysis frame parameters, in milliseconds.
FRAME_LENGTH = 25
FRAME_SHIFT = 10

# Sample rate (Hz) that all input audio is resampled to.
sr = 16000

# Minimum silence (ms) separating two codas, and two clicks within a coda.
MIN_SILENCE_BETWEEN_CODAS = 1000
MIN_SILENCE_BETWEEN_CLICKS = 10

# Energy-based voice activity detector; yields one 0/1 flag per frame.
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)

def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    """Group consecutive active VAD frames into start/end timestamps in seconds.

    Active regions separated by less than `min_silence_duration` milliseconds
    of silence are merged into a single group.
    """
    min_silence_frames = int(min_silence_duration / frame_shift)

    groups = []
    start_idx = None
    silence_counter = 0

    for i, frame in enumerate(vad_output):
        if frame == 1:
            if start_idx is None:
                start_idx = i
            silence_counter = 0
        else:
            if start_idx is not None:
                silence_counter += 1
                if silence_counter >= min_silence_frames:
                    # Close the group at the last active frame, excluding the
                    # trailing silence that triggered the split.
                    end_idx = i - silence_counter
                    start_time = start_idx * frame_shift / 1000
                    end_time = (end_idx + 1) * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 4),
                        'end': round(end_time, 4)
                    })
                    start_idx = None
                    silence_counter = 0

    # Close a group that is still open at the end of the signal, again
    # excluding any trailing silence frames.
    if start_idx is not None:
        end_time = (len(vad_output) - silence_counter) * frame_shift / 1000
        groups.append({
            'start': round(start_idx * frame_shift / 1000, 4),
            'end': round(end_time, 4)
        })

    return groups

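# Example with hypothetical values: at frame_shift=10 ms, a 30 ms threshold
# means the 2-frame gap is bridged, while the trailing 4-frame silence closes
# the group at the last active frame:
#
#   get_voice_activity_timestamps(
#       vad_output=[1, 1, 0, 0, 1, 0, 0, 0, 0],
#       frame_shift=10,
#       min_silence_duration=30,
#   )
#   # -> [{'start': 0.0, 'end': 0.05}]
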
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    """Return start/end timestamps (seconds) of silences lasting at least
    `min_silence_duration` milliseconds."""
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None

    for i, frame in enumerate(vad_output):
        if frame == 0:
            if start_idx is None:
                start_idx = i
        else:
            if start_idx is not None:
                end_idx = i
                duration = end_idx - start_idx
                if duration >= min_silence_frames:
                    start_time = start_idx * frame_shift / 1000
                    end_time = end_idx * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 2),
                        'end': round(end_time, 2)
                    })
                start_idx = None

    # Handle a silence that runs to the end of the signal.
    if start_idx is not None:
        end_idx = len(vad_output)
        duration = end_idx - start_idx
        if duration >= min_silence_frames:
            start_time = start_idx * frame_shift / 1000
            end_time = end_idx * frame_shift / 1000
            groups.append({
                'start': round(start_time, 2),
                'end': round(end_time, 2)
            })

    return groups

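# Example with hypothetical values: at frame_shift=10 ms, only the 3-frame
# gap meets a 30 ms minimum; the shorter 2-frame gap is ignored:
#
#   get_timestamps_silences(
#       vad_output=[1, 0, 0, 1, 0, 0, 0, 1],
#       frame_shift=10,
#       min_silence_duration=30,
#   )
#   # -> [{'start': 0.04, 'end': 0.07}]
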
def split_audio_by_timestamps(audio, timestamps, sr):
    """Slice a 1-D sample array into segments given timestamps in seconds."""
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr)
        end_sample = int(float(ts['end']) * sr)
        audio_array.append(audio[start_sample:end_sample])
    return audio_array


def convert_timestamps_to_durations(timestamps):
    """Convert a list of {'start', 'end'} timestamps to durations in seconds."""
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations

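# e.g. convert_timestamps_to_durations([{'start': 0.1, 'end': 0.35}]) -> [0.25]
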
def transcribe_whalish(file, key):
    """Segment a recording into codas and clicks and label 1+1+3 codas."""
    if key != API_KEY:
        raise gr.Error("Invalid API key.")

    try:
        # Load and resample the recording to the VAD's sample rate.
        audio, _ = librosa.load(file, sr=sr)

        # Frame-level voice activity for the whole recording.
        vad_output = vad(audio)

        # Codas: activity groups separated by at least 1 s of silence.
        codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        coda_audios = split_audio_by_timestamps(audio, codas, sr)

        # Individual clicks: activity separated by at least 10 ms of silence.
        clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)

        inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # Analyze each coda in isolation.
        for i, coda_audio in enumerate(coda_audios):
            vad_output_for_coda = vad(coda_audio)
            codas[i]['vad'] = vad_output_for_coda

            coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['click_timestamps'] = coda_clicks

            number_of_clicks = len(coda_clicks)
            codas[i]['number_of_clicks'] = number_of_clicks

            # Coda duration: start of its first click to end of its last.
            duration = 0
            if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
            codas[i]['duration'] = duration

            coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['inter_click_intervals'] = coda_inter_click_intervals

            inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
            codas[i]['inter_click_interval_durations'] = inter_click_interval_durations

            # A 1+1+3 coda is five clicks: two lone clicks followed by three
            # in quick succession, so its four inter-click intervals should
            # read long, long, short, short.
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
                codas[i]['content'] = '1+1+3'
            else:
                codas[i]['content'] = ''

        # Shift each coda's inter-click intervals back into recording time.
        inter_click_intervals = []
        for coda in codas:
            for interval in coda['inter_click_intervals']:
                inter_click_intervals.append({
                    'start': coda['start'] + interval['start'],
                    'end': coda['start'] + interval['end']
                })

        output = {
            'vad': vad_output,
            'codas': codas,
            'clicks': clicks,
            'inter_coda_intervals': inter_coda_intervals,
            'inter_click_intervals': inter_click_intervals
        }

    except Exception as e:
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"

    return output

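# Illustrative shape of the result on success (values hypothetical):
#
#   {
#     'vad': [...],                      # frame-level 0/1 activity
#     'codas': [{'start': 1.2, 'end': 2.3, 'number_of_clicks': 5,
#                'duration': 1.05, 'content': '1+1+3', ...}],
#     'clicks': [{'start': 1.2, 'end': 1.22}, ...],
#     'inter_coda_intervals': [{'start': 2.3, 'end': 4.1}, ...],
#     'inter_click_intervals': [{'start': 1.22, 'end': 1.45}, ...]
#   }
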
examples = [['spermwhale_dominica.wav']]


def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below.
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
            audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
            gr.Examples(examples=examples, inputs=[audio_input])
    return interface


mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Transcribe Sperm Whalish 1+1+3 Coda",
)

demo.launch()