File size: 10,508 Bytes
aed3c8f 6565eac 8f4b13a aed3c8f 6565eac eb971ad 6565eac 8f4b13a 6565eac 8f4b13a 6565eac 8f4b13a 6565eac 8f4b13a 6565eac 8f4b13a e28768f 8f4b13a e28768f 8f4b13a e28768f 8f4b13a e28768f 8f4b13a e28768f 8f4b13a 6565eac 6432e79 6565eac 8f4b13a 5426d38 6565eac 8f4b13a 6565eac 8f4b13a 6565eac 8f4b13a f41e78b 6565eac ba453ae 6565eac 8f4b13a 5426d38 8f4b13a f41e78b 8f4b13a 6565eac f41e78b 8f4b13a 6565eac 5426d38 f41e78b 8f4b13a 6565eac 2daf38b aaff308 6565eac aaff308 2daf38b aaff308 6565eac 2daf38b 8f4b13a 6565eac 8f4b13a 6565eac 2daf38b 6565eac b5ec1b3 6565eac aed3c8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List
# SECURITY NOTE(review): hardcoded secret checked against user input in
# transcribe_whalish — should be moved to an environment variable / HF secret.
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
# VAD framing parameters.
FRAME_LENGTH = 25 # milliseconds
FRAME_SHIFT = 10 # milliseconds
# EnergyVAD takes in different sampling rates but seems to work
# best and avoid drift with 16000 rather than Librosa default of 22050
sr = 16000
# Generally a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000 # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10 # in milliseconds
# Initialize VAD and get the output, which is an array of 0's and 1's, where 0 = frame with silence
# and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    """Group voice-activity frames into segments separated by long silences.

    Args:
        vad_output: list of 0's and 1's, where 0 = silent frame and
            1 = frame with voice activity.
        frame_shift: frame hop in milliseconds.
        min_silence_duration: minimum silence (ms) that separates two groups.

    Returns:
        List of ``{'start': seconds, 'end': seconds}`` dicts, one per
        voice-activity group, times rounded to 4 decimal places.
    """
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None
    silence_counter = 0
    for i, frame in enumerate(vad_output):
        if frame == 1:
            if start_idx is None:
                start_idx = i
            silence_counter = 0
        elif start_idx is not None:
            silence_counter += 1
            if silence_counter >= min_silence_frames:
                # Silence is long enough, so close the current voice group.
                # end_idx is the last frame that still had activity.
                end_idx = i - silence_counter
                groups.append({
                    'start': round(start_idx * frame_shift / 1000, 4),
                    'end': round((end_idx + 1) * frame_shift / 1000, 4)
                })
                start_idx = None
                silence_counter = 0
    # Handle case where audio ends with voice activity.
    # Fixed off-by-one: previously used (len(vad_output) + 1), which overshot
    # the end of the recording by one frame shift; the last frame index is
    # len - 1, so the group ends at len * frame_shift.
    if start_idx is not None:
        groups.append({
            'start': round(start_idx * frame_shift / 1000, 4),
            'end': round(len(vad_output) * frame_shift / 1000, 4)
        })
    return groups
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int=1000):
    """Return timestamps of every silence at least ``min_silence_duration`` long.

    Args:
        vad_output: list of 0's and 1's, where 0 = silent frame and
            1 = frame with voice activity.
        frame_shift: frame hop in milliseconds.
        min_silence_duration: shortest silence (ms) worth reporting.

    Returns:
        List of ``{'start': seconds, 'end': seconds}`` dicts, one per
        qualifying silence, times rounded to 2 decimal places.
    """
    frames_needed = int(min_silence_duration / frame_shift)

    def _seconds(frame_idx):
        # Convert a frame index to a rounded timestamp in seconds.
        return round(frame_idx * frame_shift / 1000, 2)

    silences = []
    silence_start = None
    for idx, frame in enumerate(vad_output):
        if frame == 0:
            # Entering (or continuing) a silent stretch.
            if silence_start is None:
                silence_start = idx
        elif silence_start is not None:
            # Voice resumed: record the stretch only if it was long enough.
            if idx - silence_start >= frames_needed:
                silences.append({'start': _seconds(silence_start), 'end': _seconds(idx)})
            silence_start = None
    # A silence that runs to the end of the audio is closed at the last frame.
    if silence_start is not None and len(vad_output) - silence_start >= frames_needed:
        silences.append({'start': _seconds(silence_start), 'end': _seconds(len(vad_output))})
    return silences
def splitAudioByTimestamps(audio, timestamps, sr):
    """Split an audio signal into one segment per timestamp.

    Args:
        audio: sliceable sample sequence (e.g. a numpy array from librosa).
        timestamps: dicts of ``{'start': seconds, 'end': seconds}``.
        sr: sample rate used to convert seconds to sample indices.

    Returns:
        List of audio segments, one per timestamp, in the same order.
    """
    # Seconds -> sample index, then slice; float() tolerates string times.
    return [
        audio[int(float(ts['start']) * sr):int(float(ts['end']) * sr)]
        for ts in timestamps
    ]
def convert_timestamps_to_durations(timestamps):
    """Convert ``{'start', 'end'}`` timestamp dicts to durations in seconds.

    Args:
        timestamps: dicts with 'start' and 'end' times in seconds
            (numbers or numeric strings).

    Returns:
        List of durations (end - start) rounded to 4 decimal places.
    """
    # Fixed: the original loop variable shadowed the `timestamps` parameter
    # (`for i, timestamps in enumerate(timestamps)`) and never used the index.
    return [round(float(ts['end']) - float(ts['start']), 4) for ts in timestamps]
def transcribe_whalish(file, key):
    """Extract coda and click features from a sperm whale recording and flag 1+1+3 codas.

    Args:
        file: path to an audio file (anything ``librosa.load`` accepts).
        key: API key; must equal ``API_KEY`` or a ``gr.Error`` is raised.

    Returns:
        On success, a dict with keys 'vad', 'codas', 'clicks',
        'inter_coda_intervals', 'inter_click_intervals'; if any pipeline
        step raises, an error-message string instead.
    """
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        # Load audio with Librosa.load and resample to the VAD's rate.
        file, _ = librosa.load(file, sr=sr)
        # Per-frame 0/1 activity for the whole recording.
        vad_output = vad(file)
        # Codas = activity groups separated by >= MIN_SILENCE_BETWEEN_CODAS ms.
        codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # Split the original audio into one trimmed segment per coda.
        coda_audios = splitAudioByTimestamps(file, codas, sr)
        # Individual clicks throughout the entire file.
        clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
        # Silences between codas. (Fixed: removed a stray no-op statement that
        # evaluated this variable and discarded the result.)
        inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)
        # For each coda, analyze the matching audio segment and attach the
        # extracted features to the coda's timestamp dict.
        for i, coda_audio in enumerate(coda_audios):
            # VAD over just this coda's audio.
            vad_output_for_coda = vad(coda_audio)
            codas[i]['vad'] = vad_output_for_coda
            # Timestamps of clicks inside this coda (coda-relative times).
            coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['click_timestamps'] = coda_clicks
            number_of_clicks = len(coda_clicks)
            codas[i]['number_of_clicks'] = number_of_clicks
            # Coda duration: start of first click to end of last click.
            duration = 0
            if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
            codas[i]['duration'] = duration
            # Silences between clicks inside this coda.
            coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['inter_click_intervals'] = coda_inter_click_intervals
            inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
            codas[i]['inter_click_interval_durations'] = inter_click_interval_durations
            # 1+1+3 heuristic: exactly 5 clicks with two long gaps followed by
            # two short gaps (thresholds in seconds).
            if number_of_clicks == 5 and inter_click_interval_durations[0] > 0.23 and inter_click_interval_durations[1] > 0.25 and inter_click_interval_durations[2] <= 0.2 and inter_click_interval_durations[3] <= 0.2:
                codas[i]['content'] = '1+1+3'
            else:
                codas[i]['content'] = ''
        # Re-express the per-coda inter-click intervals in whole-file time by
        # offsetting each one by its coda's start. (Fixed: the nested loops
        # previously reused the same index name `i`, shadowing the outer one.)
        inter_click_intervals = []
        for coda in codas:
            for interval in coda['inter_click_intervals']:
                inter_click_intervals.append({
                    'start': coda['start'] + interval['start'],
                    'end': coda['start'] + interval['end']
                })
        output = {
            'vad': vad_output,
            'codas': codas,
            'clicks': clicks,
            'inter_coda_intervals': inter_coda_intervals,
            'inter_click_intervals': inter_click_intervals
        }
    except Exception as e:
        # Best-effort boundary: surface the failure to the UI as a string
        # rather than crashing the Gradio callback.
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output
# Example audio file offered in the UI; expected to sit next to this script.
examples = [['spermwhale_dominica.wav']]
# Build one vertically stacked transcription UI for a given audio source.
def create_transcription_interface(source):
    """Return a gr.Blocks interface wired to transcribe_whalish.

    Args:
        source: Gradio audio source, e.g. "microphone" or "upload".
    """
    with gr.Blocks() as ui:
        gr.Markdown("""
        Use microphone, upload .wav file, or choose an example below.
        """)
        with gr.Column():
            # Components are created top-to-bottom in display order.
            audio_in = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            results_out = gr.JSON(label="Results")
            key_in = gr.Textbox(label="API Key", type="password")
            # Run the pipeline whenever a new recording/file arrives.
            audio_in.change(fn=transcribe_whalish, inputs=[audio_in, key_in], outputs=results_out)
            gr.Examples(examples=examples, inputs=[audio_in])
    return ui
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
# Present both as tabs in a single app.
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Transcribe Sperm Whalish 1+1+3 Coda",
)
# Launches at import time — standard for a Gradio/HF Spaces entry script.
demo.launch()