import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List

# NOTE: hardcoded for demo purposes; in production this should come from an
# environment variable or a secrets store rather than living in the source
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
FRAME_LENGTH = 25 # milliseconds
FRAME_SHIFT = 10 # milliseconds

# EnergyVAD accepts a range of sampling rates, but it seems to work best
# (and avoid drift) at 16000 Hz rather than Librosa's default of 22050 Hz
sr = 16000

# Generally, a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000 # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10 # in milliseconds

# Initialize the VAD. Calling it on audio returns an array of 0's and 1's,
# where 0 = frame with silence and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)
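
# Rough framing math (a sketch implied by the settings above, not an exact
# EnergyVAD spec): one second of audio at a 10 ms frame shift yields on the
# order of 1000 / FRAME_SHIFT = 100 frames, each flagged 0 or 1.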

# Function that takes in
#  - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
#  - frame_shift in ms
#  - min_silence_duration, the minimum silence in ms that separates one group of activity from the next
# Returns periods of voice activity separated by silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)

    groups = []
    start_idx = None
    silence_counter = 0

    for i, frame in enumerate(vad_output):
        if frame == 1:
            if start_idx is None:
                start_idx = i
            silence_counter = 0
        else:
            if start_idx is not None:
                silence_counter += 1
                if silence_counter >= min_silence_frames:
                    # Silence is long enough, so close the current voice group
                    end_idx = i - silence_counter
                    start_time = start_idx * frame_shift / 1000
                    end_time = (end_idx + 1) * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 4),
                        'end': round(end_time, 4)
                    })
                    start_idx = None
                    silence_counter = 0

    # Handle case where audio ends with voice activity
    if start_idx is not None:
        # len(vad_output) frames span len(vad_output) * frame_shift ms in total
        end_time = len(vad_output) * frame_shift / 1000
        groups.append({
            'start': round(start_idx * frame_shift / 1000, 4),
            'end': round(end_time, 4)
        })

    return groups
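
# Illustrative example (hypothetical input, not from the original recording):
# with frame_shift=10 ms and a 30 ms minimum silence, a 7-frame VAD sequence
# splits into two voiced groups:
#   get_voice_activity_timestamps(vad_output=[1, 1, 0, 0, 0, 1, 1],
#                                 frame_shift=10, min_silence_duration=30)
#   -> [{'start': 0.0, 'end': 0.02}, {'start': 0.05, 'end': 0.07}]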


# Function that takes in
#  - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
#  - frame_shift in ms
#  - min_silence_duration, the minimum silence in ms for a gap to be reported
# Returns timestamps for silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None

    for i, frame in enumerate(vad_output):
        if frame == 0:
            if start_idx is None:
                start_idx = i
        else:
            if start_idx is not None:
                end_idx = i
                duration = end_idx - start_idx

                if duration >= min_silence_frames:
                    start_time = start_idx * frame_shift / 1000
                    end_time = end_idx * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 2),
                        'end': round(end_time, 2)
                    })

                start_idx = None

    # Handle case where the last segment goes to the end
    if start_idx is not None:
        end_idx = len(vad_output)
        duration = end_idx - start_idx
        if duration >= min_silence_frames:
            start_time = start_idx * frame_shift / 1000
            end_time = end_idx * frame_shift / 1000
            groups.append({
                'start': round(start_time, 2),
                'end': round(end_time, 2)
            })

    return groups
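
# Illustrative example (hypothetical input): the same 7-frame sequence from the
# example above contains one silence long enough to report:
#   get_timestamps_silences(vad_output=[1, 1, 0, 0, 0, 1, 1],
#                           frame_shift=10, min_silence_duration=30)
#   -> [{'start': 0.02, 'end': 0.05}]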


# Function to split an audio signal into an array of individual segments by timestamp.
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
def split_audio_by_timestamps(audio, timestamps, sr):
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr) # convert start time into a sample index
        end_sample = int(float(ts['end']) * sr)     # convert end time into a sample index
        segment = audio[start_sample:end_sample]    # extract the segment between the two indices
        audio_array.append(segment)
    return audio_array
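
# Illustrative example: with sr=16000, a timestamp of {'start': 0.5, 'end': 1.0}
# selects samples 8000:16000, i.e. the second half-second of audio.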

# Convert timestamps into durations
# Returns a list of durations in seconds, one per timestamp
def convert_timestamps_to_durations(timestamps):
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations
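
# Illustrative example: convert_timestamps_to_durations([{'start': 0.1, 'end': 0.35}]) -> [0.25]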

# Function to extract features from sperm whale codas and identify 1+1+3 codas
def transcribe_whalish(file, key):
    if key != API_KEY:
        raise gr.Error("Invalid API key.")

    try:
        # Load audio with librosa.load and resample to the VAD's sampling rate
        audio, _ = librosa.load(file, sr=sr)

        # Run VAD over the whole file
        vad_output = vad(audio)

        # Get timestamps for codas
        codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # Split the original audio into an array of segments, one per coda,
        # trimming off the silence at the beginning and end of each
        coda_audios = split_audio_by_timestamps(audio, codas, sr)

        # Get timestamps for individual clicks throughout the file
        clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)

        # Get timestamps for silences between codas
        inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # For each coda, look at the corresponding audio segment in coda_audios,
        # extract features from it, and save that info onto the matching entry in codas
        for i, coda_audio in enumerate(coda_audios):
            # get vad_output for each coda
            vad_output_for_coda = vad(coda_audio)
            codas[i]['vad'] = vad_output_for_coda

            # Get the timestamps for clicks inside each coda
            coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['click_timestamps'] = coda_clicks
            
            # Use timestamps of the clicks to find the total number of clicks
            number_of_clicks = len(coda_clicks)
            codas[i]['number_of_clicks'] = number_of_clicks

            # Use timestamps of the clicks to find total duration of the coda (time from beginning of first click
            # to end of last click)
            duration = 0
            if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
            codas[i]['duration'] = duration

            # Use VAD output to extract timestamps of the silences
            coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['inter_click_intervals'] = coda_inter_click_intervals

            # Get the inter-click-interval durations in seconds
            inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
            codas[i]['inter_click_interval_durations'] = inter_click_interval_durations
            
            # Check if the coda conforms to 1+1+3 using a simple threshold rule:
            # exactly 5 clicks where the first two inter-click intervals are long
            # pauses and the last two are short, i.e.
            # click - pause - click - pause - click click click
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
                codas[i]['content'] = '1+1+3'
            else:
                codas[i]['content'] = ''

        # Shift each coda's inter-click-interval timestamps so they are relative
        # to the entire file rather than to the coda's own trimmed audio
        inter_click_intervals = []
        for coda in codas:
            for ici in coda['inter_click_intervals']:
                inter_click_intervals.append({
                    'start': coda['start'] + ici['start'],
                    'end': coda['start'] + ici['end']
                })

        output = {
            'vad': vad_output,
            'codas': codas,
            'clicks': clicks,
            'inter_coda_intervals': inter_coda_intervals,
            'inter_click_intervals': inter_click_intervals
        }

    except Exception as e:
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"

    return output

examples = [['spermwhale_dominica.wav']]

# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use microphone, upload .wav file, or choose an example below.
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface

# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Transcribe Sperm Whalish 1+1+3 Coda",
)

demo.launch()