import gradio as gr
import librosa
from vad import EnergyVAD
from typing import List

API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"

FRAME_LENGTH = 25  # milliseconds
FRAME_SHIFT = 10   # milliseconds

# EnergyVAD accepts different sampling rates but seems to work best and avoid
# drift with 16000 rather than the Librosa default of 22050
sr = 16000

# Generally a silence longer than a second should separate one coda from another
MIN_SILENCE_BETWEEN_CODAS = 1000  # in milliseconds
MIN_SILENCE_BETWEEN_CLICKS = 10   # in milliseconds

# Initialize the VAD; calling it returns an array of 0's and 1's, where 0 = frame with silence
# and 1 = frame with voice activity above the energy threshold
vad = EnergyVAD(frame_length=FRAME_LENGTH, frame_shift=FRAME_SHIFT, sample_rate=sr, energy_threshold=0.3)


# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum length of silence in ms that separates one group of sound from the next
# Returns periods of voice activity separated by silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_voice_activity_timestamps(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None
    silence_counter = 0
    for i, frame in enumerate(vad_output):
        if frame == 1:
            if start_idx is None:
                start_idx = i
            silence_counter = 0
        else:
            if start_idx is not None:
                silence_counter += 1
                if silence_counter >= min_silence_frames:
                    # Silence is long enough, so close the current voice group
                    end_idx = i - silence_counter
                    start_time = start_idx * frame_shift / 1000
                    end_time = (end_idx + 1) * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 4),
                        'end': round(end_time, 4)
                    })
                    start_idx = None
                    silence_counter = 0
    # Handle case where the audio ends with voice activity
    if start_idx is not None:
        end_time = len(vad_output) * frame_shift / 1000
        groups.append({
            'start': round(start_idx * frame_shift / 1000, 4),
            'end': round(end_time, 4)
        })
    return groups


# Function that takes in
# - vad_output as a List of 0's and 1's, where 0 = frame with silence and 1 = voice activity
# - frame_shift in ms
# - min_silence_duration, the minimum length of silence in ms that will be included
# Returns timestamps for silences longer than min_silence_duration
# Output format: [{'start': number, 'end': number}]
def get_timestamps_silences(*, vad_output: List[int], frame_shift: int, min_silence_duration: int = 1000):
    min_silence_frames = int(min_silence_duration / frame_shift)
    groups = []
    start_idx = None
    for i, frame in enumerate(vad_output):
        if frame == 0:
            if start_idx is None:
                start_idx = i
        else:
            if start_idx is not None:
                end_idx = i
                duration = end_idx - start_idx
                if duration >= min_silence_frames:
                    start_time = start_idx * frame_shift / 1000
                    end_time = end_idx * frame_shift / 1000
                    groups.append({
                        'start': round(start_time, 2),
                        'end': round(end_time, 2)
                    })
                start_idx = None
    # Handle case where the last silence runs to the end of the audio
    if start_idx is not None:
        end_idx = len(vad_output)
        duration = end_idx - start_idx
        if duration >= min_silence_frames:
            start_time = start_idx * frame_shift / 1000
            end_time = end_idx * frame_shift / 1000
            groups.append({
                'start': round(start_time, 2),
                'end': round(end_time, 2)
            })
    return groups
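
# Worked example (illustrative only; not called by the app): with a 10 ms frame shift
# and a 30 ms minimum silence, a toy frame sequence
#   get_voice_activity_timestamps(vad_output=[1, 1, 0, 0, 0, 0, 1, 1, 1],
#                                 frame_shift=10, min_silence_duration=30)
# groups the first two active frames and the last three into
#   [{'start': 0.0, 'end': 0.02}, {'start': 0.06, 'end': 0.09}]
# while get_timestamps_silences with the same arguments returns the gap between them:
#   [{'start': 0.02, 'end': 0.06}]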

# Function to split an audio signal into an array of individual audio segments by timestamp.
# Assumes timestamps are in format {'start': time in seconds, 'end': time in seconds}
def splitAudioByTimestamps(audio, timestamps, sr):
    audio_array = []
    for ts in timestamps:
        start_sample = int(float(ts['start']) * sr)  # convert start time into a sample index
        end_sample = int(float(ts['end']) * sr)      # convert end time into a sample index
        segment = audio[start_sample:end_sample]     # extract the segment between the start and end samples
        audio_array.append(segment)                  # append the segment to the array
    return audio_array


# Convert timestamps in format {'start': time in seconds, 'end': time in seconds}
# into a list of durations in seconds
def convert_timestamps_to_durations(timestamps):
    durations = []
    for ts in timestamps:
        durations.append(round(float(ts['end']) - float(ts['start']), 4))
    return durations
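
# Worked example (illustrative only; not called by the app): with sr = 16000,
#   splitAudioByTimestamps(audio, [{'start': 0.02, 'end': 0.06}], 16000)
# returns a single segment covering samples 320..960 of `audio`, and
#   convert_timestamps_to_durations([{'start': 0.02, 'end': 0.06}])
# returns [0.04].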

# Function to extract features from sperm whale codas and identify 1+1+3 codas
def transcribe_whalish(file, key):
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    try:
        # Load the audio with librosa.load and resample it
        file, _ = librosa.load(file, sr=sr)

        # Get the VAD output for the file
        vad_output = vad(file)

        # Get timestamps for codas
        codas = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # Split the original audio into an array of segments, one per coda, trimming the silence at the beginning and end
        coda_audios = splitAudioByTimestamps(file, codas, sr)

        # Get timestamps for individual clicks throughout the file
        clicks = get_voice_activity_timestamps(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)

        # Get timestamps for silences between codas
        inter_coda_intervals = get_timestamps_silences(vad_output=vad_output, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CODAS)

        # For each coda, look at the audio in coda_audios at the same position in the array,
        # extract features about it, and save that info back onto the coda entry
        for i, coda_audio in enumerate(coda_audios):
            # Get the VAD output for this coda
            vad_output_for_coda = vad(coda_audio)
            codas[i]['vad'] = vad_output_for_coda

            # Get the timestamps for clicks inside the coda
            coda_clicks = get_voice_activity_timestamps(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['click_timestamps'] = coda_clicks

            # Use the click timestamps to find the total number of clicks
            number_of_clicks = len(coda_clicks)
            codas[i]['number_of_clicks'] = number_of_clicks

            # Use the click timestamps to find the total duration of the coda
            # (time from the beginning of the first click to the end of the last click)
            duration = 0
            if number_of_clicks > 0:
                duration = float(coda_clicks[-1]['end']) - float(coda_clicks[0]['start'])
            codas[i]['duration'] = duration

            # Use the VAD output to extract timestamps of the silences between clicks
            coda_inter_click_intervals = get_timestamps_silences(vad_output=vad_output_for_coda, frame_shift=FRAME_SHIFT, min_silence_duration=MIN_SILENCE_BETWEEN_CLICKS)
            codas[i]['inter_click_intervals'] = coda_inter_click_intervals

            # Get the inter-click-interval durations in seconds
            inter_click_interval_durations = convert_timestamps_to_durations(coda_inter_click_intervals)
            codas[i]['inter_click_interval_durations'] = inter_click_interval_durations

            # Check whether the coda conforms to 1+1+3 using a simple formula
            if (number_of_clicks == 5
                    and inter_click_interval_durations[0] > 0.23
                    and inter_click_interval_durations[1] > 0.25
                    and inter_click_interval_durations[2] <= 0.2
                    and inter_click_interval_durations[3] <= 0.2):
                codas[i]['content'] = '1+1+3'
            else:
                codas[i]['content'] = ''

        # Calculate timestamps for the inter-click intervals inside the codas,
        # but expressed relative to the entire file
        inter_click_intervals = []
        for coda in codas:
            for inter_click_interval in coda['inter_click_intervals']:
                new_interval = {'start': coda['start'] + inter_click_interval['start'],
                                'end': coda['start'] + inter_click_interval['end']}
                inter_click_intervals.append(new_interval)

        output = {
            'vad': vad_output,
            'codas': codas,
            'clicks': clicks,
            'inter_coda_intervals': inter_coda_intervals,
            'inter_click_intervals': inter_click_intervals
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        output = f"An error occurred: {e}"
    return output


examples = [['spermwhale_dominica.wav']]


# Function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below.
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        audio_input.change(fn=transcribe_whalish, inputs=[audio_input, api_key_input], outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface


# Create two interfaces (one for the microphone, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Transcribe Sperm Whalish 1+1+3 Coda",
)

demo.launch()
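
# Quick manual check without the UI (a sketch; assumes spermwhale_dominica.wav from
# the Examples above sits next to this script):
#   result = transcribe_whalish("spermwhale_dominica.wav", API_KEY)
# The returned dict has the keys 'vad', 'codas', 'clicks', 'inter_coda_intervals',
# and 'inter_click_intervals'; each entry in 'codas' carries 'content' == '1+1+3'
# when its click pattern matches.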