import gradio as gr
import transformers
import librosa

# Load the Ultravox model pipeline (custom pipeline, so trust_remote_code is required)
pipe = transformers.pipeline(
    model='fixie-ai/ultravox-v0_5-llama-3_1-8b',
    trust_remote_code=True
)


def transcribe(audio):
    if audio is None:
        return "No audio provided."

    # Load the uploaded file and resample to 16 kHz, the rate Ultravox expects
    audio_array, sr = librosa.load(audio, sr=16000)

    # Initial system prompt that sets the assistant's persona
    turns = [
        {
            "role": "system",
            "content": "You are a friendly and helpful character. You love to answer questions for people."
        },
    ]

    # Run inference
    result = pipe(
        {'audio': audio_array, 'turns': turns, 'sampling_rate': sr},
        max_new_tokens=30
    )

    # The pipeline may return a list of message dicts or a plain string
    return result[0]['content'] if isinstance(result, list) else str(result)


# Build the Gradio interface (`sources` takes a list in Gradio 4+)
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload an Audio File"),
    outputs=gr.Textbox(label="Ultravox Response"),
    title="🎙️ Ultravox AI Voicebot",
    description="Upload an audio file and Ultravox will respond intelligently!"
)

# Launch the app
if __name__ == "__main__":
    demo.launch()