import os

import gradio as gr
import numpy as np
from transformers import pipeline

# Load the fine-tuned model once at import time using the HF pipeline API.
# HF_TOKEN is read from the environment because the model repo may be
# private/gated — without it, loading will fail for non-public repos.
model_path = "podcasts-org/detect-background-music"
classifier = pipeline(
    "audio-classification",
    model=model_path,
    token=os.getenv("HF_TOKEN"),
)


def classify_audio(audio):
    """Classify whether audio contains background music.

    Args:
        audio: A ``(sample_rate, audio_array)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` if no audio was provided.

    Returns:
        A ``{label: score}`` dict suitable for ``gr.Label``, or an error
        message string when the input is missing or empty.
    """
    if audio is None:
        return "Please provide an audio file"

    # audio is a tuple of (sample_rate, audio_array)
    sample_rate, audio_array = audio

    # Guard against zero-length recordings: downstream mean()/feature
    # extraction would fail with an opaque error otherwise.
    if audio_array.size == 0:
        return "Please provide an audio file"

    # Normalize integer PCM to [-1, 1] float32; cast any other dtype
    # (e.g. float64) to float32, which is what the feature extractor expects.
    if audio_array.dtype == np.int16:
        audio_array = audio_array.astype(np.float32) / 32768.0
    elif audio_array.dtype == np.int32:
        audio_array = audio_array.astype(np.float32) / 2147483648.0
    elif audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)

    # Convert stereo (samples, channels) to mono by averaging channels.
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)

    # Use the pipeline for inference.
    # Pipeline expects dict with "array" and "sampling_rate" keys.
    predictions = classifier({"array": audio_array, "sampling_rate": sample_rate})

    # Convert list of dicts to single dict for Gradio Label component.
    results = {pred["label"]: pred["score"] for pred in predictions}
    return results


# Create Gradio interface
demo = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="numpy", label="Upload Audio"),
    outputs=gr.Label(num_top_classes=2, label="Prediction"),
    title="Background Music Detection",
    description="Upload an audio file to detect whether it contains background music (BGM) or not. Model: Whisper-base fine-tuned on podcasts-org/bgm dataset.",
    examples=None,
)

if __name__ == "__main__":
    demo.launch()