import os

import gradio as gr
import numpy as np
from transformers import pipeline

# Load the fine-tuned model once at import time using the HF pipeline API.
# HF_TOKEN is read from the environment because the model repo may be
# private/gated — without it, loading will fail for non-public repos.
model_path = "podcasts-org/detect-background-music"
classifier = pipeline(
    "audio-classification",
    model=model_path,
    token=os.getenv("HF_TOKEN"),
)


def classify_audio(audio):
    """Classify whether audio contains background music.

    Args:
        audio: A ``(sample_rate, audio_array)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` if no audio was provided.

    Returns:
        A ``{label: score}`` dict suitable for ``gr.Label``, or an error
        message string when the input is missing or empty.
    """
    if audio is None:
        return "Please provide an audio file"

    # audio is a tuple of (sample_rate, audio_array)
    sample_rate, audio_array = audio

    # Guard against zero-length recordings: downstream mean()/feature
    # extraction would fail with an opaque error otherwise.
    if audio_array.size == 0:
        return "Please provide an audio file"

    # Normalize integer PCM to [-1, 1] float32; cast any other dtype
    # (e.g. float64) to float32, which is what the feature extractor expects.
    if audio_array.dtype == np.int16:
        audio_array = audio_array.astype(np.float32) / 32768.0
    elif audio_array.dtype == np.int32:
        audio_array = audio_array.astype(np.float32) / 2147483648.0
    elif audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)

    # Convert stereo (samples, channels) to mono by averaging channels.
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)

    # Use the pipeline for inference.
    # Pipeline expects dict with "array" and "sampling_rate" keys.
    predictions = classifier({"array": audio_array, "sampling_rate": sample_rate})

    # Convert list of dicts to single dict for Gradio Label component.
    results = {pred["label"]: pred["score"] for pred in predictions}
    return results


# Create Gradio interface
demo = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="numpy", label="Upload Audio"),
    outputs=gr.Label(num_top_classes=2, label="Prediction"),
    title="Background Music Detection",
    description="Upload an audio file to detect whether it contains background music (BGM) or not. Model: Whisper-base fine-tuned on podcasts-org/bgm dataset.",
    examples=None,
)

if __name__ == "__main__":
    demo.launch()