# Hugging Face Space: Pheire/accent-detector (status: Running)
# File size: 4,962 bytes

import gradio as gr
from speechbrain.inference import EncoderClassifier
import torch
import requests
import subprocess
import os
import uuid
import yt_dlp

model = None  # Module-level cache; filled in by the first get_model() call


def get_model():
    """Return the shared accent classifier, creating it on first use.

    The classifier is built with ``EncoderClassifier.from_hparams`` from the
    "Jzuluaga/accent-id-commonaccent_ecapa" source and stored in the
    module-level ``model`` variable, so later calls reuse the same object.
    """
    global model
    if model is not None:
        # Already loaded by an earlier call.
        return model
    model = EncoderClassifier.from_hparams("Jzuluaga/accent-id-commonaccent_ecapa")
    return model

def extract_id_from_url(url):
    """Return the last path segment of *url*.

    The query string and fragment are discarded before the path is split,
    and trailing slashes are ignored, so all of the following yield
    ``"abc123"``::

        https://www.loom.com/share/abc123
        https://www.loom.com/share/abc123?sid=1
        https://www.loom.com/share/abc123/?next=/home#t=5

    Args:
        url: The URL (or bare path) to extract the identifier from.

    Returns:
        The final non-empty path segment, or ``""`` if the path has none.
    """
    from urllib.parse import urlsplit

    # Work on the path component only: a query or fragment may itself contain
    # "/" and must not be mistaken for a path separator.
    path = urlsplit(url).path
    return path.rstrip("/").rsplit("/", 1)[-1]

def fetch_loom_download_url(id):
    """Ask Loom's transcoded-url endpoint for a video's download URL.

    Sends a POST to
    ``https://www.loom.com/api/campaigns/sessions/{id}/transcoded-url``.

    Args:
        id: Identifier inserted into the endpoint path. The parameter name
            shadows the ``id`` builtin but is kept so keyword callers keep
            working.

    Returns:
        The ``"url"`` field of the JSON response when the status code is 200
        (``None`` if the field is absent); ``None`` for any other status.
    """
    response = requests.post(
        url=f"https://www.loom.com/api/campaigns/sessions/{id}/transcoded-url",
        # Without a timeout requests waits indefinitely on a stalled server.
        timeout=30,
    )
    if response.status_code != 200:
        print("Error while retrieving response: ", response.status_code)
        # A bare `exit` expression does nothing; make the failure result
        # explicit instead of relying on the implicit fall-through.
        return None
    return response.json().get("url")



def download_loom_video(url, filename):
    """Stream the video at *url* into the local file *filename*.

    The request is sent with a "Mozilla/5.0" User-Agent header and the body
    is written in 8192-byte chunks.

    Args:
        url: Download URL of the video.
        filename: Path of the file to create/overwrite.

    Returns:
        *filename* on success, or ``None`` if requests reported an error
        (including a non-success HTTP status).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}

    try:
        with requests.get(url, headers=request_headers, stream=True) as resp:
            resp.raise_for_status()

            with open(filename, "wb") as out_file:
                # filter(None, ...) drops empty chunks.
                for block in filter(None, resp.iter_content(chunk_size=8192)):
                    out_file.write(block)
    except requests.exceptions.RequestException as err:
        print(f"Failed to download Loom video: {err}")
        return None

    print(f"Downloaded video to {filename}")
    return filename

def download_direct_mp4(url, filename):
    """Stream the file at *url* into the local file *filename*.

    Args:
        url: Direct URL of the media file.
        filename: Path of the file to create/overwrite.

    Returns:
        *filename* on success, or ``None`` if anything went wrong (the error
        is printed).
    """
    try:
        # Use the response as a context manager so the streamed connection is
        # released even when the download fails part-way; the timeout keeps a
        # stalled server from blocking the request forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return filename
    except Exception as e:
        # Deliberately broad: callers rely on a None result, not an
        # exception, to detect a failed download.
        print(f"Error downloading direct mp4: {e}")
        return None
        

def download_video_from_url(url):
    """Download the media behind *url* to a local file and return its path.

    Three strategies are tried, in order:

    1. URLs containing "loom.com": resolve the download URL through
       ``fetch_loom_download_url`` and save it as ``LoomVideo_<id>.mp4``.
    2. URLs whose path ends in ".mp4": download directly to
       ``video_<uuid>.mp4``.
    3. Anything else: hand the URL to yt_dlp ("bestaudio/best" format),
       writing to ``video_<uuid>.mp4``.

    Args:
        url: The URL to download.

    Returns:
        Path of the downloaded file, or ``None`` when a Loom or direct
        download fails. Errors raised by yt_dlp are not caught here.
    """
    from urllib.parse import urlsplit

    if "loom.com" in url:
        video_id = extract_id_from_url(url)
        print(video_id)
        direct_url = fetch_loom_download_url(video_id)
        print(direct_url)
        if not direct_url:
            # The lookup failed; there is nothing to download.
            return None
        filename = f"LoomVideo_{video_id}.mp4"
        success = download_loom_video(direct_url, filename)
        print(success)
        return filename if success else None

    # Besides the plain suffix test, also inspect the path component so that
    # links such as ".../clip.mp4?token=..." or ".../CLIP.MP4" are recognised
    # as direct downloads.
    if url.endswith(".mp4") or urlsplit(url).path.lower().endswith(".mp4"):
        filename = f"video_{uuid.uuid4()}.mp4"
        return download_direct_mp4(url, filename)

    # fallback to yt_dlp for youtube, vimeo, etc.
    out_path = f"video_{uuid.uuid4()}.mp4"
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': out_path,
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return out_path



def extract_audio(video_file):
    """Extract the audio track of *video_file* into a new WAV file via ffmpeg.

    The output is written to ``audio_<uuid>.wav`` in the current directory
    with video disabled (``-vn``), the ``pcm_s16le`` codec, one channel and a
    16000 Hz sample rate.

    Args:
        video_file: Path of the input media file.

    Returns:
        Path of the WAV file that was written.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    audio_path = f"audio_{uuid.uuid4()}.wav"
    cmd = [
        # "-y" is a global option, so it goes before the input/output files.
        "ffmpeg", "-y", "-i", video_file, "-vn",
        "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
        audio_path,
    ]
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # Fail here with ffmpeg's own diagnostics rather than handing the
        # caller a path to a file that was never written.
        detail = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed (exit code {completed.returncode}) "
            f"for {video_file!r}: {detail[-500:]}"
        )
    return audio_path


def classify_accent(input_file_or_url):
    """Classify the accent spoken in a media file or at a URL.

    Args:
        input_file_or_url: A string starting with "http" (downloaded first),
            an object with a ``name`` attribute holding a file path, or a
            file path itself.

    Returns:
        A 3-tuple of strings: the top predicted label, its confidence as a
        percentage with two decimals, and a newline-separated list of the
        three highest-scoring labels with their percentages.

    Raises:
        ValueError: If the URL could not be downloaded.
    """
    classifier = get_model()

    # Check if it's a URL
    is_url = isinstance(input_file_or_url, str) and input_file_or_url.startswith("http")
    if is_url:
        video_path = download_video_from_url(input_file_or_url)
        if not video_path:
            # The download helpers signal failure with None; stop with a clear
            # message instead of passing None on to ffmpeg.
            raise ValueError(f"Could not download media from {input_file_or_url}")
    else:
        video_path = input_file_or_url.name if hasattr(input_file_or_url, "name") else input_file_or_url

    # Files created by this call; a user-supplied local file is never deleted.
    temp_files = [video_path] if is_url else []
    try:
        audio_path = extract_audio(video_path)
        temp_files.append(audio_path)
        out_probs, top_prob, _top_idx, label = classifier.classify_file(audio_path)
    finally:
        # Remove the intermediates so repeated requests do not fill the disk.
        for path in temp_files:
            if os.path.exists(path):
                os.remove(path)

    # Compute the top-3 once and reuse both values and indices.
    top_values, top_indices = torch.topk(out_probs, 3)
    top_labels = classifier.hparams.label_encoder.decode_ndim(top_indices.squeeze())
    confidences = top_values.squeeze().tolist()
    result = "\n".join(f"{l}: {p*100:.2f}%" for l, p in zip(top_labels, confidences))

    return label[0], f"{top_prob.item()*100:.2f}%", result

# Gradio UI: inputs (file upload or URL) on the left, predictions on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Accent Identifier")
    gr.Markdown(
        "Upload a video or audio file, or paste a link (e.g. direct .mp4 URL or Loom video) to identify the speaker's accent."
    )

    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Upload video/audio file", file_types=[".mp4", ".wav", ".mp3"])
            url_input = gr.Textbox(label="...or paste a direct mp4 URL/loom link")
            submit_btn = gr.Button("Classify Accent")

        with gr.Column():
            label_output = gr.Textbox(label="Top Prediction")
            confidence_output = gr.Textbox(label="Confidence")
            top3_output = gr.Textbox(label="Top 3 Predictions")

    def handle_inputs(file, url):
        """Dispatch the button click to classify_accent.

        A non-blank URL takes precedence over an uploaded file. When neither
        is provided, placeholder values for the three output boxes are
        returned.
        """
        # Pasted links often carry stray whitespace; a blank or padded string
        # would otherwise be treated as a local file path downstream.
        url = (url or "").strip()
        if url:
            return classify_accent(url)
        if file:
            return classify_accent(file)
        return "No input", "", ""

    submit_btn.click(handle_inputs, inputs=[input_file, url_input], outputs=[label_output, confidence_output, top3_output])

if __name__ == "__main__":
    demo.launch(share=True)