import subprocess
import uuid

import gradio as gr
import requests
import torch
import yt_dlp
from speechbrain.inference import EncoderClassifier

model = None  # Lazy-loaded model


def get_model():
    """Load the accent-ID model on first use and cache it globally."""
    global model
    if model is None:
        model = EncoderClassifier.from_hparams("Jzuluaga/accent-id-commonaccent_ecapa")
    return model


def extract_id_from_url(url):
    """Extract the Loom video ID from a share URL."""
    video_id = url.split("/")[-1]
    if "?" in video_id:
        video_id = video_id.split("?")[0]
    return video_id


def fetch_loom_download_url(video_id):
    """Ask Loom's API for a direct, transcoded download URL."""
    response = requests.post(
        url=f"https://www.loom.com/api/campaigns/sessions/{video_id}/transcoded-url"
    )
    if response.status_code == 200:
        return response.json()["url"]
    print("Error while retrieving response:", response.status_code)
    return None


def download_loom_video(url, filename):
    """Stream the Loom video to disk; return the filename or None on failure."""
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        with requests.get(url, headers=headers, stream=True) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f"Downloaded video to {filename}")
        return filename
    except requests.exceptions.RequestException as e:
        print(f"Failed to download Loom video: {e}")
        return None


def download_direct_mp4(url, filename):
    """Download a direct .mp4 link to disk; return the filename or None on failure."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        return filename
    except Exception as e:
        print(f"Error downloading direct mp4: {e}")
        return None


def download_video_from_url(url):
    """Route a URL to the right downloader: Loom, direct .mp4, or yt-dlp fallback."""
    if "loom.com" in url:
        video_id = extract_id_from_url(url)
        direct_url = fetch_loom_download_url(video_id)
        if direct_url is None:
            return None
        filename = f"LoomVideo_{video_id}.mp4"
        return download_loom_video(direct_url, filename)
    elif url.endswith(".mp4"):
        filename = f"video_{uuid.uuid4()}.mp4"
        return download_direct_mp4(url, filename)
    else:
        # Fall back to yt-dlp for YouTube, Vimeo, etc.
        out_path = f"video_{uuid.uuid4()}.mp4"
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": out_path,
            "quiet": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return out_path


def extract_audio(video_file):
    """Convert the video's audio track to 16 kHz mono PCM WAV with ffmpeg."""
    audio_path = f"audio_{uuid.uuid4()}.wav"
    cmd = [
        "ffmpeg", "-i", video_file,
        "-vn", "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
        audio_path, "-y",
    ]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return audio_path


def classify_accent(input_file_or_url):
    """Classify the accent in an uploaded file or a video URL."""
    model = get_model()

    # Treat strings starting with "http" as URLs; anything else as an uploaded file.
    if isinstance(input_file_or_url, str) and input_file_or_url.startswith("http"):
        video_path = download_video_from_url(input_file_or_url)
        if video_path is None:
            return "Download failed", "", ""
    else:
        video_path = (
            input_file_or_url.name
            if hasattr(input_file_or_url, "name")
            else input_file_or_url
        )

    audio_path = extract_audio(video_path)
    out_probs, top_prob, top_idx, label = model.classify_file(audio_path)

    # Decode the top-3 labels and their probabilities.
    top3 = torch.topk(out_probs, 3)
    top_labels = model.hparams.label_encoder.decode_ndim(top3.indices.squeeze())
    confidences = top3.values.squeeze().tolist()
    result = "\n".join(f"{l}: {p * 100:.2f}%" for l, p in zip(top_labels, confidences))

    return label[0], f"{top_prob.item() * 100:.2f}%", result


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Accent Identifier")
    gr.Markdown(
        "Upload a video or audio file, or paste a link (e.g. a direct .mp4 URL or a "
        "Loom video) to identify the speaker's accent."
    )
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload video/audio file", file_types=[".mp4", ".wav", ".mp3"]
            )
            url_input = gr.Textbox(label="...or paste a direct mp4 URL/Loom link")
            submit_btn = gr.Button("Classify Accent")
        with gr.Column():
            label_output = gr.Textbox(label="Top Prediction")
            confidence_output = gr.Textbox(label="Confidence")
            top3_output = gr.Textbox(label="Top 3 Predictions")

    def handle_inputs(file, url):
        # Prefer the URL if both a file and a URL are provided.
        if url:
            return classify_accent(url)
        elif file:
            return classify_accent(file)
        return "No input", "", ""

    submit_btn.click(
        handle_inputs,
        inputs=[input_file, url_input],
        outputs=[label_output, confidence_output, top3_output],
    )

if __name__ == "__main__":
    demo.launch(share=True)