#!/usr/bin/env python3
"""
English Accent Detector - Analyzes a speaker's accent from video URLs
"""
from __future__ import annotations

import argparse
import random
import tempfile
import time
from collections import Counter
from pathlib import Path

import torch
import torchaudio
import gradio as gr
from speechbrain.inference.classifiers import EncoderClassifier
from yt_dlp import YoutubeDL
from huggingface_hub.utils import LocalEntryNotFoundError

# ─────────────── Model setup (with retry) ───────────────
ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa"
LANG_MODEL_ID = "speechbrain/lang-id-voxlingua107-ecapa"
DEVICE = "cpu"  # force CPU; Spaces' free tier has no GPU

def load_with_retry(model_id: str, tries: int = 5, backoff: int = 5):
    """Download model weights, retrying with a linearly increasing backoff."""
    for attempt in range(1, tries + 1):
        try:
            return EncoderClassifier.from_hparams(
                source=model_id,
                run_opts={"device": DEVICE},
            )
        except LocalEntryNotFoundError:
            if attempt == tries:
                raise
            wait = backoff * attempt
            print(f"[{model_id}] download failed (try {attempt}/{tries}), retrying in {wait}s")
            time.sleep(wait)

accent_clf = load_with_retry(ACCENT_MODEL_ID)
lang_clf = load_with_retry(LANG_MODEL_ID)

# ─────────────── Helpers ───────────────
def sec_to_hms(sec: int) -> str:
    h = sec // 3600
    m = (sec % 3600) // 60
    s = sec % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

def download_audio(url: str, out_path: Path) -> Path:
    """
    Download best audio via yt_dlp, always using cookies.txt in the repo root.
    """
    repo_root = Path(__file__).parent
    cookie_path = repo_root / "cookies.txt"
    if not cookie_path.is_file() or cookie_path.stat().st_size == 0:
        raise FileNotFoundError(
            f"No valid cookies.txt found at {cookie_path}. "
            f"Make sure you uploaded your Netscape-format cookie jar."
        )
    opts = {
        "format": "bestaudio/best",
        "outtmpl": str(out_path.with_suffix(".%(ext)s")),
        "cookiefile": str(cookie_path),
        "quiet": True,
    }
    print(f"[download_audio] using cookiefile: {opts['cookiefile']}")
    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
    return Path(filename)
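
# With the template above, yt-dlp writes the file as e.g. audio.m4a or audio.webm
# (whichever "bestaudio" format it picked); prepare_filename() returns that concrete path.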

def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None:
    target_sr = 16000
    # Seek in frames at the source's native sample rate, then resample to 16 kHz.
    orig_sr = torchaudio.info(str(src)).sample_rate
    wav, _ = torchaudio.load(str(src), frame_offset=start * orig_sr, num_frames=dur * orig_sr)
    if orig_sr != target_sr:
        wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav)
    torchaudio.save(str(dst), wav, target_sr, encoding="PCM_S", bits_per_sample=16)
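
# Example: extract_wav(Path("audio.m4a"), Path("clip_0.wav"), start=30) writes an
# 8-second, 16 kHz, 16-bit PCM clip starting at 0:30 of the source audio.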

def pick_random_offsets(total_s: int, n: int) -> list[int]:
    """Pick n distinct clip start times (seconds) that leave room for an 8-second window."""
    max_start = total_s - 8
    pool = list(range(max_start + 1))
    return random.sample(pool, min(n, len(pool)))
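
# Example: pick_random_offsets(60, 4) -> four distinct start times in [0, 52],
# each leaving room for an 8-second clip.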

# ─────────────── Classification ───────────────
def classify_language(wav: Path) -> tuple[str, float]:
    sig = lang_clf.load_audio(str(wav))
    _, log_p, _, label = lang_clf.classify_batch(sig)
    return label[0], float(log_p.exp().item()) * 100

def classify_accent(wav: Path) -> tuple[str, float]:
    sig = accent_clf.load_audio(str(wav))
    _, log_p, _, label = accent_clf.classify_batch(sig)
    return label[0], float(log_p.item()) * 100
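
# Note: classify_batch returns (out_prob, score, index, text_lab). The VoxLingua107
# model's score is a log-probability, hence the .exp() above; classify_accent uses the
# raw score, which assumes the accent model emits linear-scale scores.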

def calculate_english_confidence(lang: str, lang_conf: float, accent_conf: float) -> float:
    if not lang.lower().startswith("en"):
        return 0.0
    english_score = (lang_conf * 0.7) + (accent_conf * 0.3)
    return min(100.0, max(0.0, english_score))
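
# Example: lang_conf=92.0 and accent_conf=80.0 give 0.7 * 92.0 + 0.3 * 80.0 = 88.4.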

# ─────────────── Core pipeline ───────────────
def analyse_accent(url: str, n_samples: int = 4) -> dict:
    if not url:
        return {"error": "Please provide a video URL."}
    n_samples = int(n_samples)  # the Gradio slider may deliver a float
    if n_samples < 1:
        return {"error": "Number of samples must be at least 1."}

    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        try:
            # 1) Download audio
            audio_file = download_audio(url, td / "audio")
            info = torchaudio.info(str(audio_file))
            total_s = int(info.num_frames / info.sample_rate)
            if total_s < 8:
                return {"error": "Audio shorter than 8 seconds."}

            # 2) Language detection on an 8-second clip from the middle of the audio
            mid_start = max(0, total_s // 2 - 4)
            lang_wav = td / "lang_check.wav"
            extract_wav(audio_file, lang_wav, start=mid_start)
            lang, lang_conf = classify_language(lang_wav)

            is_english = lang.lower().startswith("en")
            if not is_english:
                return {
                    "is_english_speaker": False,
                    "detected_language": lang,
                    "language_confidence": round(lang_conf, 1),
                    "accent_classification": "N/A",
                    "english_confidence_score": 0.0,
                    "summary": f"Non-English language detected: {lang} ({lang_conf:.1f}%)",
                }

            # 3) Accent analysis on randomly chosen 8-second clips
            offsets = pick_random_offsets(total_s, n_samples)
            accent_results = []
            for i, start in enumerate(sorted(offsets)):
                clip_wav = td / f"clip_{i}.wav"
                extract_wav(audio_file, clip_wav, start=start)
                acc, conf = classify_accent(clip_wav)
                accent_results.append({
                    "clip": i + 1,
                    "time_range": f"{sec_to_hms(start)} - {sec_to_hms(start + 8)}",
                    "accent": acc,
                    "confidence": round(conf, 1),
                })

            # 4) Aggregate results: majority vote over the sampled clips
            labels = [r["accent"] for r in accent_results]
            most_common_accent, count = Counter(labels).most_common(1)[0]
            confs = [r["confidence"] for r in accent_results if r["accent"] == most_common_accent]
            avg_conf = sum(confs) / len(confs)
            eng_conf = calculate_english_confidence(lang, lang_conf, avg_conf)

            return {
                "is_english_speaker": True,
                "detected_language": "English",
                "language_confidence": round(lang_conf, 1),
                "accent_classification": most_common_accent,
                "accent_confidence": round(avg_conf, 1),
                "english_confidence_score": round(eng_conf, 1),
                "samples_analyzed": len(accent_results),
                "consensus": f"{count}/{len(accent_results)} samples",
                "detailed_results": accent_results,
                "summary": (
                    f"English speaker detected with {most_common_accent} accent "
                    f"(confidence: {eng_conf:.1f}%)"
                ),
            }
        except Exception as e:
            return {"error": f"Processing failed: {e}"}

# ─────────────── Gradio UI ───────────────
def app():
    with gr.Blocks(title="English Accent Detector") as demo:
        gr.Markdown(
            "# 🎙️ English Accent Detector\n"
            "**Analyze a speaker's accent from a public video URL**\n\n"
            "This tool:\n"
            "1. Accepts public video URLs (YouTube, Loom, direct MP4 links)\n"
            "2. Extracts the audio track from the video\n"
            "3. Checks whether the speaker is speaking English\n"
            "4. Classifies the accent and reports confidence scores\n"
        )
        with gr.Row():
            with gr.Column():
                url_input = gr.Text(
                    label="Video URL",
                    placeholder="Enter a public video URL (YouTube, Loom, etc.)",
                    lines=1,
                )
                samples_input = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=4,
                    step=1,
                    label="Number of audio samples to analyze",
                    info="More samples = more accurate but slower",
                )
                analyze_btn = gr.Button("🔍 Analyze Accent", variant="primary")
            with gr.Column():
                result_output = gr.JSON(label="Analysis Results")

        gr.Markdown("### Example URLs to try:")
        gr.Examples(
            examples=[
                ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", 4],
                ["https://www.youtube.com/shorts/VO6n9GTzSqU", 4],
            ],
            inputs=[url_input, samples_input],
            label="Click to load example",
        )

        analyze_btn.click(
            fn=analyse_accent,
            inputs=[url_input, samples_input],
            outputs=result_output,
        )
    return demo

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="English Accent Detector")
    parser.add_argument(
        "--port", type=int, default=7860,
        help="Port to run the server on",
    )
    args = parser.parse_args()

    demo = app()
    # On Hugging Face Spaces, a public URL is provided automatically
    demo.launch(server_port=args.port)
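
# Local usage: python app.py --port 7860, then open http://localhost:7860 in a browser.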