File size: 3,327 Bytes
8b08bcd
e1d88a1
8b08bcd
ad2cddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f13776
b06ca90
 
 
 
 
474695a
 
 
 
 
 
4f13776
ad2cddc
 
 
 
4f13776
 
ad2cddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
# Set Hugging Face and Torch cache directories to /tmp (must be before any other imports)

import yt_dlp
from pydub import AudioSegment
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import torchaudio

def download_and_extract_audio(video_url, output_audio_path="audio.wav"):
    """Download the audio track of a public video and save it as WAV.

    Parameters
    ----------
    video_url : str
        Any URL accepted by yt-dlp.
    output_audio_path : str
        Destination path for the extracted WAV file (overwritten if present).

    Returns
    -------
    str
        ``output_audio_path`` on success.

    Raises
    ------
    FileNotFoundError
        If the download/extraction step produced no audio file.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'quiet': True,
        # Ask yt-dlp's ffmpeg postprocessor to convert the stream to WAV.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    # The postprocessor normally yields temp_audio.wav; fall back to other
    # containers in case the conversion was skipped.
    for ext in ('wav', 'mp3', 'm4a', 'webm'):
        fname = f"temp_audio.{ext}"
        if not os.path.exists(fname):
            continue
        if ext == 'wav':
            # os.replace (unlike os.rename) overwrites an existing
            # destination on every platform, including Windows.
            os.replace(fname, output_audio_path)
        else:
            audio = AudioSegment.from_file(fname)
            audio.export(output_audio_path, format="wav")
            os.remove(fname)
        return output_audio_path
    raise FileNotFoundError("Audio extraction failed.")

def debug_audio(audio_path):
    """Plot the waveform of *audio_path* for a quick visual sanity check."""
    samples, sample_rate = librosa.load(audio_path, sr=None)
    duration = len(samples) / sample_rate
    time_axis = np.linspace(0, duration, num=len(samples))
    plt.figure(figsize=(10, 2))
    plt.plot(time_axis, samples)
    plt.title('Extracted Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

def get_accent_classifier():
    """Lazily load and cache the SpeechBrain accent-identification model.

    The model is loaded once and memoized as an attribute on the function
    object, so repeated calls return the same instance.

    Returns
    -------
    The loaded ``CustomEncoderWav2vec2Classifier`` instance.
    """
    import os
    import shutil

    # Cache directories must be set BEFORE speechbrain / transformers /
    # huggingface_hub are imported: they read these variables at import
    # time, so setting them after the import (as the original code did)
    # has no effect and the permission errors come back.
    os.environ["HF_HOME"] = "/tmp/huggingface"
    os.environ["HF_HUB_CACHE"] = "/tmp/huggingface/hub"
    os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
    os.environ["TORCH_HOME"] = "/tmp/torch"
    os.environ["XDG_CACHE_HOME"] = "/tmp/xdg_cache"

    from speechbrain.pretrained.interfaces import foreign_class

    # Remove any local folders that may cause permission errors;
    # ignore_errors also covers the folder-not-present case, replacing
    # the explicit os.path.exists check.
    for folder in ("pretrained_models", "wav2vec2_checkpoints"):
        shutil.rmtree(folder, ignore_errors=True)

    cache_dir = "/tmp/pretrained_models"
    if not hasattr(get_accent_classifier, "model"):
        get_accent_classifier.model = foreign_class(
            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
            savedir=cache_dir
        )
    return get_accent_classifier.model

def analyze_accent(audio_path):
    """Classify the English accent of the speech in *audio_path*.

    Returns a ``(accent, confidence, summary)`` tuple: the predicted
    label, the classifier score as a float, and a one-line human-readable
    summary of both.
    """
    classifier = get_accent_classifier()
    # The classifier expects a path to a wav file.
    out_prob, score, index, text_lab = classifier.classify_file(audio_path)
    if isinstance(text_lab, list):
        accent = text_lab[0]
    else:
        accent = text_lab
    if hasattr(score, '__getitem__'):
        confidence = float(score[0])
    else:
        confidence = float(score)
    summary = f"Detected accent: {accent} with confidence {confidence:.2f}."
    return accent, confidence, summary

if __name__ == "__main__":
    video_url = input("Enter public video URL: ")
    audio_path = download_and_extract_audio(video_url)
    # debug_audio(audio_path)  # Uncomment to listen and plot
    accent, confidence, summary = analyze_accent(audio_path)
    print(f"Accent: {accent}")
    # `confidence` is the raw classifier score (presumably a 0-1
    # probability -- see analyze_accent, which prints it as a plain
    # number); scale it so the '%' suffix is no longer misleading.
    print(f"English Accent Confidence: {confidence * 100:.1f}%")
    print(f"Summary: {summary}")