"""Detect a speaker's accent from a public video URL using SpeechBrain."""
import os
# NOTE: Hugging Face / Torch cache directories are configured inside
# get_accent_classifier(); to take full effect they should be exported
# before transformers/torch are first imported.
import yt_dlp
from pydub import AudioSegment
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import torchaudio
def download_and_extract_audio(video_url, output_audio_path="audio.wav"):
    """Download a video's audio track with yt-dlp and save it as a WAV file.

    Parameters
    ----------
    video_url : str
        Public URL of the video to download.
    output_audio_path : str
        Destination path for the extracted WAV file (default "audio.wav").

    Returns
    -------
    str
        Path to the extracted WAV file (same as ``output_audio_path``).

    Raises
    ------
    FileNotFoundError
        If no ``temp_audio.*`` file was produced by the download.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'quiet': True,
        # Ask yt-dlp/ffmpeg to transcode straight to WAV; the fallback loop
        # below still handles the case where the postprocessor was skipped.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    for ext in ['wav', 'mp3', 'm4a', 'webm']:
        fname = f"temp_audio.{ext}"
        if not os.path.exists(fname):
            continue
        if ext == 'wav':
            # os.replace (unlike os.rename) overwrites an existing
            # destination on all platforms, including Windows.
            os.replace(fname, output_audio_path)
        else:
            # Postprocessor didn't run (e.g. ffmpeg missing): convert here.
            audio = AudioSegment.from_file(fname)
            audio.export(output_audio_path, format="wav")
            os.remove(fname)
        return output_audio_path
    raise FileNotFoundError("Audio extraction failed.")
def debug_audio(audio_path):
    """Load *audio_path* and plot its waveform for a quick sanity check."""
    samples, sample_rate = librosa.load(audio_path, sr=None)
    duration = len(samples) / sample_rate
    time_axis = np.linspace(0, duration, num=len(samples))
    plt.figure(figsize=(10, 2))
    plt.plot(time_axis, samples)
    plt.title('Extracted Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()
def get_accent_classifier():
    """Return a cached SpeechBrain accent-ID classifier, loading it on first call.

    The model is memoized on the function object, so repeated calls reuse the
    same instance. All one-time setup (folder cleanup, cache redirection,
    model download) now runs only on the cold path — the original ran the
    rmtree and env-var mutation on every call, even after the model was cached.

    Returns
    -------
    The loaded ``CustomEncoderWav2vec2Classifier`` instance.
    """
    if not hasattr(get_accent_classifier, "model"):
        from speechbrain.pretrained.interfaces import foreign_class
        import shutil

        # Remove stale local checkpoint folders that may cause permission errors.
        for folder in ("pretrained_models", "wav2vec2_checkpoints"):
            if os.path.exists(folder):
                shutil.rmtree(folder)

        # Point every relevant cache at /tmp to avoid permission errors.
        # NOTE(review): ideally these are exported before transformers/torch
        # are first imported; late assignment still covers the hub download.
        os.environ["HF_HOME"] = "/tmp/huggingface"
        os.environ["HF_HUB_CACHE"] = "/tmp/huggingface/hub"
        os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
        os.environ["TORCH_HOME"] = "/tmp/torch"
        os.environ["XDG_CACHE_HOME"] = "/tmp/xdg_cache"

        get_accent_classifier.model = foreign_class(
            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
            savedir="/tmp/pretrained_models",
        )
    return get_accent_classifier.model
def analyze_accent(audio_path):
    """Classify the accent in a WAV file.

    Returns a ``(accent, confidence, summary)`` tuple where *accent* is the
    predicted label, *confidence* its score as a float, and *summary* a
    human-readable sentence.
    """
    classifier = get_accent_classifier()
    # classify_file expects a path to a wav file.
    out_prob, score, index, text_lab = classifier.classify_file(audio_path)
    if isinstance(text_lab, list):
        accent = text_lab[0]
    else:
        accent = text_lab
    # score may be a tensor/sequence (take the first entry) or a bare scalar.
    if hasattr(score, '__getitem__'):
        confidence = float(score[0])
    else:
        confidence = float(score)
    summary = f"Detected accent: {accent} with confidence {confidence:.2f}."
    return accent, confidence, summary
if __name__ == "__main__":
video_url = input("Enter public video URL: ")
audio_path = download_and_extract_audio(video_url)
# debug_audio(audio_path) # Uncomment to listen and plot
accent, confidence, summary = analyze_accent(audio_path)
print(f"Accent: {accent}")
print(f"English Accent Confidence: {confidence}%")
print(f"Summary: {summary}") |