Upload 5 files
Browse files- src/custom_interface.py +24 -0
- src/d.py +69 -0
- src/packages.txt +1 -0
- src/requirements.txt +9 -0
- src/streamlit_app.py +35 -39
src/custom_interface.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# custom_interface.py for CommonAccent English Accent Classifier
|
2 |
+
# Downloaded from: https://huggingface.co/Jzuluaga/accent-id-commonaccent_xlsr-en-english/blob/main/custom_interface.py
|
3 |
+
# This file is required by the SpeechBrain foreign_class interface.
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from speechbrain.pretrained.interfaces import Pretrained
|
7 |
+
|
8 |
+
class CustomEncoderWav2vec2Classifier(Pretrained):
    """Accent classifier interface loaded through SpeechBrain's ``foreign_class``.

    The class chains three loaded components: ``modules.mean_var_norm`` for
    input normalisation, ``modules.model`` for embedding extraction and
    classification, and ``hparams.label_encoder`` for turning the winning
    class index into a text label.
    """

    MODULES_NEEDED = ["model", "mean_var_norm", "label_encoder"]
    HPARAMS_NEEDED = ["sample_rate"]

    def classify_file(self, path):
        """Classify the audio file at ``path``.

        Args:
            path: Location of the audio file handed to ``self.load_audio``.

        Returns:
            The ``(out_prob, score, index, text_lab)`` tuple produced by
            ``classify_batch``.
        """
        loaded = self.load_audio(path)
        if isinstance(loaded, tuple):
            # A loader that reports its own sampling rate: use it unchanged.
            signal, fs = loaded
        else:
            # SpeechBrain's ``Pretrained.load_audio`` returns only the
            # waveform; unpacking it into two names raised at runtime.
            # NOTE(review): assumes the base loader has already normalised
            # the audio to ``hparams.sample_rate`` - confirm against the
            # pinned SpeechBrain version.
            signal, fs = loaded, self.hparams.sample_rate
            if signal.dim() == 1:
                # Add a leading batch axis to match the single-entry length
                # tensor passed to the normaliser in ``classify_batch``.
                signal = signal.unsqueeze(0)
        return self.classify_batch(signal, fs)

    def classify_batch(self, signal, fs):
        """Classify a waveform tensor sampled at ``fs``.

        Args:
            signal: Waveform tensor.
                NOTE(review): presumably shaped (batch, time) - confirm.
            fs: Sampling rate of ``signal``.

        Returns:
            ``(out_prob, score, index, text_lab)`` where ``score`` and
            ``index`` are the maximum of ``out_prob`` along dimension 1 and
            its position, and ``text_lab`` is the decoded label for ``index``.
        """
        if fs != self.hparams.sample_rate:
            # NOTE(review): ``resample`` is not defined in this class;
            # confirm the base class provides it.
            signal = self.resample(signal, fs, self.hparams.sample_rate)
        signal = self.modules.mean_var_norm(signal, torch.tensor([1]))
        embeddings = self.modules.model.encode_batch(signal)
        out_prob = self.modules.model.classify_batch(embeddings)
        score, index = torch.max(out_prob, dim=1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab
|
src/d.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import yt_dlp
|
2 |
+
from pydub import AudioSegment
|
3 |
+
import os
|
4 |
+
import librosa
|
5 |
+
import numpy as np
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import torchaudio
|
8 |
+
|
9 |
+
def download_and_extract_audio(video_url, output_audio_path="audio.wav"):
    """Download the audio of ``video_url`` and store it as a WAV file.

    yt-dlp fetches the best available audio stream and its
    ``FFmpegExtractAudio`` post-processor is asked for WAV output. If a
    different container is left behind instead, it is converted to WAV
    with pydub.

    Args:
        video_url: URL of the video to download.
        output_audio_path: Destination path of the resulting WAV file.

    Returns:
        ``output_audio_path``.

    Raises:
        FileNotFoundError: If no downloaded audio file with one of the
            expected extensions is found.
    """
    # Function-scope imports keep this block self-contained.
    import shutil
    import tempfile

    # Each call works in its own scratch directory. A fixed
    # ``temp_audio.*`` name in the working directory lets concurrent calls
    # overwrite each other and lets a stale file from an earlier run be
    # mistaken for the new download.
    with tempfile.TemporaryDirectory() as workdir:
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(workdir, 'temp_audio.%(ext)s'),
            'quiet': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        for ext in ('wav', 'mp3', 'm4a', 'webm'):
            fname = os.path.join(workdir, f"temp_audio.{ext}")
            if not os.path.exists(fname):
                continue
            if ext == 'wav':
                # shutil.move works across filesystems and, unlike
                # os.rename on Windows, does not fail when the
                # destination file already exists.
                shutil.move(fname, output_audio_path)
            else:
                audio = AudioSegment.from_file(fname)
                audio.export(output_audio_path, format="wav")
            # Leftover intermediates are removed with the scratch directory.
            return output_audio_path

    raise FileNotFoundError("Audio extraction failed.")
|
33 |
+
|
34 |
+
def debug_audio(audio_path):
    """Plot the waveform of the audio file at ``audio_path``.

    The file is loaded with librosa using ``sr=None`` and shown in a
    matplotlib figure with time in seconds on the x axis.

    Args:
        audio_path: Path of the audio file to plot.
    """
    y, sr = librosa.load(audio_path, sr=None)
    # Sample i is at i / sr seconds. Without endpoint=False the last
    # sample would be placed at len(y) / sr, stretching the whole axis.
    times = np.linspace(0, len(y) / sr, num=len(y), endpoint=False)
    plt.figure(figsize=(10, 2))
    plt.plot(times, y)
    plt.title('Extracted Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()
|
42 |
+
|
43 |
+
def get_accent_classifier():
    """Return the accent classifier, building it on the first call.

    The instance is stored as the ``model`` attribute of this function,
    so later calls return the same object instead of loading it again.
    """
    from speechbrain.pretrained.interfaces import foreign_class

    try:
        # Already built by an earlier call.
        return get_accent_classifier.model
    except AttributeError:
        pass

    load_kwargs = {
        "source": "Jzuluaga/accent-id-commonaccent_xlsr-en-english",
        "pymodule_file": "custom_interface.py",
        "classname": "CustomEncoderWav2vec2Classifier",
    }
    get_accent_classifier.model = foreign_class(**load_kwargs)
    return get_accent_classifier.model
|
52 |
+
|
53 |
+
def analyze_accent(audio_path):
    """Classify the accent heard in the WAV file at ``audio_path``.

    Args:
        audio_path: Path of the WAV file passed to the classifier.

    Returns:
        A tuple ``(accent, confidence, summary)``: the predicted label,
        the classifier score as a float, and a one-sentence description
        built from the two.
    """
    classifier = get_accent_classifier()
    # classify_file is given a path to a wav file.
    _out_prob, score, _index, text_lab = classifier.classify_file(audio_path)

    # The label may arrive wrapped in a list; take its first entry then.
    if isinstance(text_lab, list):
        accent = text_lab[0]
    else:
        accent = text_lab

    # Indexable scores contribute their first element; plain numbers are
    # converted directly.
    if hasattr(score, '__getitem__'):
        confidence = float(score[0])
    else:
        confidence = float(score)

    summary = f"Detected accent: {accent} with confidence {confidence:.2f}."
    return accent, confidence, summary
|
61 |
+
|
62 |
+
if __name__ == "__main__":
    # Command-line entry point: ask for a URL, extract its audio, classify
    # the accent and print the result.
    video_url = input("Enter public video URL: ")
    audio_path = download_and_extract_audio(video_url)
    # debug_audio(audio_path)  # Uncomment to plot the extracted waveform
    accent, confidence, summary = analyze_accent(audio_path)
    print(f"Accent: {accent}")
    # The percent format scales by 100, so a score of 0.87 prints as
    # "87.00%" instead of "0.87%".
    # NOTE(review): assumes the score is a probability in [0, 1] - confirm.
    print(f"English Accent Confidence: {confidence:.2%}")
    print(f"Summary: {summary}")
|
src/packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
src/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
speechbrain==0.5.14
|
3 |
+
torchaudio
|
4 |
+
transformers
|
5 |
+
yt-dlp
|
6 |
+
pydub
|
7 |
+
librosa
|
8 |
+
matplotlib
|
9 |
+
numpy
|
src/streamlit_app.py
CHANGED
@@ -1,40 +1,36 @@
|
|
1 |
-
import altair as alt
|
2 |
-
import numpy as np
|
3 |
-
import pandas as pd
|
4 |
import streamlit as st
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
""
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
"
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
))
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
+
from d import download_and_extract_audio, analyze_accent, get_accent_classifier
|
4 |
+
|
5 |
+
st.title("English Accent Classifier")
st.write("""
Upload a public video URL (e.g., YouTube, Loom, or direct MP4 link). The tool will extract the audio, analyze the speaker’s accent, and provide a confidence score.
""")

# Load the model once per session; the flag in session_state keeps
# reruns of this script from showing the loading spinner again.
if "model_loaded" not in st.session_state:
    with st.spinner("Loading models (this may take a while the first time)..."):
        get_accent_classifier()
    st.session_state["model_loaded"] = True
    st.success("Model loaded!")

video_url = st.text_input("Enter public video URL:")

if st.button("Analyze Accent"):
    # Tell the user why nothing happens instead of silently ignoring a
    # click made with an empty URL field.
    if not video_url.strip():
        st.warning("Please enter a video URL first.")
        st.stop()

    with st.spinner("Downloading and extracting audio..."):
        try:
            audio_path = download_and_extract_audio(video_url)
        except Exception as e:
            # Top-level UI boundary: show the failure and end this run.
            st.error(f"Audio extraction failed: {e}")
            st.stop()
    st.success("Audio extracted successfully!")
    st.audio(audio_path)

    with st.spinner("Analyzing accent (downloading model if needed)..."):
        try:
            accent, confidence, summary = analyze_accent(audio_path)
        except Exception as e:
            st.error(f"Accent analysis failed: {e}")
            st.stop()

    st.markdown(f"**Accent:** {accent}")
    # The percent format scales by 100, so a score of 0.87 is shown as
    # "87.00%" instead of "0.87%".
    # NOTE(review): assumes the score is a probability in [0, 1] - confirm.
    st.markdown(f"**English Accent Confidence:** {confidence:.2%}")
    st.markdown(f"**Summary:** {summary}")
|
|