slav7 committed on
Commit
ad2cddc
·
verified ·
1 Parent(s): 36db4cd

Upload 5 files

Browse files
src/custom_interface.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # custom_interface.py for CommonAccent English Accent Classifier
2
+ # Downloaded from: https://huggingface.co/Jzuluaga/accent-id-commonaccent_xlsr-en-english/blob/main/custom_interface.py
3
+ # This file is required by the SpeechBrain foreign_class interface.
4
+
5
+ import torch
6
+ from speechbrain.pretrained.interfaces import Pretrained
7
+
8
class CustomEncoderWav2vec2Classifier(Pretrained):
    """Accent classifier interface layered on SpeechBrain's ``Pretrained``.

    The class declares which modules and hyperparameters it needs and
    exposes two entry points: ``classify_file`` for a path on disk and
    ``classify_batch`` for an already-loaded signal.

    NOTE(review): ``self.modules``, ``self.resample`` and the shape of the
    value returned by ``self.load_audio`` are all inherited from
    ``Pretrained`` and are not visible here — confirm they exist with these
    names/signatures in the pinned SpeechBrain release.
    """

    # Names this interface reads from the loaded model; presumably validated
    # by the ``Pretrained`` base class — TODO confirm.
    MODULES_NEEDED = ["model", "mean_var_norm", "label_encoder"]
    HPARAMS_NEEDED = ["sample_rate"]

    def classify_file(self, path):
        """Load the audio at ``path`` and classify it.

        Returns whatever ``classify_batch`` returns for the loaded signal.
        """
        # Assumes load_audio yields a (signal, sample_rate) pair — TODO confirm.
        loaded = self.load_audio(path)
        signal, fs = loaded
        return self.classify_batch(signal, fs)

    def classify_batch(self, signal, fs):
        """Classify ``signal`` sampled at ``fs`` Hz.

        Returns:
            A 4-tuple ``(out_prob, score, index, text_lab)``: the raw model
            output, the maximum along dim 1 and its index, and the label
            decoded from that index by the label encoder.
        """
        # Bring the signal to the rate named in the hyperparameters.
        if fs != self.hparams.sample_rate:
            signal = self.resample(signal, fs, self.hparams.sample_rate)

        normalizer = self.modules.mean_var_norm
        # The second argument is a one-element tensor; presumably the
        # relative length of the single utterance — TODO confirm.
        signal = normalizer(signal, torch.tensor([1]))

        embeddings = self.modules.model.encode_batch(signal)
        out_prob = self.modules.model.classify_batch(embeddings)

        # Best-scoring class per row of the model output.
        score, index = torch.max(out_prob, dim=1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab
src/d.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yt_dlp
2
+ from pydub import AudioSegment
3
+ import os
4
+ import librosa
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import torchaudio
8
+
9
def download_and_extract_audio(video_url, output_audio_path="audio.wav"):
    """Download the audio of ``video_url`` and store it as a WAV file.

    yt-dlp fetches the best available audio stream and runs its
    ``FFmpegExtractAudio`` post-processor with ``wav`` as the preferred
    codec. If the resulting file is not a WAV (``mp3``, ``m4a`` or
    ``webm``), it is converted with pydub.

    All intermediate files live in a private temporary directory created
    next to the destination. This keeps stale ``temp_audio.*`` files from a
    previous run, or from a concurrent caller, from being mistaken for this
    download, and guarantees cleanup even when conversion fails.

    Args:
        video_url: URL handed to yt-dlp.
        output_audio_path: Where the WAV file is written. An existing file
            at this path is overwritten.

    Returns:
        ``output_audio_path``.

    Raises:
        FileNotFoundError: If no audio file with a known extension was
            produced, or if the destination directory does not exist.
    """
    import tempfile

    # Scratch space sits beside the destination so the final os.replace()
    # never crosses a filesystem boundary.
    target_dir = os.path.dirname(os.path.abspath(output_audio_path))
    with tempfile.TemporaryDirectory(dir=target_dir, prefix="temp_audio_") as workdir:
        # '%' is the template escape character in yt-dlp output templates,
        # so any literal '%' in the directory name must be doubled.
        template_dir = workdir.replace('%', '%%')
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(template_dir, 'temp_audio.%(ext)s'),
            'quiet': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        for ext in ('wav', 'mp3', 'm4a', 'webm'):
            candidate = os.path.join(workdir, f"temp_audio.{ext}")
            if not os.path.exists(candidate):
                continue
            if ext == 'wav':
                # os.replace overwrites an existing destination on every
                # platform; os.rename raises on Windows in that case.
                os.replace(candidate, output_audio_path)
            else:
                audio = AudioSegment.from_file(candidate)
                audio.export(output_audio_path, format="wav")
            # Leftover scratch files are removed when the directory closes.
            return output_audio_path

    raise FileNotFoundError("Audio extraction failed.")
33
+
34
def debug_audio(audio_path):
    """Plot the waveform stored in ``audio_path`` for visual inspection.

    The file is loaded with librosa using ``sr=None`` and drawn as
    amplitude against time in seconds. Nothing is returned; the figure is
    shown through matplotlib.
    """
    samples, rate = librosa.load(audio_path, sr=None)

    plt.figure(figsize=(10, 2))
    # One time stamp per sample, spanning 0 .. duration in seconds.
    sample_count = len(samples)
    time_axis = np.linspace(0, sample_count / rate, num=sample_count)
    plt.plot(time_axis, samples)

    plt.title('Extracted Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()
42
+
43
def get_accent_classifier():
    """Return the accent classifier, loading it on first use.

    The loaded object is stored as the ``model`` attribute of this function
    itself, so later calls in the same process reuse it instead of calling
    ``foreign_class`` again.

    NOTE(review): ``pymodule_file`` is presumably resolved relative to
    ``source`` rather than the local checkout — confirm which
    ``custom_interface.py`` is actually used.
    """
    from speechbrain.pretrained.interfaces import foreign_class

    try:
        return get_accent_classifier.model
    except AttributeError:
        # First call: nothing cached on the function yet.
        pass

    get_accent_classifier.model = foreign_class(
        source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
    )
    return get_accent_classifier.model
52
+
53
def analyze_accent(audio_path):
    """Classify the accent in the audio file at ``audio_path``.

    Returns:
        A tuple ``(accent, confidence, summary)``: the predicted label, the
        classifier's score as a ``float``, and a one-sentence description
        combining both.

    NOTE(review): whether the score is a probability in [0, 1] or some
    other scale depends on the classifier — confirm before presenting it
    as a percentage.
    """
    classifier = get_accent_classifier()
    # The classifier is given the path of the (wav) file, not loaded samples.
    _out_prob, score, _index, text_lab = classifier.classify_file(audio_path)

    # The label may arrive wrapped in a list; unwrap the first entry.
    if isinstance(text_lab, list):
        accent = text_lab[0]
    else:
        accent = text_lab

    # Indexable scores contribute their first element; scalars are used as is.
    if hasattr(score, '__getitem__'):
        confidence = float(score[0])
    else:
        confidence = float(score)

    summary = f"Detected accent: {accent} with confidence {confidence:.2f}."
    return accent, confidence, summary
61
+
62
if __name__ == "__main__":
    # Command-line entry point: ask for a URL, fetch its audio, classify it.
    video_url = input("Enter public video URL: ")
    audio_path = download_and_extract_audio(video_url)
    # To inspect the extracted waveform, call debug_audio(audio_path) here.
    accent, confidence, summary = analyze_accent(audio_path)

    # NOTE(review): the confidence is printed with a '%' suffix but is not
    # scaled here — confirm the score's range against the classifier.
    report = (
        f"Accent: {accent}",
        f"English Accent Confidence: {confidence}%",
        f"Summary: {summary}",
    )
    for line in report:
        print(line)
src/packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
src/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ speechbrain==0.5.14
3
+ torchaudio
4
+ transformers
5
+ yt-dlp
6
+ pydub
7
+ librosa
8
+ matplotlib
9
+ numpy
src/streamlit_app.py CHANGED
@@ -1,40 +1,36 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ from d import download_and_extract_audio, analyze_accent, get_accent_classifier
4
+
5
# Page header and usage instructions.
st.title("English Accent Classifier")
st.write("""
Upload a public video URL (e.g., YouTube, Loom, or direct MP4 link). The tool will extract the audio, analyze the speaker’s accent, and provide a confidence score.
""")

# Load the classifier once per session, behind a spinner, so the first
# analysis does not appear to hang.
model_ready = "model_loaded" in st.session_state
if not model_ready:
    with st.spinner("Loading models (this may take a while the first time)..."):
        get_accent_classifier()
    st.session_state["model_loaded"] = True
    st.success("Model loaded!")

video_url = st.text_input("Enter public video URL:")

# The button is always rendered; work starts only when it was clicked and a
# URL has been entered.
analyze_clicked = st.button("Analyze Accent")
if analyze_clicked and video_url:
    # Step 1: download and extract the audio; halt this run on failure.
    with st.spinner("Downloading and extracting audio..."):
        try:
            audio_path = download_and_extract_audio(video_url)
        except Exception as e:
            st.error(f"Audio extraction failed: {e}")
            st.stop()
    st.success("Audio extracted successfully!")
    st.audio(audio_path)

    # Step 2: classify the accent; halt this run on failure.
    with st.spinner("Analyzing accent (downloading model if needed)..."):
        try:
            accent, confidence, summary = analyze_accent(audio_path)
        except Exception as e:
            st.error(f"Accent analysis failed: {e}")
            st.stop()

    # NOTE(review): the confidence is shown with a '%' suffix but is not
    # scaled here — confirm the score's range against the classifier.
    st.markdown(f"**Accent:** {accent}")
    st.markdown(f"**English Accent Confidence:** {confidence:.2f}%")
    st.markdown(f"**Summary:** {summary}")