import streamlit as st
import os
import yt_dlp
import subprocess
import librosa
import numpy as np
import torch
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import AutoProcessor, AutoModelForAudioClassification
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import tempfile
import time

# Deployment instructions:
# To deploy this app:
# 1. Make sure Docker is installed
# 2. Build the Docker image: docker build -t accent-detector .
# 3. Run the container: docker run -p 8501:8501 accent-detector
# 4. Access the app at http://localhost:8501
#
# For cloud deployment:
# - Streamlit Cloud: Connect your GitHub repository to Streamlit Cloud
# - Hugging Face Spaces: Use the Docker deployment option
# - Azure/AWS/GCP: Deploy the container using their container services
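#
# A minimal Dockerfile sketch for reference (an assumption for illustration;
# the repository's actual Dockerfile may differ, and the entrypoint filename
# "app.py" is hypothetical). ffmpeg is installed because extract_audio() below
# shells out to it:
#
#   FROM python:3.10-slim
#   RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
#   WORKDIR /app
#   COPY requirements.txt .
#   RUN pip install --no-cache-dir -r requirements.txt
#   COPY . .
#   EXPOSE 8501
#   CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
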
# Load environment variables (if a .env file exists)
try:
    load_dotenv()
except Exception:
    pass

# Check for OpenAI API access - optional for enhanced explanations
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
    have_openai = openai.api_key is not None
except (ImportError, AttributeError):
    have_openai = False

# English accent categories
ENGLISH_ACCENTS = {
    "en-us": "American English",
    "en-gb": "British English",
    "en-au": "Australian English",
    "en-ca": "Canadian English",
    "en-ie": "Irish English",
    "en-scotland": "Scottish English",
    "en-in": "Indian English",
    "en-za": "South African English",
    "en-ng": "Nigerian English",
    "en-caribbean": "Caribbean English",
}
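# Example lookup (illustrative): ENGLISH_ACCENTS.get("en-gb") -> "British English".
# Codes missing from this map fall back to a generic "English (<code>)" label
# in classify_accent() below.
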
def download_video(url, video_path="video.mp4", cookies_file=None):
    """Download a video from a URL using yt-dlp."""
    ydl_opts = {
        "outtmpl": video_path,
        "quiet": False,
        "no_warnings": False,
        "verbose": True,  # More detailed output for debugging
    }
    # Only use cookies if explicitly provided via file upload;
    # don't try to access browser cookies inside the Docker container.
    if cookies_file and os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file
    # Track whether this is a YouTube URL so we can give targeted guidance on failure
    is_youtube = "youtube" in url.lower() or "youtu.be" in url.lower()
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        if os.path.exists(video_path):
            return True
        else:
            st.error(f"Video downloaded but file not found: {video_path}")
            return False
    except Exception as e:
        error_msg = str(e)
        st.error(f"Download error: {error_msg}")
        # Provide specific guidance based on error type
        if is_youtube and ("bot" in error_msg.lower() or "sign in" in error_msg.lower()):
            st.warning("YouTube requires authentication. Please upload a cookies.txt file or try a direct video link.")
        elif "not find" in error_msg.lower() and "cookies" in error_msg.lower():
            st.warning("Browser cookies could not be accessed. Please upload a cookies.txt file.")
        elif "network" in error_msg.lower() or "timeout" in error_msg.lower():
            st.warning("Network error. Please check your internet connection and try again.")
        return False
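# Illustrative usage (commented out; the URL is hypothetical):
#   download_video("https://vimeo.com/123456789", "clip.mp4")  # -> True on success
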
def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
    """Extract audio from a video file using ffmpeg (16 kHz mono 16-bit PCM WAV)."""
    try:
        subprocess.run(
            # -vn: drop video; pcm_s16le: 16-bit PCM; -ar 16000: 16 kHz; -ac 1: mono
            ['ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', audio_path],
            check=True,
            capture_output=True
        )
        return os.path.exists(audio_path)
    except subprocess.CalledProcessError as e:
        st.error(f"Error extracting audio: {e}")
        st.error(f"ffmpeg output: {e.stderr.decode('utf-8')}")
        return False  # Return False so callers can report the failure instead of crashing
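# The subprocess call above is equivalent to this command line:
#   ffmpeg -i video.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav
# i.e. strip the video stream and write 16 kHz mono 16-bit PCM, the format
# fed to the downstream models here.
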
class AccentDetector:
    def __init__(self):
        # Initialize the language identification model
        self.lang_id = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-commonlanguage_ecapa",
            savedir="tmp_model"
        )
        # Initialize the English accent classifier - using VoxLingua107 for now.
        # In production, you'd use a more specialized accent model.
        # If this checkpoint can't be loaded through Transformers, we fall back
        # gracefully to language identification only (have_accent_model = False).
        try:
            self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            self.have_accent_model = True
        except Exception as e:
            st.warning(f"Could not load accent model: {str(e)}")
            self.have_accent_model = False

    def is_english(self, audio_path, threshold=0.7):
        """
        Determine if the speech is English; returns (is_english, language, confidence).
        (The threshold parameter is currently unused.)
        """
        out_prob, score, index, lang = self.lang_id.classify_file(audio_path)
        # classify_file returns the predicted label as a one-element list; unwrap it
        if isinstance(lang, list):
            lang = lang[0]
        score = float(score)
        # Check if language is English (slightly fuzzy match)
        is_english = "eng" in lang.lower() or "en-" in lang.lower() or lang.lower() == "en"
        return is_english, lang, score

    def classify_accent(self, audio_path):
        """
        Classify the specific English accent.
        """
        if not self.have_accent_model:
            return "Unknown English Accent", 0.0
        try:
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Get probabilities
            probs = outputs.logits.softmax(dim=-1)[0]
            prediction_id = probs.argmax().item()
            confidence = probs[prediction_id].item()
            # Get predicted label
            id2label = self.model.config.id2label
            accent_code = id2label[prediction_id]
            # Map to English accent if possible
            if accent_code.startswith('en-'):
                accent = ENGLISH_ACCENTS.get(accent_code, f"English ({accent_code})")
                # Keep confidence as-is for English accents
            else:
                # If it's not an English accent code, use our pre-classification
                is_english, _, _ = self.is_english(audio_path)
                if is_english:
                    accent = "General English"
                else:
                    accent = f"Non-English ({accent_code})"
                confidence *= 0.7  # Reduce confidence for non-specific matches
            return accent, confidence
        except Exception as e:
            st.error(f"Error in accent classification: {str(e)}")
            return "Unknown English Accent", 0.0
    def generate_explanation(self, audio_path, accent, confidence, is_english, language):
        """
        Generate an explanation of the accent detection results using the OpenAI API (if available).
        """
        if not have_openai:
            if is_english:
                return f"The speaker has a {accent} accent with {confidence*100:.1f}% confidence. The speech was identified as English."
            else:
                return f"The speech was identified as {language}, not English. English confidence is low."
        try:
            import openai
            is_english, lang, lang_score = self.is_english(audio_path)
            prompt = f"""
            Audio analysis detected a speaker with the following characteristics:
            - Primary accent/language: {accent}
            - Confidence score: {confidence*100:.1f}%
            - Detected language category: {lang}
            - Is English: {is_english}

            Based on this information, provide a 2-3 sentence summary about the speaker's accent.
            Focus on how clear their English is and any notable accent characteristics.
            This is for hiring purposes to evaluate English speaking abilities.
            """
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an accent analysis specialist providing factual assessments."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error generating explanation: {str(e)}")
            if is_english:
                return f"The speaker has a {accent} accent with {confidence*100:.1f}% confidence. The speech was identified as English."
            else:
                return f"The speech was identified as {language}, not English. English confidence is low."

    def analyze_audio(self, audio_path):
        """
        Complete analysis pipeline returning all needed results.
        """
        # Check if it's English
        is_english, lang, lang_score = self.is_english(audio_path)
        # Classify accent if it's English
        if is_english:
            accent, accent_confidence = self.classify_accent(audio_path)
            english_confidence = lang_score * 100  # Scale to percentage
        else:
            accent = f"Non-English ({lang})"
            accent_confidence = lang_score
            english_confidence = max(0, min(30, lang_score * 50))  # Cap at 30% if non-English
        # Generate explanation
        explanation = self.generate_explanation(audio_path, accent, accent_confidence, is_english, lang)
        # Create visualization of the audio waveform
        try:
            y, sr = librosa.load(audio_path, sr=None)
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(y)
            ax.set_xlabel('Sample')
            ax.set_ylabel('Amplitude')
            ax.set_title('Audio Waveform')
            plt.tight_layout()
            audio_viz = fig
        except Exception as e:
            st.warning(f"Could not generate audio visualization: {str(e)}")
            audio_viz = None
        return {
            "is_english": is_english,
            "accent": accent,
            "accent_confidence": accent_confidence * 100,  # Scale to percentage
            "english_confidence": english_confidence,
            "language_detected": lang,
            "explanation": explanation,
            "audio_viz": audio_viz
        }
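
    # Illustrative shape of the dict returned by analyze_audio (values invented):
    # {"is_english": True, "accent": "British English", "accent_confidence": 87.2,
    #  "english_confidence": 93.5, "language_detected": "English",
    #  "explanation": "...", "audio_viz": <matplotlib Figure or None>}
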
def process_uploaded_audio(uploaded_file):
    """Process an uploaded audio file and return the analysis results."""
    # Preserve the original extension so downstream loaders pick the right decoder
    suffix = os.path.splitext(uploaded_file.name)[1] or '.wav'
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        audio_path = temp_file.name
    detector = AccentDetector()
    results = detector.analyze_audio(audio_path)
    # Clean up
    os.unlink(audio_path)
    return results

# --- Streamlit App ---
st.set_page_config(
    page_title="🎤 English Accent Detector",
    page_icon="🎤",
    layout="wide"
)

st.title("🎤 English Accent Detection Tool")
st.markdown("""
This application analyzes a speaker's English accent from video URLs or audio uploads,
providing detailed insights for hiring evaluation purposes.
""")

# Add container for tips
with st.container():
    st.info("""
    💡 **Tips for best results:**
    - Use **Loom** or **Vimeo** videos (more reliable than YouTube)
    - For YouTube videos, you may need to provide cookies
    - Audio clips of 15-30 seconds work best
    - Clear speech with minimal background noise is ideal
    """)
st.markdown(""" | |
This app analyzes a speaker's English accent from a video or audio source. | |
It provides: | |
- Classification of the accent (British, American, etc.) | |
- Confidence score for English proficiency | |
- Explanation of accent characteristics | |
""") | |
# Create tabs for different input methods
tab1, tab2 = st.tabs(["Video URL", "Upload Audio"])

with tab1:
    st.markdown("### 🎬 Analyze video from URL")
    url = st.text_input("Enter a public video URL",
                        placeholder="https://www.loom.com/..., https://vimeo.com/..., or direct MP4 link")
    # Recommend alternative sources
    st.caption("⚠️ **Note**: YouTube videos often require authentication. For best results, use Loom, Vimeo or direct video links.")

    # Add file uploader for cookies.txt
    cookies_file = None
    uploaded_cookies = st.file_uploader("Upload cookies.txt file for YouTube (if needed)",
                                        type="txt",
                                        help="Only needed for YouTube videos that require authentication")
    if uploaded_cookies is not None:
        # Save the uploaded cookies file to a temporary file
        cookies_file = f"cookies_{int(time.time())}.txt"
        with open(cookies_file, "wb") as f:
            f.write(uploaded_cookies.getbuffer())
        st.success("Cookies file uploaded successfully!")

    with st.expander("Having trouble with YouTube videos?"):
        st.markdown("""
        ### YouTube Authentication Issues

        YouTube's anti-bot measures often block automated video downloads. To solve this:

        #### Option 1: Use Alternative Video Sources (Recommended)
        These typically work without authentication issues:
        - [Loom](https://www.loom.com/) - Great for screen recordings
        - [Vimeo](https://vimeo.com/) - High-quality video hosting
        - [Streamable](https://streamable.com/) - Simple video sharing
        - Any direct MP4 link

        #### Option 2: Upload Cookies for YouTube
        1. Install a browser extension like [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
        2. Log in to YouTube in your browser
        3. Use the extension to export cookies to a .txt file
        4. Upload the cookies.txt file using the uploader above

        #### Option 3: Use Audio Upload Instead
        The 'Upload Audio' tab allows direct analysis of audio files without URL issues.
        """)
    if st.button("Analyze Video"):
        if not url:
            st.warning("Please enter a valid URL")
        else:
            try:
                # Create a placeholder for status updates
                status = st.empty()

                # Generate unique filenames using a timestamp to avoid conflicts
                timestamp = str(int(time.time()))
                video_path = f"video_{timestamp}.mp4"
                audio_path = f"audio_{timestamp}.wav"

                # Download and process the video
                status.text("Downloading video...")
                download_success = download_video(url, video_path, cookies_file)
                if not download_success:
                    st.error("Failed to download video")
                else:
                    status.text("Extracting audio...")
                    extract_success = extract_audio(video_path, audio_path)
                    if not extract_success:
                        st.error("Failed to extract audio")
                    else:
                        status.text("Analyzing accent... (this may take a moment)")
                        detector = AccentDetector()
                        results = detector.analyze_audio(audio_path)

                        # Display results
                        st.success("✅ Analysis Complete!")

                        # Create columns for results
                        col1, col2 = st.columns([2, 1])
                        with col1:
                            st.subheader("Accent Analysis Results")
                            st.markdown(f"**Detected Accent:** {results['accent']}")
                            st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                            st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")

                            # Show explanation in a box
                            st.markdown("### Expert Analysis")
                            st.info(results['explanation'])
                        with col2:
                            if results['audio_viz'] is not None:
                                st.pyplot(results['audio_viz'])
                            # Show audio playback
                            st.audio(audio_path)

                # Clean up files
                try:
                    if os.path.exists(video_path):
                        os.remove(video_path)
                    if os.path.exists(audio_path):
                        os.remove(audio_path)
                    if cookies_file and os.path.exists(cookies_file):
                        os.remove(cookies_file)
                except Exception as e:
                    st.warning(f"Couldn't clean up temporary files: {str(e)}")
            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")

with tab2:
    st.markdown("### 🎵 Upload Audio File")
    st.caption("**Recommended option!** Direct audio upload is more reliable than video URLs.")

    uploaded_file = st.file_uploader("Upload an audio file",
                                     type=["wav", "mp3", "m4a", "ogg", "flac"],
                                     help="Support for WAV, MP3, M4A, OGG and FLAC formats")
    if uploaded_file is not None:
        # Show a preview of the audio
        st.markdown("#### Audio Preview:")
        st.audio(uploaded_file)

        st.markdown("#### Ready for Analysis")
        col1, col2 = st.columns([1, 3])
        with col1:
            analyze_button = st.button("Analyze Audio", type="primary", use_container_width=True)
        with col2:
            st.caption("Tip: 15-30 seconds of clear speech works best for accent detection")

        if analyze_button:
            with st.spinner("Analyzing audio... (this may take 15-30 seconds)"):
                try:
                    results = process_uploaded_audio(uploaded_file)

                    # Display results
                    st.success("✅ Analysis Complete!")

                    # Create columns for results
                    col1, col2 = st.columns([2, 1])
                    with col1:
                        st.subheader("Accent Analysis Results")
                        st.markdown(f"**Detected Accent:** {results['accent']}")
                        st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                        st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")

                        # Show explanation in a box
                        st.markdown("### Expert Analysis")
                        st.info(results['explanation'])
                    with col2:
                        if results['audio_viz'] is not None:
                            st.pyplot(results['audio_viz'])
                except Exception as e:
                    st.error(f"Error during analysis: {str(e)}")

# Add footer with deployment info
st.markdown("---")
st.markdown("Deployed using Streamlit • Built with SpeechBrain and Transformers")

# Add a section explaining how it works
with st.expander("ℹ️ How It Works"):
    st.markdown("""
    This app uses a multi-stage process to analyze a speaker's accent:

    1. **Audio Extraction**: The audio track is extracted from the input video or directly processed from uploaded audio.
    2. **Language Identification**: First, we determine if the speech is English using SpeechBrain's language identification model.
    3. **Accent Classification**: For English speech, we analyze the specific accent using a transformer-based model trained on diverse accent data.
    4. **English Proficiency Score**: A confidence score is calculated based on both language identification and accent clarity.
    5. **Analysis Summary**: An explanation is generated describing accent characteristics relevant for hiring evaluations.
    """)