File size: 20,538 Bytes
3b528f9
b15f0c7
 
635694f
 
 
 
6ba5ea6
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15f0c7
094ee23
635694f
094ee23
 
a9b23f3
 
 
094ee23
 
a9b23f3
 
094ee23
 
 
 
a9b23f3
 
 
094ee23
 
a9b23f3
 
 
 
 
 
094ee23
a9b23f3
 
 
 
 
 
 
 
 
 
 
094ee23
b15f0c7
 
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ba5ea6
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
b15f0c7
635694f
 
 
 
6ba5ea6
 
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15f0c7
 
635694f
 
 
 
 
b15f0c7
635694f
 
a9b23f3
 
 
 
 
 
 
 
 
 
 
 
 
 
635694f
 
 
 
 
 
 
 
 
b15f0c7
635694f
a9b23f3
 
 
 
 
 
635694f
094ee23
 
a9b23f3
 
 
094ee23
 
 
 
 
 
a9b23f3
094ee23
 
 
a9b23f3
 
 
094ee23
a9b23f3
 
 
 
 
 
094ee23
a9b23f3
 
 
 
 
 
 
 
094ee23
 
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
094ee23
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
094ee23
635694f
 
 
 
 
094ee23
 
635694f
 
 
 
 
b15f0c7
635694f
a9b23f3
 
 
 
 
 
635694f
 
a9b23f3
 
635694f
 
a9b23f3
 
 
 
 
 
 
 
 
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15f0c7
635694f
 
 
b15f0c7
635694f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
import streamlit as st
import os
import yt_dlp
import subprocess
import librosa
import numpy as np
import torch
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import AutoProcessor, AutoModelForAudioClassification
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import tempfile
import time

# Comment for deployment instructions:
# To deploy this app:
# 1. Make sure Docker is installed
# 2. Build the Docker image: docker build -t accent-detector .
# 3. Run the container: docker run -p 8501:8501 accent-detector
# 4. Access the app at http://localhost:8501
# 
# For cloud deployment:
# - Streamlit Cloud: Connect your GitHub repository to Streamlit Cloud
# - Hugging Face Spaces: Use the Docker deployment option
# - Azure/AWS/GCP: Deploy the container using their container services

# Load environment variables from a .env file when one is present.
# Loading is best-effort: a problem reading .env must not stop the app from
# starting. Only ordinary errors are ignored; KeyboardInterrupt and SystemExit
# are deliberately allowed to propagate (a bare ``except:`` would hide them).
try:
    load_dotenv()
except Exception:
    pass

# Optional OpenAI integration, used only to word the explanation text.
# ``have_openai`` is True only when the package imports AND a non-blank
# OPENAI_API_KEY is configured; every other case falls back to canned text.
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
    # An empty or whitespace-only key is as unusable as a missing one.
    have_openai = bool(openai.api_key and openai.api_key.strip())
except (ImportError, AttributeError):
    have_openai = False

# Accent code -> display name. Every display name is "<variety> English", so
# only the variety is spelled out and the suffix is appended once below.
ENGLISH_ACCENTS = {
    code: f"{variety} English"
    for code, variety in (
        ("en-us", "American"),
        ("en-gb", "British"),
        ("en-au", "Australian"),
        ("en-ca", "Canadian"),
        ("en-ie", "Irish"),
        ("en-scotland", "Scottish"),
        ("en-in", "Indian"),
        ("en-za", "South African"),
        ("en-ng", "Nigerian"),
        ("en-caribbean", "Caribbean"),
    )
}

def download_video(url, video_path="video.mp4", cookies_file=None):
    """Download the video at ``url`` to ``video_path`` using yt-dlp.

    Args:
        url: Address of the video to fetch.
        video_path: Output template handed to yt-dlp; also the path checked
            afterwards to confirm the download produced a file.
        cookies_file: Optional path to a cookies.txt file. It is passed to
            yt-dlp only when the path actually exists.

    Returns:
        True when ``video_path`` exists after the download, False otherwise.
        Failures are reported through ``st.error`` / ``st.warning`` rather
        than raised.
    """
    ydl_opts = {
        "outtmpl": video_path,
        "quiet": False,
        "no_warnings": False,
        "verbose": True  # More detailed output for debugging
    }

    # Only use cookies if explicitly provided via file upload
    # Don't try to access browser cookies in Docker container
    if cookies_file and os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file

    # Worked out before the try block so the error handler below can always
    # read it, even when the failure is caused by a malformed ``url`` value.
    url_lower = str(url).lower()
    is_youtube = "youtube" in url_lower or "youtu.be" in url_lower

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        if os.path.exists(video_path):
            return True

        st.error(f"Video downloaded but file not found: {video_path}")
        return False
    except Exception as e:
        error_msg = str(e)
        st.error(f"Download error: {error_msg}")

        # Provide specific guidance based on error type
        error_lower = error_msg.lower()
        if is_youtube and ("bot" in error_lower or "sign in" in error_lower):
            st.warning("YouTube requires authentication. Please upload a cookies.txt file or try a direct video link.")
        elif "not find" in error_lower and "cookies" in error_lower:
            st.warning("Browser cookies could not be accessed. Please upload a cookies.txt file.")
        elif "network" in error_lower or "timeout" in error_lower:
            st.warning("Network error. Please check your internet connection and try again.")

        return False

def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
    """Extract the audio track of ``video_path`` into ``audio_path`` with ffmpeg.

    The output is written as 16-bit PCM, 16 kHz, mono (see the ffmpeg
    arguments below).

    Args:
        video_path: Input media file.
        audio_path: Destination audio file; an existing file is overwritten.

    Returns:
        True when ``audio_path`` exists after ffmpeg finishes.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
            The error is shown in the UI first and then re-raised.
    """
    command = [
        'ffmpeg',
        '-y',  # overwrite a stale output file instead of refusing/prompting
        '-i', video_path,
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        audio_path,
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
        return os.path.exists(audio_path)
    except subprocess.CalledProcessError as e:
        st.error(f"Error extracting audio: {e}")
        # ffmpeg may echo file names/metadata that are not valid UTF-8; decode
        # leniently so reporting the failure cannot itself raise.
        stderr_text = (e.stderr or b"").decode('utf-8', errors='replace')
        st.error(f"ffmpeg output: {stderr_text}")
        raise

class AccentDetector:
    """Two-stage speech analysis: language identification, then accent labelling.

    Stage one uses a SpeechBrain ``EncoderClassifier`` (``self.lang_id``) to
    decide whether the speech is English. Stage two uses a Transformers audio
    classification model, when it could be loaded, to label the accent.
    """

    # Result returned whenever accent classification is unavailable or fails.
    _UNKNOWN_ACCENT = ("Unknown English Accent", 0.0)

    def __init__(self):
        """Load the language-ID model and attempt to load the accent model.

        A failure to load the accent model is reported with ``st.warning`` and
        recorded in ``self.have_accent_model`` instead of being raised.
        """
        # Initialize the language identification model
        self.lang_id = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-commonlanguage_ecapa",
            savedir="tmp_model"
        )

        # Initialize the English accent classifier - using VoxLingua107 for now
        # In production, you'd use a more specialized accent model
        try:
            self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            self.have_accent_model = True
        except Exception as e:
            st.warning(f"Could not load accent model: {str(e)}")
            self.have_accent_model = False

    def is_english(self, audio_path, threshold=0.7):
        """Decide whether the speech in ``audio_path`` is English.

        Args:
            audio_path: Path of the audio file to classify.
            threshold: Accepted for interface compatibility; it is not
                applied to the decision.

        Returns:
            Tuple ``(is_english, lang, score)``: the boolean decision, the
            predicted label as a plain string, and the classifier's top score
            as a float.
        """
        _, score, _, label = self.lang_id.classify_file(audio_path)
        score = float(score)
        # NOTE(review): ``score`` is passed through unchanged; confirm it is on
        # a 0-1 scale before presenting it as a probability.

        # SpeechBrain hands back the decoded label as a list (one entry per
        # item in the batch); unwrap it so the string operations below work.
        if isinstance(label, (list, tuple)):
            label = label[0] if label else ""
        lang = str(label)

        # Match English labels ("English", "en", "en-us", "en: English", "eng")
        # without also matching names that merely contain "eng" (e.g. "Bengali").
        lowered = lang.strip().lower()
        is_english = (
            lowered == "en"
            or lowered.startswith(("en-", "en_", "en:", "eng"))
            or "english" in lowered
        )

        return is_english, lang, score

    def classify_accent(self, audio_path):
        """Classify the specific English accent heard in ``audio_path``.

        Returns:
            Tuple ``(accent, confidence)``. When the accent model is missing
            or any step fails, ``("Unknown English Accent", 0.0)`` is returned.
            Predictions whose label is not an ``en-`` code have their
            confidence scaled by 0.7.
        """
        if not self.have_accent_model:
            return self._UNKNOWN_ACCENT

        try:
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")

            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Get probabilities
            probs = outputs.logits.softmax(dim=-1)[0]
            prediction_id = probs.argmax().item()
            confidence = probs[prediction_id].item()

            # Get predicted label
            accent_code = self.model.config.id2label[prediction_id]

            # Map to English accent if possible
            if accent_code.startswith('en-'):
                # Confidence is kept as-is for recognised English accent codes.
                accent = ENGLISH_ACCENTS.get(accent_code, f"English ({accent_code})")
            else:
                # If it's not an English accent code, use our pre-classification
                is_english, _, _ = self.is_english(audio_path)
                if is_english:
                    accent = "General English"
                else:
                    accent = f"Non-English ({accent_code})"
                confidence *= 0.7  # Reduce confidence for non-specific matches

            return accent, confidence
        except Exception as e:
            st.error(f"Error in accent classification: {str(e)}")
            return self._UNKNOWN_ACCENT

    @staticmethod
    def _fallback_explanation(accent, confidence, is_english, language):
        """Build the canned explanation used when OpenAI is unavailable or fails."""
        if is_english:
            return f"The speaker has a {accent} accent with {confidence*100:.1f}% confidence. The speech was identified as English."
        return f"The speech was identified as {language}, not English. English confidence is low."

    def generate_explanation(self, audio_path, accent, confidence, is_english, language):
        """Describe the detection result in prose.

        Uses the OpenAI chat API when ``have_openai`` is true; otherwise, or
        when the request fails, returns a canned sentence.

        Args:
            audio_path: Kept for interface compatibility. The audio is not
                re-analysed here; the caller's results are used directly.
            accent: Accent label to report.
            confidence: Accent confidence as a fraction (formatted as a
                percentage in the text).
            is_english: Whether the speech was identified as English.
            language: Label produced by language identification.

        Returns:
            The explanation text.
        """
        if not have_openai:
            return self._fallback_explanation(accent, confidence, is_english, language)

        try:
            import openai

            # The prompt is built from the values the caller already computed,
            # so the language model is not run a second time on the same file.
            prompt = f"""
            Audio analysis detected a speaker with the following characteristics:
            - Primary accent/language: {accent}
            - Confidence score: {confidence*100:.1f}%
            - Detected language category: {language}
            - Is English: {is_english}
            
            Based on this information, provide a 2-3 sentence summary about the speaker's accent.
            Focus on how clear their English is and any notable accent characteristics.
            This is for hiring purposes to evaluate English speaking abilities.
            """

            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an accent analysis specialist providing factual assessments."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150
            )

            return response.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error generating explanation: {str(e)}")
            return self._fallback_explanation(accent, confidence, is_english, language)

    def analyze_audio(self, audio_path):
        """Run the complete analysis pipeline on ``audio_path``.

        Returns:
            Dict with keys ``is_english``, ``accent``, ``accent_confidence``
            (scaled by 100), ``english_confidence``, ``language_detected``,
            ``explanation`` and ``audio_viz`` (a matplotlib figure, or None
            when the waveform could not be drawn).
        """
        # Check if it's English
        is_english, lang, lang_score = self.is_english(audio_path)

        # Classify accent if it's English
        if is_english:
            accent, accent_confidence = self.classify_accent(audio_path)
            english_confidence = lang_score * 100  # Scale to percentage
        else:
            accent = f"Non-English ({lang})"
            accent_confidence = lang_score
            english_confidence = max(0, min(30, lang_score * 50))  # Cap at 30% if non-English

        # Generate explanation
        explanation = self.generate_explanation(audio_path, accent, accent_confidence, is_english, lang)

        # Create visualization of the audio waveform
        try:
            y, sr = librosa.load(audio_path, sr=None)
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(y)
            ax.set_xlabel('Sample')
            ax.set_ylabel('Amplitude')
            ax.set_title('Audio Waveform')
            plt.tight_layout()
            audio_viz = fig
        except Exception as e:
            st.warning(f"Could not generate audio visualization: {str(e)}")
            audio_viz = None

        return {
            "is_english": is_english,
            "accent": accent,
            "accent_confidence": accent_confidence * 100,  # Scale to percentage
            "english_confidence": english_confidence,
            "language_detected": lang,
            "explanation": explanation,
            "audio_viz": audio_viz
        }

def process_uploaded_audio(uploaded_file):
    """Analyse an uploaded audio file and return the detector's result dict.

    The upload is copied to a temporary file, analysed with a fresh
    ``AccentDetector`` and the temporary file is removed afterwards, whether
    or not the analysis succeeded.

    Args:
        uploaded_file: Upload object exposing ``getvalue()`` and, optionally,
            a ``name`` attribute holding the original file name.

    Returns:
        The dict produced by ``AccentDetector.analyze_audio``.
    """
    # Keep the upload's own extension (falling back to .wav) so the temporary
    # file is not mislabelled as WAV when it is really mp3/m4a/ogg/flac.
    original_name = getattr(uploaded_file, "name", "") or ""
    suffix = os.path.splitext(original_name)[1] or '.wav'

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        audio_path = temp_file.name

    try:
        detector = AccentDetector()
        return detector.analyze_audio(audio_path)
    finally:
        # Clean up even when model loading or analysis raises
        os.unlink(audio_path)

# --- Streamlit App ---
# Page chrome and introduction. The emoji below were stored as mis-decoded
# UTF-8 ("mojibake") and are restored to the intended characters.
st.set_page_config(
    page_title="🎤 English Accent Detector",
    page_icon="🎤",
    layout="wide"
)

st.title("🎤 English Accent Detection Tool")
st.markdown("""
This application analyzes a speaker's English accent from video URLs or audio uploads, 
providing detailed insights for hiring evaluation purposes.
""")

# Add container for tips
with st.container():
    st.info("""
    💡 **Tips for best results:**
    - Use **Loom** or **Vimeo** videos (more reliable than YouTube)
    - For YouTube videos, you may need to provide cookies
    - Audio clips of 15-30 seconds work best
    - Clear speech with minimal background noise is ideal
    """)
st.markdown("""
This app analyzes a speaker's English accent from a video or audio source.
It provides:
- Classification of the accent (British, American, etc.)
- Confidence score for English proficiency
- Explanation of accent characteristics
""")

# Create tabs for different input methods
tab1, tab2 = st.tabs(["Video URL", "Upload Audio"])

# --- Tab 1: download a video, extract its audio track, analyse it ---
with tab1:
    st.markdown("### 🎬 Analyze video from URL")
    url = st.text_input("Enter a public video URL", 
                       placeholder="https://www.loom.com/..., https://vimeo.com/..., or direct MP4 link")
    
    # Recommend alternative sources
    st.caption("⚠️ **Note**: YouTube videos often require authentication. For best results, use Loom, Vimeo or direct video links.")
    
    # Add file uploader for cookies.txt
    cookies_file = None
    uploaded_cookies = st.file_uploader("Upload cookies.txt file for YouTube (if needed)", 
                                      type="txt", 
                                      help="Only needed for YouTube videos that require authentication")
    
    if uploaded_cookies is not None:
        # The cookies are written to disk only when an analysis actually runs
        # (see below); writing them here would leave a new file holding
        # credentials behind on every script rerun.
        st.success("Cookies file uploaded successfully!")
    
    with st.expander("Having trouble with YouTube videos?"):
        st.markdown("""
        ### YouTube Authentication Issues
        
        YouTube's anti-bot measures often block automated video downloads. To solve this:
        
        #### Option 1: Use Alternative Video Sources (Recommended)
        These typically work without authentication issues:
        - [Loom](https://www.loom.com/) - Great for screen recordings
        - [Vimeo](https://vimeo.com/) - High-quality video hosting
        - [Streamable](https://streamable.com/) - Simple video sharing
        - Any direct MP4 link
        
        #### Option 2: Upload Cookies for YouTube
        1. Install a browser extension like [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
        2. Login to YouTube in your browser
        3. Use the extension to export cookies to a .txt file
        4. Upload the cookies.txt file using the uploader above
        
        #### Option 3: Use Audio Upload Instead
        The 'Upload Audio' tab allows direct analysis of audio files without URL issues.
        """)
    
    if st.button("Analyze Video"):
        if not url:
            st.warning("Please enter a valid URL")
        else:
            # Create a placeholder for status updates
            status = st.empty()
            
            # Generate unique filenames using timestamp to avoid conflicts
            timestamp = str(int(time.time()))
            video_path = f"video_{timestamp}.mp4"
            audio_path = f"audio_{timestamp}.wav"
            
            try:
                # Save the uploaded cookies file to a temporary file
                if uploaded_cookies is not None:
                    cookies_file = f"cookies_{timestamp}.txt"
                    with open(cookies_file, "wb") as f:
                        f.write(uploaded_cookies.getbuffer())
                
                # Download and process the video
                status.text("Downloading video...")
                download_success = download_video(url, video_path, cookies_file)
                if not download_success:
                    st.error("Failed to download video")
                else:
                    status.text("Extracting audio...")
                    extract_success = extract_audio(video_path, audio_path)
                    if not extract_success:
                        st.error("Failed to extract audio")
                    else:
                        status.text("Analyzing accent... (this may take a moment)")
                        detector = AccentDetector()
                        results = detector.analyze_audio(audio_path)
                        
                        # Display results
                        st.success("✅ Analysis Complete!")
                        
                        # Create columns for results
                        col1, col2 = st.columns([2, 1])
                        
                        with col1:
                            st.subheader("Accent Analysis Results")
                            st.markdown(f"**Detected Accent:** {results['accent']}")
                            st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                            st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")
                            
                            # Show explanation in a box
                            st.markdown("### Expert Analysis")
                            st.info(results['explanation'])
                        
                        with col2:
                            if results['audio_viz']:
                                st.pyplot(results['audio_viz'])
                            
                            # Show audio playback
                            st.audio(audio_path)
            
            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")
            
            finally:
                # Clean up files. This runs on every path, including when the
                # download or the audio extraction raised, so no video, audio
                # or cookies file is left behind.
                try:
                    for temp_path in (video_path, audio_path, cookies_file):
                        if temp_path and os.path.exists(temp_path):
                            os.remove(temp_path)
                except Exception as e:
                    st.warning(f"Couldn't clean up temporary files: {str(e)}")

# --- Tab 2: analyse an audio file uploaded directly by the user ---
with tab2:
    st.markdown("### 🎵 Upload Audio File")
    st.caption("**Recommended option!** Direct audio upload is more reliable than video URLs.")
    
    uploaded_file = st.file_uploader("Upload an audio file", 
                                   type=["wav", "mp3", "m4a", "ogg", "flac"], 
                                   help="Support for WAV, MP3, M4A, OGG and FLAC formats")
    
    if uploaded_file is not None:
        # Show a preview of the audio
        st.markdown("#### Audio Preview:")
        st.audio(uploaded_file)
        
        st.markdown("#### Ready for Analysis")
        col1, col2 = st.columns([1, 3])
        with col1:
            analyze_button = st.button("Analyze Audio", type="primary", use_container_width=True)
        with col2:
            st.caption("Tip: 15-30 seconds of clear speech works best for accent detection")
        
        if analyze_button:
            with st.spinner("Analyzing audio... (this may take 15-30 seconds)"):
                try:
                    results = process_uploaded_audio(uploaded_file)
                    
                    # Display results
                    st.success("✅ Analysis Complete!")
                    
                    # Create columns for results
                    col1, col2 = st.columns([2, 1])
                    
                    with col1:
                        st.subheader("Accent Analysis Results")
                        st.markdown(f"**Detected Accent:** {results['accent']}")
                        st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                        st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")
                        
                        # Show explanation in a box
                        st.markdown("### Expert Analysis")
                        st.info(results['explanation'])
                    
                    with col2:
                        if results['audio_viz']:
                            st.pyplot(results['audio_viz'])
                
                except Exception as e:
                    # Any failure in the pipeline is shown to the user instead
                    # of crashing the page.
                    st.error(f"Error during analysis: {str(e)}")

# Add footer with deployment info
st.markdown("---")
st.markdown("Deployed using Streamlit • Built with SpeechBrain and Transformers")

# Add a section for how it works
with st.expander("ℹ️ How It Works"):
    st.markdown("""
    This app uses a multi-stage process to analyze a speaker's accent:
    
    1. **Audio Extraction**: The audio track is extracted from the input video or directly processed from uploaded audio.
    
    2. **Language Identification**: First, we determine if the speech is English using SpeechBrain's language identification model.
    
    3. **Accent Classification**: For English speech, we analyze the specific accent using a transformer-based model trained on diverse accent data.
    
    4. **English Proficiency Score**: A confidence score is calculated based on both language identification and accent clarity.
    
    5. **Analysis Summary**: An explanation is generated describing accent characteristics relevant for hiring evaluations.
    """)