# Hugging Face Space: amirjamali/accent-detector (status: Sleeping)
# File size: 20,538 bytes

import streamlit as st
import os
import yt_dlp
import subprocess
import librosa
import numpy as np
import torch
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import AutoProcessor, AutoModelForAudioClassification
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import tempfile
import time

# Deployment instructions
# To deploy this app with Docker:
# 1. Make sure Docker is installed
# 2. Build the Docker image: docker build -t accent-detector .
# 3. Run the container: docker run -p 8501:8501 accent-detector
# 4. Access the app at http://localhost:8501
# 
# For cloud deployment:
# - Streamlit Cloud: Connect your GitHub repository to Streamlit Cloud
# - Hugging Face Spaces: Use the Docker deployment option
# - Azure/AWS/GCP: Deploy the container using their container services

# Load environment variables from a .env file when one is present.
# This is deliberately best-effort: the app runs without a .env file, so a
# failure while reading one must not stop startup.
try:
    load_dotenv()
except Exception:
    # Narrower than a bare ``except:`` so KeyboardInterrupt and SystemExit
    # still propagate.
    pass

# Check for OpenAI API access - optional for enhanced explanations.
# ``have_openai`` is True only when the package imports and a non-empty
# OPENAI_API_KEY is configured; an empty string counts as "not configured".
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
    have_openai = bool(openai.api_key)
except (ImportError, AttributeError):
    have_openai = False

# Display names for the English accent codes, keyed by lowercase code.
# Every display name has the form "<variety> English", so only the variety
# is listed here and the suffix is appended while building the mapping.
ENGLISH_ACCENTS = {
    code: f"{variety} English"
    for code, variety in (
        ("en-us", "American"),
        ("en-gb", "British"),
        ("en-au", "Australian"),
        ("en-ca", "Canadian"),
        ("en-ie", "Irish"),
        ("en-scotland", "Scottish"),
        ("en-in", "Indian"),
        ("en-za", "South African"),
        ("en-ng", "Nigerian"),
        ("en-caribbean", "Caribbean"),
    )
}

def download_video(url, video_path="video.mp4", cookies_file=None):
    """Download a video from a URL with yt-dlp.

    Args:
        url: Address of the video to fetch.
        video_path: Output template handed to yt-dlp; also the path whose
            existence is checked once the download returns.
        cookies_file: Optional path to a cookies.txt file. It is passed to
            yt-dlp only when the file exists on disk.

    Returns:
        True when the download completed and ``video_path`` exists, otherwise
        False. Failures are reported through ``st.error``/``st.warning``
        rather than raised.
    """
    # Computed before the ``try`` so the error handler can always rely on it;
    # binding it inside the ``try`` left it undefined if that line failed.
    url_lower = (url or "").lower()
    is_youtube = "youtube" in url_lower or "youtu.be" in url_lower

    ydl_opts = {
        "outtmpl": video_path,
        "quiet": False,
        "no_warnings": False,
        "verbose": True,  # More detailed output for debugging
    }

    # Only use cookies if explicitly provided via file upload
    # Don't try to access browser cookies in Docker container
    if cookies_file and os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        error_msg = str(e)
        st.error(f"Download error: {error_msg}")

        # Provide specific guidance based on error type
        error_lower = error_msg.lower()
        if is_youtube and ("bot" in error_lower or "sign in" in error_lower):
            st.warning("YouTube requires authentication. Please upload a cookies.txt file or try a direct video link.")
        elif "not find" in error_lower and "cookies" in error_lower:
            st.warning("Browser cookies could not be accessed. Please upload a cookies.txt file.")
        elif "network" in error_lower or "timeout" in error_lower:
            st.warning("Network error. Please check your internet connection and try again.")

        return False

    if os.path.exists(video_path):
        return True

    st.error(f"Video downloaded but file not found: {video_path}")
    return False

def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The ffmpeg arguments request 16-bit PCM (``pcm_s16le``), a 16000 Hz
    sample rate and a single channel, with the video stream dropped.

    Args:
        video_path: Path of the input video file.
        audio_path: Path of the WAV file to write; an existing file at this
            path is overwritten.

    Returns:
        True when ``audio_path`` exists after ffmpeg finishes.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
            The error and ffmpeg's stderr are shown with ``st.error`` first.
    """
    command = [
        "ffmpeg",
        "-y",  # overwrite the output instead of prompting on a captured stdin
        "-i", video_path,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        audio_path,
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Error extracting audio: {e}")
        # ffmpeg may echo file names or metadata that are not valid UTF-8;
        # never let the decode itself mask the real failure.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
        st.error(f"ffmpeg output: {stderr_text}")
        raise
    return os.path.exists(audio_path)

class AccentDetector:
    """Identify the spoken language of a clip and label its English accent.

    ``lang_id`` is a SpeechBrain ``EncoderClassifier`` used for language
    identification. ``processor`` and ``model`` are loaded through
    Transformers for the accent step; when that load fails,
    ``have_accent_model`` is False and accent classification returns a
    placeholder result instead of raising.
    """

    def __init__(self):
        """Load both models; an accent-model failure is reported, not raised."""
        # Initialize the language identification model
        self.lang_id = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-commonlanguage_ecapa",
            savedir="tmp_model",
        )

        # Initialize the English accent classifier - using VoxLingua107 for now
        # In production, you'd use a more specialized accent model
        try:
            self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            self.have_accent_model = True
        except Exception as e:
            st.warning(f"Could not load accent model: {str(e)}")
            self.have_accent_model = False

    @staticmethod
    def _fallback_explanation(accent, confidence, is_english, language):
        """Build the template explanation used when OpenAI is unavailable or fails."""
        if is_english:
            return f"The speaker has a {accent} accent with {confidence*100:.1f}% confidence. The speech was identified as English."
        return f"The speech was identified as {language}, not English. English confidence is low."

    def is_english(self, audio_path, threshold=0.7):
        """Determine whether the speech is English.

        Args:
            audio_path: Path of the audio file passed to the language model.
            threshold: Kept for interface compatibility; it is not applied
                to the decision.

        Returns:
            A ``(is_english, lang, score)`` tuple: the boolean decision, the
            predicted language label as a string, and the model score as a
            float.
        """
        _out_prob, score, _index, lang = self.lang_id.classify_file(audio_path)
        score = float(score)

        # SpeechBrain's documented examples return the decoded label(s) as a
        # list (one entry per batch item); take the single entry so the
        # string checks below work. A plain string is accepted unchanged.
        if isinstance(lang, (list, tuple)):
            lang = lang[0] if lang else ""
        lang = str(lang)

        # Check if language is English (slightly fuzzy match)
        label = lang.lower()
        is_english = "eng" in label or "en-" in label or label == "en"

        return is_english, lang, score

    def classify_accent(self, audio_path):
        """Classify the specific English accent.

        Args:
            audio_path: Path of the audio file; it is loaded at 16 kHz.

        Returns:
            An ``(accent, confidence)`` tuple. ``confidence`` is the softmax
            probability of the top class, multiplied by 0.7 when the
            predicted label does not start with ``en-``. When the accent
            model is unavailable or any step fails, the result is
            ``("Unknown English Accent", 0.0)``.
        """
        if not self.have_accent_model:
            return "Unknown English Accent", 0.0

        try:
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")

            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Get probabilities
            probs = outputs.logits.softmax(dim=-1)[0]
            prediction_id = probs.argmax().item()
            confidence = probs[prediction_id].item()

            # Get predicted label
            accent_code = self.model.config.id2label[prediction_id]

            # Map to English accent if possible; confidence is kept as-is.
            if accent_code.startswith('en-'):
                accent = ENGLISH_ACCENTS.get(accent_code, f"English ({accent_code})")
            else:
                # If it's not an English accent code, use our pre-classification
                is_english, _, _ = self.is_english(audio_path)
                if is_english:
                    accent = "General English"
                else:
                    accent = f"Non-English ({accent_code})"
                confidence *= 0.7  # Reduce confidence for non-specific matches

            return accent, confidence
        except Exception as e:
            st.error(f"Error in accent classification: {str(e)}")
            return "Unknown English Accent", 0.0

    def generate_explanation(self, audio_path, accent, confidence, is_english, language):
        """Generate an explanation of the accent detection results.

        Uses the OpenAI chat API when ``have_openai`` is set; otherwise, or
        when the API call fails, a fixed template sentence is returned.

        Args:
            audio_path: Kept for interface compatibility; the language result
                is taken from ``is_english``/``language`` rather than being
                recomputed from the file.
            accent: Accent label to describe.
            confidence: Accent confidence as a fraction (it is multiplied by
                100 for display).
            is_english: Whether the speech was identified as English.
            language: Detected language label.

        Returns:
            The explanation text.
        """
        if not have_openai:
            return self._fallback_explanation(accent, confidence, is_english, language)

        try:
            import openai

            prompt = f"""
            Audio analysis detected a speaker with the following characteristics:
            - Primary accent/language: {accent}
            - Confidence score: {confidence*100:.1f}%
            - Detected language category: {language}
            - Is English: {is_english}
            
            Based on this information, provide a 2-3 sentence summary about the speaker's accent.
            Focus on how clear their English is and any notable accent characteristics.
            This is for hiring purposes to evaluate English speaking abilities.
            """

            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an accent analysis specialist providing factual assessments."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150
            )

            return response.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error generating explanation: {str(e)}")
            return self._fallback_explanation(accent, confidence, is_english, language)

    def analyze_audio(self, audio_path):
        """Run the complete analysis pipeline on one audio file.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A dict with keys ``is_english``, ``accent``,
            ``accent_confidence`` (percentage), ``english_confidence``
            (percentage), ``language_detected``, ``explanation`` and
            ``audio_viz`` (a matplotlib figure, or None when the waveform
            could not be drawn). The caller owns the returned figure.
        """
        # Check if it's English
        is_english, lang, lang_score = self.is_english(audio_path)

        # Classify accent if it's English
        if is_english:
            accent, accent_confidence = self.classify_accent(audio_path)
            english_confidence = lang_score * 100  # Scale to percentage
        else:
            accent = f"Non-English ({lang})"
            accent_confidence = lang_score
            english_confidence = max(0, min(30, lang_score * 50))  # Cap at 30% if non-English

        # Generate explanation
        explanation = self.generate_explanation(audio_path, accent, accent_confidence, is_english, lang)

        # Create visualization of the audio waveform
        fig = None
        try:
            y, sr = librosa.load(audio_path, sr=None)
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(y)
            ax.set_xlabel('Sample')
            ax.set_ylabel('Amplitude')
            ax.set_title('Audio Waveform')
            fig.tight_layout()
            audio_viz = fig
        except Exception as e:
            # Do not leave a half-built figure registered with pyplot.
            if fig is not None:
                plt.close(fig)
            st.warning(f"Could not generate audio visualization: {str(e)}")
            audio_viz = None

        return {
            "is_english": is_english,
            "accent": accent,
            "accent_confidence": accent_confidence * 100,  # Scale to percentage
            "english_confidence": english_confidence,
            "language_detected": lang,
            "explanation": explanation,
            "audio_viz": audio_viz
        }

def process_uploaded_audio(uploaded_file):
    """Analyse an uploaded audio file and return the analysis results.

    The upload is copied to a temporary file, analysed with a fresh
    ``AccentDetector``, and the temporary file is removed afterwards even
    when the analysis raises.

    Args:
        uploaded_file: Uploaded file object; ``getvalue()`` must return its
            bytes. Its ``name`` attribute, when present, supplies the
            temporary file's extension.

    Returns:
        The dict produced by ``AccentDetector.analyze_audio``.
    """
    # Keep the upload's own extension (falling back to .wav) so the temporary
    # file name does not mislabel MP3/M4A/OGG/FLAC data as WAV.
    _, extension = os.path.splitext(getattr(uploaded_file, "name", "") or "")
    suffix = extension.lower() or ".wav"

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        audio_path = temp_file.name

    try:
        detector = AccentDetector()
        return detector.analyze_audio(audio_path)
    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

# --- Streamlit App ---
# Page-level settings are applied ahead of every other Streamlit call.
page_settings = {
    "page_title": "🎤 English Accent Detector",
    "page_icon": "🎤",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Headline and one-paragraph summary of what the tool is for.
st.title("🎤 English Accent Detection Tool")
intro_text = """
This application analyzes a speaker's English accent from video URLs or audio uploads, 
providing detailed insights for hiring evaluation purposes.
"""
st.markdown(intro_text)

# Usage tips, shown inside their own container.
tips_text = """
    💡 **Tips for best results:**
    - Use **Loom** or **Vimeo** videos (more reliable than YouTube)
    - For YouTube videos, you may need to provide cookies
    - Audio clips of 15-30 seconds work best
    - Clear speech with minimal background noise is ideal
    """
with st.container():
    st.info(tips_text)

# Bullet list of what the analysis reports.
features_text = """
This app analyzes a speaker's English accent from a video or audio source.
It provides:
- Classification of the accent (British, American, etc.)
- Confidence score for English proficiency
- Explanation of accent characteristics
"""
st.markdown(features_text)

# Create tabs for different input methods
tab1, tab2 = st.tabs(["Video URL", "Upload Audio"])

# "Video URL" tab: download a video, extract its audio track, analyse it and
# render the results. Temporary files are always removed afterwards.
with tab1:
    st.markdown("### 🎬 Analyze video from URL")
    url = st.text_input(
        "Enter a public video URL",
        placeholder="https://www.loom.com/..., https://vimeo.com/..., or direct MP4 link",
    )

    # Recommend alternative sources
    st.caption("⚠️ **Note**: YouTube videos often require authentication. For best results, use Loom, Vimeo or direct video links.")

    # Add file uploader for cookies.txt
    cookies_file = None
    uploaded_cookies = st.file_uploader(
        "Upload cookies.txt file for YouTube (if needed)",
        type="txt",
        help="Only needed for YouTube videos that require authentication",
    )

    if uploaded_cookies is not None:
        # The upload is written to disk only when an analysis is started
        # (see below), so script reruns do not leave cookie files behind.
        st.success("Cookies file uploaded successfully!")

    with st.expander("Having trouble with YouTube videos?"):
        st.markdown("""
        ### YouTube Authentication Issues
        
        YouTube's anti-bot measures often block automated video downloads. To solve this:
        
        #### Option 1: Use Alternative Video Sources (Recommended)
        These typically work without authentication issues:
        - [Loom](https://www.loom.com/) - Great for screen recordings
        - [Vimeo](https://vimeo.com/) - High-quality video hosting
        - [Streamable](https://streamable.com/) - Simple video sharing
        - Any direct MP4 link
        
        #### Option 2: Upload Cookies for YouTube
        1. Install a browser extension like [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
        2. Login to YouTube in your browser
        3. Use the extension to export cookies to a .txt file
        4. Upload the cookies.txt file using the uploader above
        
        #### Option 3: Use Audio Upload Instead
        The 'Upload Audio' tab allows direct analysis of audio files without URL issues.
        """)

    if st.button("Analyze Video"):
        if not url:
            st.warning("Please enter a valid URL")
        else:
            # Create a placeholder for status updates
            status = st.empty()

            # Generate unique filenames using timestamp to avoid conflicts
            timestamp = str(int(time.time()))
            video_path = f"video_{timestamp}.mp4"
            audio_path = f"audio_{timestamp}.wav"

            try:
                if uploaded_cookies is not None:
                    # Save the uploaded cookies to a private temporary file
                    # for the duration of this download only.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as cookies_tmp:
                        cookies_tmp.write(uploaded_cookies.getbuffer())
                        cookies_file = cookies_tmp.name

                # Download and process the video
                status.text("Downloading video...")
                download_success = download_video(url, video_path, cookies_file)
                if not download_success:
                    st.error("Failed to download video")
                else:
                    status.text("Extracting audio...")
                    extract_success = extract_audio(video_path, audio_path)
                    if not extract_success:
                        st.error("Failed to extract audio")
                    else:
                        status.text("Analyzing accent... (this may take a moment)")
                        detector = AccentDetector()
                        results = detector.analyze_audio(audio_path)

                        # Display results
                        st.success("✅ Analysis Complete!")

                        # Create columns for results
                        col1, col2 = st.columns([2, 1])

                        with col1:
                            st.subheader("Accent Analysis Results")
                            st.markdown(f"**Detected Accent:** {results['accent']}")
                            st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                            st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")

                            # Show explanation in a box
                            st.markdown("### Expert Analysis")
                            st.info(results['explanation'])

                        with col2:
                            if results['audio_viz']:
                                st.pyplot(results['audio_viz'])
                                # Release the figure once rendered so figures
                                # do not accumulate across reruns.
                                plt.close(results['audio_viz'])

                            # Show audio playback
                            st.audio(audio_path)

            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")

            finally:
                # Clear the progress line, then clean up files. This runs on
                # failure too, so downloads and cookies are never left on disk.
                status.empty()
                for leftover in (video_path, audio_path, cookies_file):
                    try:
                        if leftover and os.path.exists(leftover):
                            os.remove(leftover)
                    except OSError as e:
                        st.warning(f"Couldn't clean up temporary files: {str(e)}")

# "Upload Audio" tab: preview an uploaded audio file and, on request, run the
# analysis and render the results.
with tab2:
    st.markdown("### 🎵 Upload Audio File")
    st.caption("**Recommended option!** Direct audio upload is more reliable than video URLs.")

    uploaded_file = st.file_uploader(
        "Upload an audio file",
        type=["wav", "mp3", "m4a", "ogg", "flac"],
        help="Support for WAV, MP3, M4A, OGG and FLAC formats",
    )

    if uploaded_file is not None:
        # Show a preview of the audio
        st.markdown("#### Audio Preview:")
        st.audio(uploaded_file)

        st.markdown("#### Ready for Analysis")
        col1, col2 = st.columns([1, 3])
        with col1:
            analyze_button = st.button("Analyze Audio", type="primary", use_container_width=True)
        with col2:
            st.caption("Tip: 15-30 seconds of clear speech works best for accent detection")

        if analyze_button:
            with st.spinner("Analyzing audio... (this may take 15-30 seconds)"):
                try:
                    results = process_uploaded_audio(uploaded_file)

                    # Display results
                    st.success("✅ Analysis Complete!")

                    # Create columns for results
                    col1, col2 = st.columns([2, 1])

                    with col1:
                        st.subheader("Accent Analysis Results")
                        st.markdown(f"**Detected Accent:** {results['accent']}")
                        st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                        st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")

                        # Show explanation in a box
                        st.markdown("### Expert Analysis")
                        st.info(results['explanation'])

                    with col2:
                        if results['audio_viz']:
                            st.pyplot(results['audio_viz'])
                            # Release the figure once rendered so figures do
                            # not accumulate across reruns.
                            plt.close(results['audio_viz'])

                except Exception as e:
                    st.error(f"Error during analysis: {str(e)}")

# Footer: a horizontal rule followed by the credits line.
for footer_line in (
    "---",
    "Deployed using Streamlit • Built with SpeechBrain and Transformers",
):
    st.markdown(footer_line)

# Collapsible description of the processing stages.
how_it_works_text = """
    This app uses a multi-stage process to analyze a speaker's accent:
    
    1. **Audio Extraction**: The audio track is extracted from the input video or directly processed from uploaded audio.
    
    2. **Language Identification**: First, we determine if the speech is English using SpeechBrain's language identification model.
    
    3. **Accent Classification**: For English speech, we analyze the specific accent using a transformer-based model trained on diverse accent data.
    
    4. **English Proficiency Score**: A confidence score is calculated based on both language identification and accent clarity.
    
    5. **Analysis Summary**: An explanation is generated describing accent characteristics relevant for hiring evaluations.
    """
with st.expander("ℹ️ How It Works"):
    st.markdown(how_it_works_text)