# Source: Hugging Face Space "mguven61/accent-classifier" (revision 0b86de1).
# Page metadata at capture time: Space status "Sleeping"; file size 6,187 bytes.

import logging
import os
import ssl

import gradio as gr
import urllib3

from detect import SimpleOfflineAccentClassifier

# SECURITY NOTE(review): the statements below weaken TLS verification for the
# whole process and run at import time, before the ML libraries are imported.
# Presumably this works around certificate errors when downloading models —
# confirm it is still required, and prefer installing a proper CA bundle.

# Silence urllib3's warning about unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Replace the stdlib's default HTTPS context factory with the unverified one
# (relies on private ``ssl`` attributes).
ssl._create_default_https_context = ssl._create_unverified_context
# Blank out the CA-bundle environment variables; presumably intended to stop
# requests/curl-based clients from using a configured bundle — TODO confirm.
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['REQUESTS_CA_BUNDLE'] = ''

import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import soundfile as sf

class AccentClassifierApp:
    """Gradio front end around an accent classifier.

    The app delegates prediction to ``self.classifier`` (any object exposing
    ``predict_accent(path)``), formats the result as text, and builds the
    Gradio ``Blocks`` UI. It also offers a standalone acoustic feature
    extractor based on librosa.
    """

    _logger = logging.getLogger(__name__)

    # Number of block characters drawn for a probability of 1.0.
    _BAR_WIDTH = 20

    # Example clips offered in the UI; only the ones present on disk are shown.
    _EXAMPLE_FILES = (
        "examples/american_sample.wav",
        "examples/british_sample.wav",
    )

    def __init__(self, classifier=None):
        """Create the app.

        Args:
            classifier: Optional object with a ``predict_accent(path)`` method.
                When omitted, a ``SimpleOfflineAccentClassifier`` is created.
        """
        if classifier is None:
            # The previous code instantiated ``HuggingFaceAccentClassifier``,
            # a name that is neither defined nor imported in this module, so
            # construction raised NameError. ``SimpleOfflineAccentClassifier``
            # is the classifier this module actually imports.
            # NOTE(review): assumes it exposes ``predict_accent`` returning the
            # dict shape used in ``classify_audio`` — confirm against detect.py.
            classifier = SimpleOfflineAccentClassifier()
        self.classifier = classifier

    def classify_audio(self, audio_file) -> str:
        """Classify one audio file and return a human-readable report.

        Args:
            audio_file: Path to the audio file, or ``None`` when nothing was
                uploaded.

        Returns:
            A text report with the predicted accent, its confidence and every
            probability sorted from highest to lowest, or a short message
            describing why no report could be produced. Exceptions raised by
            the classifier are logged and reported in the returned text rather
            than propagated.
        """
        if audio_file is None:
            return "Please upload an audio file."

        try:
            result = self.classifier.predict_accent(audio_file)
            if result is None:
                return "Audio file processing failed."

            ranked = sorted(
                result['all_probabilities'].items(),
                key=lambda item: item[1],
                reverse=True,
            )

            # NOTE(review): this text is rendered by a gr.Markdown component;
            # single newlines may be collapsed when displayed — confirm.
            lines = [
                f"Predicted Accent: {result['accent']}",
                f"Confidence Score: {result['confidence']:.2%}",
                "",
                "All Probabilities:",
            ]
            for accent, prob in ranked:
                bar = "█" * int(prob * self._BAR_WIDTH)
                lines.append(f"- {accent}: {prob:.2%} {bar}")

            return "\n".join(lines) + "\n"

        except Exception as e:
            # UI boundary: keep the app responsive, but do not lose the trace.
            self._logger.exception("Accent classification failed for %r", audio_file)
            return f"Error occurred: {str(e)}"

    def create_interface(self):
        """Build and return the Gradio ``Blocks`` interface.

        The layout has an audio upload and an "Analyze Accent" button on the
        left and the result panel on the right. An examples section is added
        only when at least one example clip exists on disk.
        """
        with gr.Blocks(title="Accent Classifier") as interface:
            gr.Markdown("""

            # AI Accent Classifier

            

            This application analyzes speech audio files to predict accents.

            Supported formats: WAV, MP3, FLAC

            """)

            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath"
                    )

                    classify_btn = gr.Button(
                        "Analyze Accent",
                        variant="primary"
                    )

                with gr.Column():
                    output_text = gr.Markdown(
                        label="Analysis Results",
                        value="Analysis results will appear here..."
                    )

            # Checking only for the directory would offer clips that are
            # missing; list each file individually and skip the section
            # entirely when nothing is available.
            example_rows = [
                [path] for path in self._EXAMPLE_FILES if os.path.exists(path)
            ]
            if example_rows:
                gr.Markdown("### Example Audio Files")
                gr.Examples(
                    examples=example_rows,
                    inputs=audio_input
                )

            classify_btn.click(
                fn=self.classify_audio,
                inputs=audio_input,
                outputs=output_text
            )

        return interface

    def extract_acoustic_features(self, audio_path):
        """Extract summary acoustic features from an audio file.

        The audio is loaded at 22050 Hz (at most the first 30 seconds). Clips
        shorter than two seconds are tiled up to two seconds before analysis.
        Each feature group is computed independently; if one fails, the
        failure is logged and fixed fallback values are used for that group.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A dict with ``mfcc_mean`` and ``mfcc_std`` (arrays of 13 values)
            and the floats ``spectral_centroid``, ``spectral_centroid_std``,
            ``pitch_mean``, ``pitch_std``, ``zcr_mean`` and ``zcr_std``; or
            ``None`` when the file is empty or cannot be loaded.
        """
        try:
            y, sr = librosa.load(audio_path, sr=22050, duration=30)

            if len(y) == 0:
                return None

            # Pad very short clips by repetition so every analysis window is full.
            min_length = sr * 2
            if len(y) < min_length:
                repeat_count = int(min_length / len(y)) + 1
                y = np.tile(y, repeat_count)[:min_length]

            features = {}

            n_fft = min(2048, len(y))
            hop_length = n_fft // 4

            try:
                mfccs = librosa.feature.mfcc(
                    y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length
                )
                features['mfcc_mean'] = np.mean(mfccs, axis=1)
                features['mfcc_std'] = np.std(mfccs, axis=1)
            except Exception:
                self._logger.warning(
                    "MFCC extraction failed for %r; using fallback values",
                    audio_path, exc_info=True,
                )
                features['mfcc_mean'] = np.zeros(13)
                features['mfcc_std'] = np.zeros(13)

            try:
                spectral_centroids = librosa.feature.spectral_centroid(
                    y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
                )
                features['spectral_centroid'] = float(np.mean(spectral_centroids))
                features['spectral_centroid_std'] = float(np.std(spectral_centroids))
            except Exception:
                self._logger.warning(
                    "Spectral centroid extraction failed for %r; using fallback values",
                    audio_path, exc_info=True,
                )
                features['spectral_centroid'] = 1500.0
                features['spectral_centroid_std'] = 100.0

            try:
                pitches, magnitudes = librosa.piptrack(
                    y=y, sr=sr, threshold=0.1, n_fft=n_fft, hop_length=hop_length
                )
                # For every frame take the pitch of the strongest bin, then
                # keep only frames where a pitch was actually detected (> 0).
                strongest_bin = magnitudes.argmax(axis=0)
                frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
                voiced = frame_pitch[frame_pitch > 0]

                if voiced.size:
                    features['pitch_mean'] = float(np.mean(voiced))
                    features['pitch_std'] = float(np.std(voiced))
                else:
                    features['pitch_mean'] = 150.0
                    features['pitch_std'] = 20.0
            except Exception:
                self._logger.warning(
                    "Pitch extraction failed for %r; using fallback values",
                    audio_path, exc_info=True,
                )
                features['pitch_mean'] = 150.0
                features['pitch_std'] = 20.0

            try:
                zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop_length)
                features['zcr_mean'] = float(np.mean(zcr))
                features['zcr_std'] = float(np.std(zcr))
            except Exception:
                self._logger.warning(
                    "Zero-crossing-rate extraction failed for %r; using fallback values",
                    audio_path, exc_info=True,
                )
                features['zcr_mean'] = 0.1
                features['zcr_std'] = 0.05

            return features

        except Exception:
            # Loading/decoding failed: report it and signal "no features".
            self._logger.exception("Could not extract features from %r", audio_path)
            return None

def main():
    """Build the accent-classifier interface and launch the Gradio server.

    The server is launched with host ``0.0.0.0``, port ``7860`` and
    ``share=True``.
    """
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    ui = AccentClassifierApp().create_interface()
    ui.launch(**launch_options)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()