# accent-classifier / detect2.py
import os
import ssl

import urllib3

# Disable TLS certificate verification globally so model downloads succeed
# behind intercepting proxies. This weakens security for every HTTPS request
# the process makes, so it runs before any network-using library is imported.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ssl._create_default_https_context = ssl._create_unverified_context
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['REQUESTS_CA_BUNDLE'] = ''

import gradio as gr
import librosa
import numpy as np

from detect import SimpleOfflineAccentClassifier
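
# A safer alternative to disabling verification entirely is to point the HTTP
# stacks at the proxy's CA certificate instead (the path below is hypothetical,
# not part of this project):
#
#   os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/corp-proxy-ca.pem'
#   os.environ['CURL_CA_BUNDLE'] = '/etc/ssl/certs/corp-proxy-ca.pem'
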
class AccentClassifierApp:
    def __init__(self):
        # detect.py provides SimpleOfflineAccentClassifier (imported above).
        self.classifier = SimpleOfflineAccentClassifier()
    def classify_audio(self, audio_file):
        if audio_file is None:
            return "Please upload an audio file."
        try:
            result = self.classifier.predict_accent(audio_file)
            if result is None:
                return "Audio file processing failed."
            output = f"Predicted Accent: {result['accent']}\n"
            output += f"Confidence Score: {result['confidence']:.2%}\n\n"
            output += "All Probabilities:\n"
            sorted_probs = sorted(
                result['all_probabilities'].items(),
                key=lambda x: x[1],
                reverse=True,
            )
            for accent, prob in sorted_probs:
                bar = "█" * int(prob * 20)  # text bar, 20 chars at 100%
                output += f"- {accent}: {prob:.2%} {bar}\n"
            return output
        except Exception as e:
            return f"Error occurred: {e}"
    def create_interface(self):
        with gr.Blocks(title="Accent Classifier") as interface:
            gr.Markdown("""
            # AI Accent Classifier
            This application analyzes speech audio files to predict accents.
            Supported formats: WAV, MP3, FLAC
            """)
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                    )
                    classify_btn = gr.Button(
                        "Analyze Accent",
                        variant="primary",
                    )
                with gr.Column():
                    output_text = gr.Markdown(
                        label="Analysis Results",
                        value="Analysis results will appear here...",
                    )
gr.Markdown("### Example Audio Files")
gr.Examples(
examples=[
["examples/american_sample.wav"],
["examples/british_sample.wav"],
] if os.path.exists("examples") else [],
inputs=audio_input
)
            classify_btn.click(
                fn=self.classify_audio,
                inputs=audio_input,
                outputs=output_text,
            )
        return interface
    def extract_acoustic_features(self, audio_path):
        """Extract MFCC, spectral-centroid, pitch, and zero-crossing features."""
        try:
            y, sr = librosa.load(audio_path, sr=22050, duration=30)
            if len(y) == 0:
                return None
            # Pad very short clips to at least 2 seconds by tiling the signal.
            min_length = sr * 2
            if len(y) < min_length:
                repeat_count = int(min_length / len(y)) + 1
                y = np.tile(y, repeat_count)[:min_length]
            features = {}
            n_fft = min(2048, len(y))
            hop_length = n_fft // 4
            try:
                mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length)
                features['mfcc_mean'] = np.mean(mfccs, axis=1)
                features['mfcc_std'] = np.std(mfccs, axis=1)
            except Exception:
                features['mfcc_mean'] = np.zeros(13)
                features['mfcc_std'] = np.zeros(13)
            try:
                spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
                features['spectral_centroid'] = float(np.mean(spectral_centroids))
                features['spectral_centroid_std'] = float(np.std(spectral_centroids))
            except Exception:
                features['spectral_centroid'] = 1500.0
                features['spectral_centroid_std'] = 100.0
            try:
                # Keep the strongest pitch candidate per frame; skip unvoiced frames.
                pitches, magnitudes = librosa.piptrack(y=y, sr=sr, threshold=0.1, n_fft=n_fft, hop_length=hop_length)
                pitch_values = []
                for t in range(pitches.shape[1]):
                    index = magnitudes[:, t].argmax()
                    pitch = pitches[index, t]
                    if pitch > 0:
                        pitch_values.append(pitch)
                if pitch_values:
                    features['pitch_mean'] = float(np.mean(pitch_values))
                    features['pitch_std'] = float(np.std(pitch_values))
                else:
                    features['pitch_mean'] = 150.0
                    features['pitch_std'] = 20.0
            except Exception:
                features['pitch_mean'] = 150.0
                features['pitch_std'] = 20.0
            try:
                zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop_length)
                features['zcr_mean'] = float(np.mean(zcr))
                features['zcr_std'] = float(np.std(zcr))
            except Exception:
                features['zcr_mean'] = 0.1
                features['zcr_std'] = 0.05
            return features
        except Exception:
            return None
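
# Minimal usage sketch for the feature extractor, assuming an audio file at a
# hypothetical path; each MFCC vector has 13 coefficients (n_mfcc=13 above):
#
#   app = AccentClassifierApp()
#   feats = app.extract_acoustic_features("examples/american_sample.wav")
#   if feats is not None:
#       print(feats["mfcc_mean"].shape)   # (13,)
#       print(feats["pitch_mean"], feats["zcr_mean"])
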
def main():
    app = AccentClassifierApp()
    interface = app.create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )


if __name__ == "__main__":
    main()
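
# To run locally (assumes the dependencies imported above are installed):
#
#   pip install gradio librosa numpy
#   python detect2.py
#
# The interface listens on http://0.0.0.0:7860; share=True additionally asks
# Gradio for a temporary public *.gradio.live link.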