# File size: 7,199 Bytes
# Revision: 15a1f73
import math
import os
# SpeechBrain and its dependencies
from speechbrain.inference.classifiers import EncoderClassifier # Updated import as requested
import torch
import torchaudio
import numpy
import scipy
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore")
# --- Hugging Face cache location ---
# The cache lives in a '.hf_cache' folder under the directory the process was
# started from; the folder is created at import time if it is missing.
HF_CACHE_DIR = os.path.join(
    os.getcwd(),
    '.hf_cache',
)
os.makedirs(HF_CACHE_DIR, exist_ok=True)  # exist_ok: no error when the folder is already there
print(f"Hugging Face cache directory created/ensured at: {HF_CACHE_DIR}")

# --- Accent classification model (module-level singleton) ---
# Stays None until load_accent_model() has loaded the classifier successfully.
accent_classifier = None
# --- Function to load the SpeechBrain Accent Classification Model ---
def load_accent_model():
    """
    Loads the SpeechBrain accent classification model from Hugging Face.

    Intended to be called once at application startup. The loaded classifier
    is stored in the module-level ``accent_classifier``; if that global is
    already set, the function returns immediately without reloading.

    Any exception raised while loading is caught: the error and a list of
    troubleshooting steps are printed, and ``accent_classifier`` is left as
    ``None`` so callers can detect the failure.

    Returns:
        None. The outcome is communicated through ``accent_classifier``.
    """
    global accent_classifier

    if accent_classifier is not None:
        # Already loaded by an earlier call; nothing to do.
        return

    try:
        print("Loading SpeechBrain accent classification model... This may take a moment.")
        # Point HF_HOME at the project-local cache directory. The original
        # author's intent is to avoid WinError 1314 permission issues on
        # Windows by caching model files in a user-writable location.
        # NOTE(review): huggingface_hub is already imported at the top of this
        # file; confirm that setting HF_HOME this late still takes effect for
        # the installed version.
        os.environ['HF_HOME'] = HF_CACHE_DIR
        print(f"HF_HOME environment variable set to: {os.environ['HF_HOME']}")
        # ECAPA-TDNN based model for English accent classification.
        accent_classifier = EncoderClassifier.from_hparams(
            source="Jzuluaga/accent-id-commonaccent_ecapa",
            savedir="pretrained_models/accent-id-commonaccent_ecapa",  # A distinct directory for this model
        )
        print("SpeechBrain model loaded successfully.")
    except Exception as e:
        # Top-level boundary: report the problem instead of crashing startup.
        print(f"Error loading SpeechBrain model: {e}")
        print("\n--------------------------------------------------------------")
        print("Troubleshooting Steps for Model Loading Errors:")
        print("1. **Ensure Python Environment is Clean:** If you haven't, create a NEW virtual environment and install dependencies there.")
        print(" Example (in your project directory):")
        print(" `python -m venv new_accent_env`")
        # Raw string: the Windows path contains backslashes that must not be
        # interpreted as escape sequences (e.g. "\n").
        print(r" `.\new_accent_env\Scripts\activate` (Windows) or `source new_accent_env/bin/activate` (macOS/Linux)")
        print("2. **Install/Upgrade ALL Dependencies:**")
        print(" `pip uninstall speechbrain transformers torchaudio huggingface_hub numpy scipy tqdm Flask Flask-Executor yt-dlp -y`")
        print(" `pip install --upgrade speechbrain transformers torchaudio huggingface_hub numpy scipy tqdm Flask Flask-Executor`")
        # Show the cache folder this process actually uses rather than a
        # hard-coded, machine-specific path.
        print(f"3. **Manually Clear Hugging Face Cache:** If issues persist, delete the folder: `{HF_CACHE_DIR}`")
        print(" This forces a fresh download. Then try running your app again.")
        print("4. **Check FFmpeg Installation:** Ensure FFmpeg is installed on your system and its `bin` directory is added to your system's PATH.")
        print("--------------------------------------------------------------\n")
        accent_classifier = None  # Set to None if loading fails
# --- Function to detect accent from an audio file ---
def detect_accent(audio_path, task_id):
    """
    Analyzes the speaker's accent from the given audio file using the pre-loaded SpeechBrain model.

    Args:
        audio_path: Path to the audio file. It is passed to the classifier
            unchanged (no ``os.path.abspath``); callers are expected to
            supply an absolute path.
        task_id: Identifier used only to prefix the log lines printed here.

    Returns:
        A 4-tuple ``(accent, confidence, summary, error)``.

        * On success: ``accent`` is the first predicted label, ``confidence``
          is the softmax value of the predicted class multiplied by 100
          (the value obtained by indexing the softmax output; it is not
          converted to a Python float here), ``summary`` is a fixed
          explanatory string and ``error`` is ``None``.
        * On failure (model not loaded, path is not an existing file, file is
          empty, or any exception during classification):
          ``(None, None, None, <error message>)``.
    """
    if accent_classifier is None:
        return None, None, None, "Accent classification model not loaded. Please ensure the model loads correctly at startup."

    print(f"Task {task_id}: Analyzing accent from {audio_path}...")
    try:
        # The audio_path passed here from video_processing.py should already be an absolute path.
        # os.path.abspath() is deliberately not applied, to prevent potential
        # double-concatenation issues if SpeechBrain's internal file handling
        # implicitly prepends CWD to an already absolute path.
        processed_audio_path = audio_path
        print(f"Task {task_id}: Final audio path for SpeechBrain: {processed_audio_path}")

        # Validate the input before handing it to the model. isfile() (rather
        # than exists()) also rejects directories, which can never be audio.
        if not os.path.isfile(processed_audio_path):
            return None, None, None, f"Audio file not found at: {processed_audio_path}"
        if os.path.getsize(processed_audio_path) == 0:
            return None, None, None, f"Audio file is empty at: {processed_audio_path}"

        out_prob, score, index, text_lab = accent_classifier.classify_file(processed_audio_path)
        # Print raw outputs for debugging/understanding
        print(f"out_prob: {out_prob}, score: {score}, index: {index}, text_lab: {text_lab}")

        # Normalise the classifier scores into a probability distribution.
        # NOTE(review): this treats out_prob as unnormalised scores/logits —
        # confirm against the model's classifier head.
        probabilities = F.softmax(out_prob, dim=-1)
        # Confidence (in percent) of the predicted accent; assumes a single
        # item in the batch, hence row 0 and index.item().
        confidence = probabilities[0, index.item()] * 100
        accent = text_lab[0]
        summary = "Analysis complete. The detected accent is based on the dominant English accent identified in the audio segment provided."
        print(f"Task {task_id}: Accent: {accent}, Confidence: {confidence:.2f}%")
        return accent, confidence, summary, None
    except Exception as e:
        # Boundary handler: report the failure to the caller via the error
        # slot of the tuple instead of raising.
        error_message = f"An error occurred during accent detection: {e}"
        print(f"Task {task_id}: {error_message}")
        return None, None, None, error_message
# Example of how you would integrate this (not runnable on its own, requires an audio file)
if __name__ == '__main__':
    # Manual smoke test: asks for a file name inside ./temp_files, loads the
    # model and prints the detection result. In the real application these
    # calls are made by the Flask app instead.
    #
    # HF_CACHE_DIR is already defined and created at module import time, so
    # it is not re-created (or re-announced) here.
    input_file = input("Enter audio: ").strip()
    # detect_accent() expects an absolute path and does not resolve it itself.
    dummy_audio_path = os.path.abspath(os.path.join("temp_files", input_file))
    test_task_id = "test_accent_detection_123"

    # 1. Load the model first (typically done once at app startup)
    load_accent_model()

    # 2. Then call detect_accent with a valid audio path
    if accent_classifier is None:
        print("Model could not be loaded, skipping accent detection example.")
    else:
        accent, confidence, summary, error = detect_accent(dummy_audio_path, test_task_id)
        # detect_accent() signals failure through the error slot of its tuple.
        if error is None:
            print("\n--- Detection Result ---")
            print(f"Detected Accent: {accent}")
            print(f"Confidence: {confidence:.2f}%")
            print(f"Summary: {summary}")
        else:
            print("\n--- Detection Error ---")
            print(f"Error: {error}")
|