mguven61 committed on
Commit
d636467
·
verified ·
1 Parent(s): 846f2d4

Upload detect.py

Browse files
Files changed (1) hide show
  1. detect.py +262 -0
detect.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import os
4
+ import sys
5
+
6
class SimpleOfflineAccentClassifier:
    """Rule-based accent guesser that scores librosa features against fixed profiles.

    Five accents are described by hard-coded profiles (``accent_profiles``) and
    13-coefficient MFCC templates (``_MFCC_PATTERNS``). The resulting numbers
    are heuristic scores normalised to sum to 1, not calibrated probabilities.
    """

    # Audio loading parameters used by extract_acoustic_features.
    SAMPLE_RATE = 22050
    MAX_DURATION_SECONDS = 30
    MIN_DURATION_SECONDS = 2

    # Values substituted when the corresponding librosa call fails.
    _SPECTRAL_DEFAULTS = {
        'spectral_centroid': 1500.0,
        'spectral_centroid_std': 100.0,
        'spectral_rolloff': 3000.0,
        'spectral_bandwidth': 1000.0,
    }
    _PITCH_DEFAULTS = {
        'pitch_mean': 150.0,
        'pitch_std': 20.0,
        'pitch_range': 50.0,
    }

    # Hand-written MFCC templates, one per accent. They are compared with the
    # measured MFCC means by cosine similarity in _calculate_mfcc_similarity.
    _MFCC_PATTERNS = {
        'American': [0.2, -0.1, 0.3, -0.2, 0.1, -0.1, 0.2, -0.1, 0.1, -0.1, 0.1, -0.1, 0.1],
        'British': [0.1, -0.2, 0.2, -0.3, 0.2, -0.1, 0.1, -0.2, 0.1, -0.1, 0.2, -0.1, 0.1],
        'Australian': [0.3, -0.1, 0.1, -0.2, 0.3, -0.1, 0.2, -0.1, 0.2, -0.1, 0.1, -0.2, 0.1],
        'Indian': [0.1, -0.3, 0.4, -0.1, 0.2, -0.2, 0.3, -0.1, 0.1, -0.2, 0.2, -0.1, 0.2],
        'Canadian': [0.2, -0.1, 0.2, -0.2, 0.1, -0.1, 0.1, -0.1, 0.2, -0.1, 0.1, -0.1, 0.1],
    }

    def __init__(self):
        """Create the classifier with its built-in accent profiles."""
        # NOTE: calculate_accent_scores only reads 'formant_f2_range',
        # 'pitch_variance' and 'tempo_range'; 'formant_f1_range' and
        # 'spectral_tilt' are stored but not used by any method of this class.
        self.accent_profiles = {
            'American': {
                'formant_f1_range': (300, 800),
                'formant_f2_range': (1200, 2200),
                'pitch_variance': 'medium',
                'tempo_range': (140, 180),
                'spectral_tilt': 'neutral'
            },
            'British': {
                'formant_f1_range': (280, 750),
                'formant_f2_range': (1400, 2400),
                'pitch_variance': 'low',
                'tempo_range': (120, 160),
                'spectral_tilt': 'high'
            },
            'Australian': {
                'formant_f1_range': (320, 850),
                'formant_f2_range': (1100, 2000),
                'pitch_variance': 'high',
                'tempo_range': (130, 170),
                'spectral_tilt': 'low'
            },
            'Indian': {
                'formant_f1_range': (350, 900),
                'formant_f2_range': (1300, 2300),
                'pitch_variance': 'high',
                'tempo_range': (160, 200),
                'spectral_tilt': 'neutral'
            },
            'Canadian': {
                'formant_f1_range': (290, 780),
                'formant_f2_range': (1250, 2150),
                'pitch_variance': 'medium',
                'tempo_range': (135, 175),
                'spectral_tilt': 'neutral'
            }
        }

    @staticmethod
    def _as_float(value):
        """Return the first element of a scalar or array-like as a Python float.

        Calling ``float()`` directly on an array with one or more dimensions
        is deprecated by NumPy, so the value is flattened first.
        """
        return float(np.asarray(value, dtype=float).ravel()[0])

    @staticmethod
    def _range_score(value, bounds, weight, falloff):
        """Score *value* against the inclusive ``(low, high)`` range *bounds*.

        Returns *weight* when the value lies inside the range; otherwise the
        weight is reduced by ``distance / falloff`` (distance to the nearest
        bound) and floored at 0.
        """
        low, high = bounds
        if low <= value <= high:
            return weight
        distance = min(abs(value - low), abs(value - high))
        return max(0, weight - (distance / falloff))

    def extract_acoustic_features(self, audio_path):
        """Load an audio file and compute the features used for scoring.

        Args:
            audio_path: Path handed to ``librosa.load``.

        Returns:
            A dict with the keys ``mfcc_mean``, ``mfcc_std``,
            ``spectral_centroid``, ``spectral_centroid_std``,
            ``spectral_rolloff``, ``spectral_bandwidth``, ``pitch_mean``,
            ``pitch_std``, ``pitch_range``, ``tempo``, ``zcr_mean`` and
            ``zcr_std``; or ``None`` when the file cannot be loaded or
            contains no samples. If an individual feature computation fails,
            fixed fallback values are stored for that feature group instead.
        """
        try:
            y, sr = librosa.load(
                audio_path,
                sr=self.SAMPLE_RATE,
                duration=self.MAX_DURATION_SECONDS,
            )
        except Exception as exc:
            # Many different errors can come out of the audio backends; report
            # the reason instead of hiding it, but keep the "return None"
            # contract that predict_accent relies on.
            print(f"Could not load audio from {audio_path!r}: {exc}", file=sys.stderr)
            return None

        if len(y) == 0:
            return None

        # Clips shorter than the minimum are repeated until they reach it.
        min_length = sr * self.MIN_DURATION_SECONDS
        if len(y) < min_length:
            repeat_count = min_length // len(y) + 1
            y = np.tile(y, repeat_count)[:min_length]

        features = {}

        # After the padding above len(y) >= min_length, so this is 2048 in practice.
        n_fft = min(2048, len(y))
        hop_length = n_fft // 4

        try:
            mfccs = librosa.feature.mfcc(
                y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length
            )
            features['mfcc_mean'] = np.mean(mfccs, axis=1)
            features['mfcc_std'] = np.std(mfccs, axis=1)
        except Exception:
            features['mfcc_mean'] = np.zeros(13)
            features['mfcc_std'] = np.zeros(13)

        try:
            spectral_centroids = librosa.feature.spectral_centroid(
                y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
            )
            features['spectral_centroid'] = float(np.mean(spectral_centroids))
            features['spectral_centroid_std'] = float(np.std(spectral_centroids))

            spectral_rolloff = librosa.feature.spectral_rolloff(
                y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
            )
            features['spectral_rolloff'] = float(np.mean(spectral_rolloff))

            spectral_bandwidth = librosa.feature.spectral_bandwidth(
                y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
            )
            features['spectral_bandwidth'] = float(np.mean(spectral_bandwidth))
        except Exception:
            # A failure anywhere in this group resets all four spectral values.
            features.update(self._SPECTRAL_DEFAULTS)

        try:
            pitches, magnitudes = librosa.piptrack(
                y=y, sr=sr, threshold=0.1, n_fft=n_fft, hop_length=hop_length
            )
            # For every frame take the pitch of the strongest bin, then keep
            # only frames where that pitch is positive.
            strongest_bins = magnitudes.argmax(axis=0)
            frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
            voiced = frame_pitches[frame_pitches > 0]

            if voiced.size:
                features['pitch_mean'] = float(np.mean(voiced))
                features['pitch_std'] = float(np.std(voiced))
                features['pitch_range'] = float(np.max(voiced) - np.min(voiced))
            else:
                features.update(self._PITCH_DEFAULTS)
        except Exception:
            features.update(self._PITCH_DEFAULTS)

        try:
            tempo, _beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
            # beat_track may hand back the tempo as an array; coerce safely.
            features['tempo'] = self._as_float(tempo)
        except Exception:
            features['tempo'] = 120.0

        try:
            zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop_length)
            features['zcr_mean'] = float(np.mean(zcr))
            features['zcr_std'] = float(np.std(zcr))
        except Exception:
            features['zcr_mean'] = 0.1
            features['zcr_std'] = 0.05

        return features

    def calculate_accent_scores(self, features):
        """Score every accent profile against the extracted features.

        Args:
            features: Dict as produced by ``extract_acoustic_features``.
                Missing keys fall back to ``spectral_centroid=1500``,
                ``pitch_std=20``, ``tempo=120`` and an all-zero ``mfcc_mean``.

        Returns:
            Dict mapping accent name to a score clamped to ``[0, 1]``. The
            score is the sum of four parts: spectral centroid vs. the
            profile's ``formant_f2_range`` (up to 0.3), pitch-variance band
            (0.2), tempo vs. ``tempo_range`` (up to 0.2) and MFCC template
            similarity (up to 0.3).
        """
        # These inputs do not depend on the accent, so read them once.
        spectral_centroid = features.get('spectral_centroid', 1500)
        pitch_std = features.get('pitch_std', 20)
        tempo = features.get('tempo', 120)
        mfcc_mean = features.get('mfcc_mean', np.zeros(13))

        if pitch_std < 20:
            pitch_band = 'low'
        elif pitch_std <= 40:
            pitch_band = 'medium'
        elif pitch_std > 40:
            pitch_band = 'high'
        else:
            # Only reachable for NaN, which matches no band.
            pitch_band = None

        scores = {}
        for accent, profile in self.accent_profiles.items():
            score = 0.0
            score += self._range_score(
                spectral_centroid, profile['formant_f2_range'], 0.3, 1000
            )
            if profile['pitch_variance'] == pitch_band:
                score += 0.2
            score += self._range_score(tempo, profile['tempo_range'], 0.2, 50)
            score += self._calculate_mfcc_similarity(mfcc_mean, accent) * 0.3

            scores[accent] = max(0, min(1, score))

        return scores

    def _calculate_mfcc_similarity(self, mfcc_features, accent):
        """Return the cosine similarity between MFCC means and an accent template.

        Args:
            mfcc_features: Sequence of 13 MFCC mean values.
            accent: Accent name used to look up the template.

        Returns:
            0 for an unknown accent; the cosine similarity floored at 0 when
            both vectors have a positive norm; 0.5 when either norm is not
            positive or the vectors cannot be compared (e.g. wrong length).
        """
        template = self._MFCC_PATTERNS.get(accent)
        if template is None:
            return 0

        try:
            pattern = np.asarray(template, dtype=float)
            mfcc_array = np.asarray(mfcc_features, dtype=float)

            mfcc_norm = np.linalg.norm(mfcc_array)
            pattern_norm = np.linalg.norm(pattern)

            if mfcc_norm > 0 and pattern_norm > 0:
                similarity = np.dot(mfcc_array / mfcc_norm, pattern / pattern_norm)
                return max(0, float(similarity))
            return 0.5
        except (TypeError, ValueError):
            # Non-numeric input or mismatched lengths: neutral score.
            return 0.5

    def predict_accent(self, audio_path):
        """Predict the accent of the speaker in an audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            ``None`` when the path is not an existing file or no features
            could be extracted. Otherwise a dict with ``accent`` (best
            scoring accent), ``confidence`` (its normalised score),
            ``all_probabilities`` (scores normalised to sum to 1) and
            ``raw_scores`` (the un-normalised scores).
        """
        # isfile also rejects directories, which can never be loaded as audio.
        if not os.path.isfile(audio_path):
            return None

        features = self.extract_acoustic_features(audio_path)
        if features is None:
            return None

        scores = self.calculate_accent_scores(features)

        total_score = sum(scores.values())
        if total_score > 0:
            normalized_scores = {k: v / total_score for k, v in scores.items()}
        else:
            # Every accent scored 0: fall back to a uniform distribution.
            normalized_scores = {k: 1.0 / len(scores) for k in scores}

        predicted_accent = max(normalized_scores, key=normalized_scores.get)
        confidence = normalized_scores[predicted_accent]

        return {
            'accent': predicted_accent,
            'confidence': confidence,
            'all_probabilities': normalized_scores,
            'raw_scores': scores
        }

    def print_detailed_results(self, result):
        """Print the prediction and a bar chart of all accent scores to stdout.

        Args:
            result: Dict returned by ``predict_accent``; nothing is printed
                when it is ``None`` or empty.
        """
        if not result:
            return

        print(f"Predicted Accent: {result['accent']}")
        print(f"Confidence Score: {result['confidence']:.1%}")

        print("All Accent Probabilities:")

        sorted_probs = sorted(
            result['all_probabilities'].items(),
            key=lambda x: x[1],
            reverse=True
        )

        for accent, prob in sorted_probs:
            # 40-character bar: filled part proportional to the score.
            bar_length = int(prob * 40)
            bar = "█" * bar_length + "░" * (40 - bar_length)
            print(f"{accent:12}: {prob:.1%} |{bar}|")
249
+
250
def main():
    """Command-line entry point: classify the accent of one audio file.

    Expects exactly one command-line argument, the path of the audio file.

    Returns:
        Process exit status: 0 when a prediction was printed, 1 when the
        file could not be analysed, 2 when the arguments are wrong.
    """
    if len(sys.argv) != 2:
        # Show the name the script was actually invoked with rather than a
        # hard-coded file name that may not match this file.
        invoked_as = sys.argv[0] if sys.argv else ""
        script_name = os.path.basename(invoked_as) or "detect.py"
        print(f"Usage: python {script_name} audio_file.mp3", file=sys.stderr)
        return 2

    audio_file = sys.argv[1]

    classifier = SimpleOfflineAccentClassifier()
    result = classifier.predict_accent(audio_file)
    if result is None:
        # predict_accent returns None for a missing file or failed feature
        # extraction; say so instead of exiting silently with status 0.
        print(f"Error: could not analyze audio file: {audio_file}", file=sys.stderr)
        return 1

    classifier.print_detailed_results(result)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status to the shell.
    sys.exit(main())