"""Voice classification from an audio file using an ONNX CNN model.

The audio is loaded, resampled to ``SR``, cut into fixed-length slices of
``LENGTH_SEC`` seconds, normalised per slice and fed to the model. The
per-slice outputs are merged into a single confidence-weighted score.
"""

import librosa
import numpy as np
import onnxruntime as ort
import scipy.signal  # binds ``scipy`` and guarantees the submodule is loaded

# Sample rate (Hz) every input is converted to before slicing.
SR = 22050
# Duration (seconds) of each slice passed to the model.
LENGTH_SEC = 1.5


def resample(audio_data, original_sr, target_sr):
    """Resample *audio_data* from *original_sr* to *target_sr*.

    Args:
        audio_data: 1-D sequence of samples.
        original_sr: Sample rate of ``audio_data`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        The signal resampled with ``scipy.signal.resample`` to
        ``int(len(audio_data) * target_sr / original_sr)`` samples.
    """
    num_samples = int(len(audio_data) * target_sr / original_sr)
    return scipy.signal.resample(audio_data, num_samples)


def _split_into_slices(audio, length_samples):
    """Cut a 1-D signal into a 2-D stack of ``length_samples``-long windows."""
    if len(audio) <= length_samples:
        # Short (or exactly one window long) input: zero-pad on the right and
        # return a single slice. Using ``<=`` matters: an input of exactly one
        # window has no room to slide and would otherwise yield no slices.
        padding_needed = length_samples - len(audio)
        padded = np.pad(audio, (0, padding_needed), mode='constant')
        return padded[np.newaxis, :]

    # Longer inputs get more (overlapping) windows spread across the signal.
    ratio = len(audio) / length_samples
    if ratio > 2:
        num_chunks = 5
    elif ratio > 1.5:
        num_chunks = 3
    else:
        num_chunks = 2

    # Last valid start index; guaranteed >= 1 in this branch.
    end = len(audio) - length_samples
    # When the overhang is shorter than ``num_chunks`` samples the integer
    # division gives 0, which is not a valid ``arange`` step; clamp it to 1.
    step = max(end // num_chunks, 1)
    starts = np.arange(0, end, step, dtype=int)
    return np.vstack([audio[start:start + length_samples] for start in starts])


def _normalize_slices(slices):
    """Divide each row of *slices* by its maximum, leaving zero-max rows as is."""
    # NOTE(review): this scales by the signed maximum, not the absolute peak.
    # Kept as is so the preprocessing fed to the model does not change;
    # confirm against the training pipeline before switching to ``np.abs``.
    peaks = np.max(slices, axis=1)[:, np.newaxis]
    # A silent slice has a maximum of 0; dividing by it would produce NaN/inf
    # values in the model input, so such rows are divided by 1 instead.
    peaks[peaks == 0] = 1
    return slices / peaks


def load_audio_slices(af):
    """Load an audio file and return normalised fixed-length slices.

    The file is read with ``librosa.load`` at its native sample rate,
    resampled to ``SR`` when necessary, then split into windows of
    ``int(LENGTH_SEC * SR)`` samples. Inputs no longer than one window are
    zero-padded into a single slice; longer inputs yield several windows.

    Args:
        af: Path (or other source accepted by ``librosa.load``) of the audio.

    Returns:
        A 2-D array of shape ``(n_slices, int(LENGTH_SEC * SR))`` where each
        row has been divided by its own maximum value.
    """
    audio, sr = librosa.load(af, sr=None)
    if sr != SR:
        audio = resample(audio, sr, SR)

    length_samples = int(LENGTH_SEC * SR)
    slices = _split_into_slices(audio, length_samples)
    return _normalize_slices(slices)


class CnnVoiceClassifier:
    """Wrapper around an ONNX model that scores audio as 'Male' / 'Female'."""

    def __init__(self, model_path='model.onnx'):
        """Create the inference session.

        Args:
            model_path: Location of the ONNX model file. Defaults to
                ``'model.onnx'`` in the current working directory.
        """
        self.session = ort.InferenceSession(model_path)
        # Only the first declared input and output of the model are used.
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def inference(self, audio_path):
        """Classify the audio file at *audio_path*.

        Every slice produced by ``load_audio_slices`` is scored by the model
        and the scores are merged with a weighted average in which each
        score's weight is its distance from 0.5, so confident slices count
        more than uncertain ones.

        Args:
            audio_path: Path of the audio file to classify.

        Returns:
            A dict ``{'Male': p, 'Female': 1 - p}`` of Python floats, where
            ``p`` is the merged model output.
        """
        audio = load_audio_slices(audio_path)
        # The model is fed a trailing singleton axis: (n_slices, samples, 1).
        input_feed = {self.input_name: np.expand_dims(audio, axis=-1)}
        outputs = self.session.run([self.output_name], input_feed)
        probs = outputs[0].flatten()

        weights = np.abs((probs - 0.5) * 2)
        if weights.sum() > 0:
            merged = np.average(probs, weights=weights)
        else:
            # Every score is exactly 0.5, so all weights are zero and
            # ``np.average`` would raise; the plain mean is the neutral answer.
            merged = probs.mean()

        final_prob = float(merged)
        return {'Male': final_prob, 'Female': 1 - final_prob}