# Cristian Tatu
# v1
# 1e50356
import onnxruntime as ort
import librosa
import numpy as np
import scipy
# Sample rate (Hz) that loaded audio is resampled to before slicing.
SR = 22050
# Duration, in seconds, of each audio slice produced by load_audio_slices.
LENGTH_SEC = 1.5
def resample(audio_data, original_sr, target_sr):
    """Resample a 1-D signal from ``original_sr`` to ``target_sr``.

    The output length is ``int(len(audio_data) * target_sr / original_sr)``
    (truncated, not rounded) and the conversion itself is delegated to
    ``scipy.signal.resample``.

    Args:
        audio_data: Sequence/array of samples.
        original_sr: Sample rate of ``audio_data``.
        target_sr: Desired sample rate.

    Returns:
        The resampled signal as returned by ``scipy.signal.resample``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.signal`` on every SciPy version, so the
    # original attribute access only worked if another module had already
    # imported it.
    from scipy import signal

    num_samples = int(len(audio_data) * target_sr / original_sr)
    return signal.resample(audio_data, num_samples)
def load_audio_slices(af):
    """Load an audio file and cut it into fixed-length, peak-scaled slices.

    The file is read with ``librosa.load`` at its native sample rate,
    resampled to ``SR`` when the rates differ, and then split into windows
    of ``int(LENGTH_SEC * SR)`` samples:

    * a clip no longer than one window is zero-padded at the end and
      returned as a single slice;
    * a longer clip yields several windows (they may overlap) whose start
      offsets are spaced ``(len - window) // num_chunks`` samples apart,
      with ``num_chunks`` being 2, 3 or 5 depending on how many windows
      long the clip is.

    Args:
        af: Audio file path (or anything else ``librosa.load`` accepts).

    Returns:
        2-D ``numpy.ndarray`` of shape ``(n_slices, int(LENGTH_SEC * SR))``
        in which every row has been divided by its own maximum sample.
        Rows whose maximum is 0 are returned unscaled.
    """
    audio, sr = librosa.load(af, sr=None)
    if sr != SR:
        audio = resample(audio, sr, SR)

    length_samples = int(LENGTH_SEC * SR)

    if len(audio) <= length_samples:
        # ``<=`` rather than ``<``: a clip exactly one window long used to
        # fall into the multi-slice branch with end == 0 and a zero step,
        # which made np.arange fail. Padding by 0 samples is a no-op.
        padding_needed = length_samples - len(audio)
        audio = np.pad(audio, (0, padding_needed), mode='constant')
        slices = [audio]
    else:
        ratio = len(audio) / length_samples
        if ratio > 2:
            num_chunks = 5
        elif ratio > 1.5:
            num_chunks = 3
        else:
            num_chunks = 2

        # Last valid start offset is (exclusive) ``end``; every window
        # starting below it fits entirely inside the clip.
        end = len(audio) - length_samples
        # A clip only a few samples longer than one window would give a
        # step of 0 and crash np.arange, so never step by less than 1.
        step = max(1, end // num_chunks)
        starts = np.arange(0, end, step, dtype=int)
        slices = [audio[start:start + length_samples] for start in starts]

    slices = np.vstack(slices)

    # NOTE(review): scaling uses the signed maximum, not max(abs(x)); kept
    # as-is because the model was presumably trained with this exact
    # preprocessing -- confirm before changing.
    peaks = np.max(slices, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for silent slices; in-place assignment keeps dtype.
    peaks[peaks == 0] = 1
    return slices / peaks
class CnnVoiceClassifier:
    """Voice classifier backed by an ONNX model run through ONNX Runtime.

    The model's first input and first output are used. Each audio slice
    produces one score, and the scores are combined into a single value
    that is reported as the 'Male' probability (with 'Female' as its
    complement).
    """

    def __init__(self, model_path='model.onnx'):
        """Create the inference session.

        Args:
            model_path: Location of the ONNX model file. Defaults to
                ``'model.onnx'`` in the current working directory, which
                matches the previously hard-coded path.
        """
        self.session = ort.InferenceSession(model_path)
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    @staticmethod
    def _aggregate(probs):
        """Combine per-slice scores into one, weighting by confidence.

        Each score is weighted by ``|p - 0.5| * 2`` so that slices the
        model is unsure about (score near 0.5) contribute less.

        Args:
            probs: 1-D array of per-slice scores.

        Returns:
            The weighted average of ``probs``; the plain mean when every
            weight is zero.
        """
        weights = np.abs((probs - 0.5) * 2)
        if not np.any(weights):
            # All scores are exactly 0.5: np.average would raise
            # ZeroDivisionError because the weights sum to zero.
            return np.mean(probs)
        return np.average(probs, weights=weights)

    def inference(self, audio_path):
        """Classify the voice in an audio file.

        Args:
            audio_path: Path of the audio file, forwarded to
                ``load_audio_slices``.

        Returns:
            ``{'Male': p, 'Female': 1 - p}`` where ``p`` is the aggregated
            model score over all slices of the file.
        """
        audio = load_audio_slices(audio_path)
        # Append a trailing axis of size 1 to the slice matrix before
        # feeding it to the model.
        input_feed = {self.input_name: np.expand_dims(audio, axis=-1)}
        outputs = self.session.run([self.output_name], input_feed)
        probs = outputs[0].flatten()
        final_prob = self._aggregate(probs)
        return {'Male': final_prob, 'Female': 1 - final_prob}