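# NOTE: the original first three lines ("Spaces:", "Sleeping", "Sleeping") were
# web-page residue captured with the code; kept as a comment so they cannot
# break parsing.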
import librosa
import numpy as np
import onnxruntime as ort
import scipy
import scipy.signal
# Sampling rate (Hz) that all audio is brought to before slicing.
SR = 22050
# Duration (seconds) of each fixed-length slice fed to the model.
LENGTH_SEC = 1.5
def resample(audio_data, original_sr, target_sr):
    """Resample *audio_data* from *original_sr* to *target_sr*.

    Args:
        audio_data: Sequence of samples to resample.
        original_sr: Sampling rate of *audio_data*.
        target_sr: Desired sampling rate.

    Returns:
        The output of ``scipy.signal.resample`` holding
        ``int(len(audio_data) * target_sr / original_sr)`` samples.
    """
    # The target length is truncated (not rounded) to a whole sample count.
    num_samples = int(len(audio_data) * target_sr / original_sr)
    return scipy.signal.resample(audio_data, num_samples)
def load_audio_slices(af):
    """Load an audio file and cut it into fixed-length, normalised slices.

    The file is loaded with ``librosa.load`` at its native sampling rate and
    resampled to ``SR`` when the rates differ. Audio shorter than one window
    (``LENGTH_SEC`` seconds) is zero-padded at the end to exactly one window.
    Longer audio is cut into several windows whose start offsets are spread
    over the signal.

    Args:
        af: Audio source accepted by ``librosa.load``.

    Returns:
        A 2-D array with one slice per row, each row holding
        ``int(LENGTH_SEC * SR)`` samples divided by that row's maximum value.
        Rows whose maximum is 0 are returned unscaled instead of producing
        NaN/inf.
    """
    audio, sr = librosa.load(af, sr=None)
    if sr != SR:
        audio = resample(audio, sr, SR)

    length_samples = int(LENGTH_SEC * SR)

    if len(audio) < length_samples:
        # Too short for a full window: zero-pad the tail.
        padding_needed = length_samples - len(audio)
        audio = np.pad(audio, (0, padding_needed), mode='constant')
        slices = audio[np.newaxis, :]
    else:
        # The longer the audio relative to one window, the more slices we take.
        ratio = len(audio) / length_samples
        if ratio > 2:
            num_chunks = 5
        elif ratio > 1.5:
            num_chunks = 3
        elif ratio > 1:
            num_chunks = 2
        else:
            # Exactly one window of audio.
            num_chunks = 1

        # Last valid start offset for a full-length window.
        end = len(audio) - length_samples
        if end > 0:
            # Clamp the stride to at least 1: when the audio is only a few
            # samples longer than one window, end // num_chunks is 0 and
            # np.arange would fail on a zero step.
            step = max(1, end // num_chunks)
            starts = np.arange(0, end, step, dtype=int)
        else:
            # Audio is exactly one window long: a single slice from offset 0.
            starts = np.array([0])

        slices = np.vstack(
            [audio[start:start + length_samples] for start in starts]
        )

    # NOTE(review): this scales by the signed per-row maximum, not the
    # absolute peak; kept unchanged because the model presumably expects
    # inputs scaled this way -- confirm.
    peaks = np.max(slices, axis=1)[:, np.newaxis]
    # Avoid division by zero for rows whose maximum is 0 (e.g. silence).
    peaks[peaks == 0] = 1
    return slices / peaks
class CnnVoiceClassifier:
    """Classify an audio file as 'Male' / 'Female' using an ONNX model.

    The model is run through an ONNX Runtime ``InferenceSession``; its first
    input and first output are the ones used for inference.
    """

    def __init__(self, model_path='model.onnx'):
        """Create the inference session.

        Args:
            model_path: Path to the ONNX model file. Defaults to
                ``'model.onnx'``.
        """
        self.session = ort.InferenceSession(model_path)
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def inference(self, audio_path):
        """Run the model on every slice of *audio_path* and combine the results.

        Args:
            audio_path: Audio source passed on to ``load_audio_slices``.

        Returns:
            A dict ``{'Male': p, 'Female': 1 - p}`` where ``p`` is the
            confidence-weighted average of the per-slice model outputs.
        """
        audio = load_audio_slices(audio_path)
        # Add a trailing axis of size 1 to the 2-D slice array.
        input_feed = {self.input_name: np.expand_dims(audio, axis=-1)}
        outputs = self.session.run([self.output_name], input_feed)
        probs = outputs[0].flatten()

        # Weight each slice by its distance from 0.5: a weight of 0 means
        # "undecided", 1 means "fully confident".
        w = np.abs((probs - 0.5) * 2)
        if w.sum() > 0:
            final_prob = np.average(probs, weights=w)
        else:
            # All weights are zero (every output is exactly 0.5);
            # np.average would raise ZeroDivisionError, so use the plain mean.
            final_prob = probs.mean()
        return {'Male': final_prob, 'Female': 1 - final_prob}