#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse

import numpy as np
import torch
import torch.nn as nn
import librosa
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model, Wav2Vec2PreTrainedModel

from project_settings import project_path


class ModelHead(nn.Module):
    """Small feed-forward head applied to a pooled feature vector.

    The pipeline is: dropout -> linear (hidden -> hidden) -> tanh ->
    dropout -> linear (hidden -> ``num_labels``).

    Args:
        config: Object exposing ``hidden_size`` and ``final_dropout``.
        num_labels: Size of the output projection.
    """

    def __init__(self, config, num_labels):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        """Return the raw (un-normalised) head output for ``features``.

        ``**kwargs`` is accepted and ignored.
        """
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class AgeGenderModel(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 backbone with two heads: age (1 output) and gender (3 outputs)."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(self, input_values: torch.Tensor):
        """Run the backbone and both heads.

        Args:
            input_values: Input passed straight to the Wav2Vec2 backbone.

        Returns:
            A 3-tuple ``(hidden_states, logits_age, logits_gender)`` where
            ``hidden_states`` is the backbone's first output averaged over
            dim 1, ``logits_age`` is the raw age-head output, and
            ``logits_gender`` is the gender-head output after a softmax
            over dim 1.
        """
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        # Mean-pool over dim 1 (presumably the time/frame axis of the
        # backbone's last hidden state) to get one vector per item.
        hidden_states = torch.mean(hidden_states, dim=1)

        # Call the sub-modules rather than ``.forward`` so that any
        # registered nn.Module hooks are honoured.
        logits_age = self.age(hidden_states)
        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)

        return hidden_states, logits_age, logits_gender


class AudeeringModel(object):
    """Inference wrapper around a pretrained age/gender Wav2Vec2 checkpoint.

    Loads a ``Wav2Vec2Processor`` and an ``AgeGenderModel`` from
    ``model_path``, moves the model to CUDA when available (CPU otherwise)
    and puts it in eval mode.

    References:
        https://arxiv.org/abs/2306.16962
        https://github.com/audeering/w2v2-age-gender-how-to
        https://huggingface.co/audeering/wav2vec2-large-robust-6-ft-age-gender
        https://huggingface.co/audeering/wav2vec2-large-robust-24-ft-age-gender
    """

    def __init__(self, model_path: str):
        self.model_path = model_path

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        processor: Wav2Vec2Processor = Wav2Vec2Processor.from_pretrained(self.model_path)
        model = AgeGenderModel.from_pretrained(self.model_path).to(device)
        model.eval()

        self.device = device
        self.processor = processor
        self.model = model

    def predict(self, signal: np.ndarray, sample_rate: int) -> dict:
        """Predict age and gender scores for a single audio signal.

        Args:
            signal: Audio samples handed to the processor.
                NOTE(review): the referenced checkpoints presumably expect
                16 kHz mono audio -- confirm that callers resample.
            sample_rate: Sampling rate forwarded to the processor as
                ``sampling_rate``.

        Returns:
            A dict with keys ``"age"``, ``"female"``, ``"male"`` and
            ``"child"``, each a float rounded to 4 decimals. ``"age"`` is
            the raw age-head output; the other three are the softmax
            outputs at indices 0, 1 and 2 of the gender head.
        """
        y = self.processor(signal, sampling_rate=sample_rate)
        y = y["input_values"][0]
        # The model is fed a batch containing this single signal.
        y = y.reshape(1, -1)
        y = torch.from_numpy(y).to(self.device)

        # Inference only: skip autograd bookkeeping so no graph is built
        # or kept alive for the (large) backbone activations.
        with torch.no_grad():
            _, age, gender = self.model(y)

        age = age.detach().cpu().numpy().tolist()
        age = age[0][0]

        gender = gender.detach().cpu().numpy().tolist()
        gender = gender[0]

        result = {
            "age": round(age, 4),
            "female": round(gender[0], 4),
            "male": round(gender[1], 4),
            "child": round(gender[2], 4),
        }
        return result

    def __call__(self, *args, **kwargs):
        """Alias for :meth:`predict`."""
        return self.predict(*args, **kwargs)


if __name__ == "__main__":
    pass