# models/embeddings/audio_embedding_model.py
import torch
import librosa
from typing import List

from transformers import AutoProcessor, AutoModel

from utils.logger import logger
from config.model_configs import AUDIO_EMBEDDING_MODEL

class AudioEmbeddingModel:
    """Batch audio embedder backed by a pretrained checkpoint.

    Expects AUDIO_EMBEDDING_MODEL to name a checkpoint whose model class
    exposes `get_audio_features` (e.g. a CLAP-style model).
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading Audio Embedding Model '{AUDIO_EMBEDDING_MODEL}' to device: {self.device}")
        self.processor = AutoProcessor.from_pretrained(AUDIO_EMBEDDING_MODEL)
        self.model = AutoModel.from_pretrained(AUDIO_EMBEDDING_MODEL).to(self.device)
        logger.info("Audio Embedding Model loaded successfully.")
    def get_embeddings(self, audio_paths: List[str]) -> List[List[float]]:
        """Return one L2-normalized embedding per successfully loaded clip.

        Clips that fail to load are skipped with a warning, so the output
        may be shorter than `audio_paths`.
        """
        if not audio_paths:
            return []

        audio_inputs = []
        # Resample every clip to the rate the model's feature extractor expects.
        sample_rate = self.processor.feature_extractor.sampling_rate
        for audio_path in audio_paths:
            try:
                audio_data, _ = librosa.load(audio_path, sr=sample_rate)
                audio_inputs.append(audio_data)
            except Exception as e:
                logger.warning(f"Could not load audio {audio_path}: {e}. Skipping.")
                continue

        if not audio_inputs:
            return []

        inputs = self.processor(
            audios=audio_inputs, sampling_rate=sample_rate, return_tensors="pt", padding=True
        ).to(self.device)
        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)
        # L2-normalize so downstream similarity search can use plain dot products.
        embeddings = audio_features / audio_features.norm(p=2, dim=-1, keepdim=True)
        embeddings_list = embeddings.cpu().tolist()
        logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
        return embeddings_list
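

# A minimal usage sketch, not part of the module proper. It assumes
# AUDIO_EMBEDDING_MODEL points at a CLAP-style checkpoint (for example
# "laion/clap-htsat-unfused") and that the listed .wav files are
# hypothetical local paths. Because get_embeddings() returns unit-norm
# vectors, cosine similarity reduces to a dot product.
if __name__ == "__main__":
    model = AudioEmbeddingModel()
    embeddings = model.get_embeddings(["clip_a.wav", "clip_b.wav"])
    if len(embeddings) == 2:
        a = torch.tensor(embeddings[0])
        b = torch.tensor(embeddings[1])
        # Dot product of two unit vectors == cosine similarity.
        logger.info(f"Cosine similarity between clips: {torch.dot(a, b).item():.4f}")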