|
import warnings |
|
|
|
import librosa |
|
import numpy as np |
|
import resampy |
|
import torch |
|
|
|
import torchcrepe |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Noise floor in decibels; weighted magnitudes below this are clamped up to it
MIN_DB = -100.

# Reference level in decibels subtracted from the A-weighting curve so that
# a signal at this level maps to 0 dB after weighting
REF_DB = 20.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def a_weighted(audio, sample_rate, hop_length=None, pad=True):
    """Retrieve the per-frame loudness"""
    device = audio.device

    # Default to a 10 millisecond hop
    if hop_length is None:
        hop_length = sample_rate // 100

    # Detach and move to numpy for librosa/resampy processing
    # (assumes a leading batch dimension of size 1 — squeezed here)
    audio = audio.detach().cpu().numpy().squeeze(0)

    # Resample to the rate torchcrepe expects, scaling the hop to match
    if sample_rate != torchcrepe.SAMPLE_RATE:
        audio = resampy.resample(audio, sample_rate, torchcrepe.SAMPLE_RATE)
        hop_length = int(hop_length * torchcrepe.SAMPLE_RATE / sample_rate)

    # Lazily compute and cache the A-weighting curve on first use
    if not hasattr(a_weighted, 'weights'):
        a_weighted.weights = perceptual_weights()

    # Short-time Fourier transform
    stft = librosa.stft(
        audio,
        n_fft=torchcrepe.WINDOW_SIZE,
        hop_length=hop_length,
        win_length=torchcrepe.WINDOW_SIZE,
        center=pad,
        pad_mode='constant')

    # Convert magnitudes to decibels and apply the perceptual weighting
    weighted = librosa.amplitude_to_db(np.abs(stft)) + a_weighted.weights

    # Clamp quiet bins up to the noise floor
    weighted = np.maximum(weighted, MIN_DB)

    # Average across frequency bins, return to torch on the original device,
    # and restore the batch dimension
    return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None]
|
|
|
|
|
def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights"""
    # Center frequency of each STFT bin at torchcrepe's sample rate
    frequencies = librosa.fft_frequencies(
        sr=torchcrepe.SAMPLE_RATE, n_fft=torchcrepe.WINDOW_SIZE)

    # librosa emits a RuntimeWarning for the DC bin (log of zero);
    # that is expected, so suppress it here
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        # Shift by the reference level and return a column vector so the
        # weights broadcast across STFT frames
        return librosa.A_weighting(frequencies)[:, None] - REF_DB
|
|