Sulai2005's picture
Initial commit
506a2b4
from functools import lru_cache
from scipy import signal
import numpy as np
import librosa
@lru_cache()
def mel_basis(hp):
    """Return the mel filterbank matrix for the settings in ``hp``.

    Reads ``sample_rate``, ``n_fft``, ``num_mels``, ``fmin`` and ``fmax`` from
    ``hp`` and forwards them to ``librosa.filters.mel``. The result is
    memoised with ``lru_cache``, so ``hp`` has to be hashable.

    Returns:
        The filterbank produced by librosa -- (nmel, nfreq).

    Raises:
        AssertionError: if ``hp.fmax`` is above ``hp.sample_rate // 2``.
    """
    upper_freq = hp.fmax
    half_rate = hp.sample_rate // 2
    assert upper_freq <= half_rate

    filter_kwargs = {
        "sr": hp.sample_rate,
        "n_fft": hp.n_fft,
        "n_mels": hp.num_mels,
        "fmin": hp.fmin,
        "fmax": hp.fmax,
    }
    return librosa.filters.mel(**filter_kwargs)  # -> (nmel, nfreq)
def preemphasis(wav, hp):
    """Apply a first-order pre-emphasis filter to ``wav`` and clip the result.

    The filter is ``y[n] = x[n] - hp.preemphasis * x[n - 1]`` (numerator
    ``[1, -hp.preemphasis]``, denominator ``[1]``), after which every sample
    is clipped to the range [-1, 1].

    Raises:
        AssertionError: if ``hp.preemphasis`` equals 0.
    """
    coeff = hp.preemphasis
    assert coeff != 0
    filtered = signal.lfilter([1, -coeff], [1], wav)
    return np.clip(filtered, -1, 1)
def melspectrogram(wav, hp, pad=True):
    """Compute a mel spectrogram of ``wav`` using the settings in ``hp``.

    Steps, each controlled by ``hp``:
      1. pre-emphasis when ``hp.preemphasis > 0``;
      2. STFT magnitudes, raised to ``hp.mel_power`` when it is not 1.0;
      3. projection onto the mel filterbank from ``mel_basis``;
      4. conversion to decibels when ``hp.mel_type == "db"``;
      5. normalisation and a cast to float32 when ``hp.normalized_mels``.

    Args:
        wav: waveform samples.
        hp: hyper-parameter object providing the attributes read above,
            plus ``hop_size`` for the frame-count check.
        pad: forwarded to ``_stft``; when truthy the frame count is checked.

    Returns:
        The mel spectrogram -- (M, T).
    """
    # Run through pre-emphasis
    if hp.preemphasis > 0:
        wav = preemphasis(wav, hp)
        assert np.abs(wav).max() - 1 < 1e-07

    # STFT, then keep only the magnitudes
    magnitudes = np.abs(_stft(wav, hp, pad=pad))
    if hp.mel_power != 1.0:
        magnitudes **= hp.mel_power

    # Project onto the mel filterbank and optionally convert magnitudes->db
    mel = np.dot(mel_basis(hp), magnitudes)
    if hp.mel_type == "db":
        mel = _amp_to_db(mel, hp)

    # Normalise the mel from db to 0,1
    if hp.normalized_mels:
        mel = _normalize(mel, hp).astype(np.float32)

    # Sanity check on the number of frames
    if pad:
        expected_frames = 1 + len(wav) // hp.hop_size
        assert mel.shape[1] == expected_frames
    return mel  # (M, T)
def _stft(y, hp, pad=True):
    """Run ``librosa.stft`` on ``y`` with the frame settings from ``hp``.

    ``hp.n_fft``, ``hp.hop_size`` and ``hp.win_size`` are passed as
    ``n_fft``, ``hop_length`` and ``win_length``; ``pad`` is passed as
    librosa's ``center`` argument.
    """
    # NOTE: after 0.8, pad mode defaults to constant, setting this to reflect for
    # historical consistency and streaming-version consistency
    stft_kwargs = {
        "n_fft": hp.n_fft,
        "hop_length": hp.hop_size,
        "win_length": hp.win_size,
        "center": pad,
        "pad_mode": "reflect",
    }
    return librosa.stft(y, **stft_kwargs)
def _amp_to_db(x, hp):
    """Convert amplitudes to decibels as ``20 * log10(x)``.

    Values below ``hp.stft_magnitude_min`` are raised to that floor first,
    which keeps the logarithm away from zero.
    """
    floored = np.maximum(hp.stft_magnitude_min, x)
    return 20 * np.log10(floored)
def _db_to_amp(x):
    """Convert decibels back to amplitudes as ``10 ** (x * 0.05)``."""
    exponent = x * 0.05
    return np.power(10.0, exponent)
def _normalize(s, hp, headroom_db=15):
    """Rescale decibel values ``s`` linearly.

    The floor level ``20 * log10(hp.stft_magnitude_min)`` maps to 0 and
    ``headroom_db`` maps to 1. Values outside that range are not clipped.
    """
    floor_db = 20 * np.log10(hp.stft_magnitude_min)
    span_db = headroom_db - floor_db
    return (s - floor_db) / span_db