import warnings

import numpy as np
import resampy
import torch
import tqdm

import torchcrepe


__all__ = ['CENTS_PER_BIN',
           'MAX_FMAX',
           'PITCH_BINS',
           'SAMPLE_RATE',
           'WINDOW_SIZE',
           'UNVOICED',
           'embed',
           'embed_from_file',
           'embed_from_file_to_file',
           'embed_from_files_to_files',
           'infer',
           'predict',
           'predict_from_file',
           'predict_from_file_to_file',
           'predict_from_files_to_files',
           'preprocess',
           'postprocess',
           'resample']


###############################################################################
# Constants
###############################################################################


CENTS_PER_BIN = 20  # cents
MAX_FMAX = 2006.  # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000  # hz
WINDOW_SIZE = 1024  # samples
UNVOICED = np.nan
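
# A note on the constants: CREPE quantizes pitch into PITCH_BINS = 360 bins
# of CENTS_PER_BIN = 20 cents each (7200 cents, or six octaves), starting
# near C1 (~32.7 Hz); MAX_FMAX is roughly the frequency of the highest bin.
# A sketch of that arithmetic, assuming the ~1997.38-cent offset used in
# torchcrepe.convert:
#
#     >>> 10 * 2 ** ((20 * 359 + 1997.3794084376191) / 1200)
#     2005.5...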


###############################################################################
# Crepe pitch prediction
###############################################################################


def predict(audio,
            sample_rate,
            hop_length=None,
            fmin=50.,
            fmax=MAX_FMAX,
            model='full',
            decoder=torchcrepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=False,
            batch_size=None,
            device='cpu',
            pad=True):
    """Performs pitch estimation

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The torchcrepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    results = []

    # Postprocessing breaks gradients, so just don't compute them
    with torch.no_grad():

        # Preprocess audio
        generator = preprocess(audio,
                               sample_rate,
                               hop_length,
                               batch_size,
                               device,
                               pad)
        for frames in generator:

            # Infer independent probabilities for each pitch bin
            probabilities = infer(frames, model, device, embed=False)

            # shape=(batch, 360, time / hop_length)
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            # Convert probabilities to F0 and periodicity
            result = postprocess(probabilities,
                                 fmin,
                                 fmax,
                                 decoder,
                                 return_harmonicity,
                                 return_periodicity)

            # Place on same device as audio to allow very long inputs
            if isinstance(result, tuple):
                result = (result[0].to(audio.device),
                          result[1].to(audio.device))
            else:
                result = result.to(audio.device)

            results.append(result)

    # Split pitch and periodicity
    if return_periodicity:
        pitch, periodicity = zip(*results)
        return torch.cat(pitch, 1), torch.cat(periodicity, 1)

    # Concatenate
    return torch.cat(results, 1)
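
# Example usage (a sketch; the file name, frequency bounds, and 5 ms hop
# are assumptions, not part of this module):
#
#     audio, sr = torchcrepe.load.audio('speech.wav')
#     pitch, periodicity = predict(audio,
#                                  sr,
#                                  hop_length=int(sr / 200.),
#                                  fmin=50.,
#                                  fmax=550.,
#                                  model='tiny',
#                                  return_periodicity=True,
#                                  batch_size=2048)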


def predict_from_file(audio_file,
                      hop_length=None,
                      fmin=50.,
                      fmax=MAX_FMAX,
                      model='full',
                      decoder=torchcrepe.decode.viterbi,
                      return_harmonicity=False,
                      return_periodicity=False,
                      batch_size=None,
                      device='cpu',
                      pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Load audio
    audio, sample_rate = torchcrepe.load.audio(audio_file)

    # Predict
    return predict(audio,
                   sample_rate,
                   hop_length,
                   fmin,
                   fmax,
                   model,
                   decoder,
                   return_harmonicity,
                   return_periodicity,
                   batch_size,
                   device,
                   pad)


def predict_from_file_to_file(audio_file,
                              output_pitch_file,
                              output_harmonicity_file=None,
                              output_periodicity_file=None,
                              hop_length=None,
                              fmin=50.,
                              fmax=MAX_FMAX,
                              model='full',
                              decoder=torchcrepe.decode.viterbi,
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        output_pitch_file (string)
            The file to save predicted pitch
        output_harmonicity_file (string or None) [DEPRECATED]
            The file to save predicted harmonicity
        output_periodicity_file (string or None)
            The file to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_file
    if output_harmonicity_file is not None:
        message = (
            'The torchcrepe output_harmonicity_file argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_file. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_file = output_harmonicity_file

    # Predict from file
    prediction = predict_from_file(audio_file,
                                   hop_length,
                                   fmin,
                                   fmax,
                                   model,
                                   decoder,
                                   False,
                                   output_periodicity_file is not None,
                                   batch_size,
                                   device,
                                   pad)

    # Save to disk
    if output_periodicity_file is not None:
        torch.save(prediction[0].detach(), output_pitch_file)
        torch.save(prediction[1].detach(), output_periodicity_file)
    else:
        torch.save(prediction.detach(), output_pitch_file)
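
# The saved files are plain torch tensors, so they can be read back with
# torch.load. A sketch with hypothetical paths:
#
#     predict_from_file_to_file('speech.wav',
#                               'pitch.pt',
#                               output_periodicity_file='periodicity.pt')
#     pitch = torch.load('pitch.pt')
#     periodicity = torch.load('periodicity.pt')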


def predict_from_files_to_files(audio_files,
                                output_pitch_files,
                                output_harmonicity_files=None,
                                output_periodicity_files=None,
                                hop_length=None,
                                fmin=50.,
                                fmax=MAX_FMAX,
                                model='full',
                                decoder=torchcrepe.decode.viterbi,
                                batch_size=None,
                                device='cpu',
                                pad=True):
    """Performs pitch estimation from files on disk without reloading model

    Arguments
        audio_files (list[string])
            The files to perform pitch tracking on
        output_pitch_files (list[string])
            The files to save predicted pitch
        output_harmonicity_files (list[string] or None) [DEPRECATED]
            The files to save predicted harmonicity
        output_periodicity_files (list[string] or None)
            The files to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_files
    if output_harmonicity_files is not None:
        message = (
            'The torchcrepe output_harmonicity_files argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_files. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_files = output_harmonicity_files

    if output_periodicity_files is None:
        output_periodicity_files = len(audio_files) * [None]

    # Setup iterator
    iterator = zip(audio_files, output_pitch_files, output_periodicity_files)
    iterator = tqdm.tqdm(iterator, desc='torchcrepe', dynamic_ncols=True)
    for audio_file, output_pitch_file, output_periodicity_file in iterator:

        # Predict a file
        predict_from_file_to_file(audio_file,
                                  output_pitch_file,
                                  None,
                                  output_periodicity_file,
                                  hop_length,
                                  fmin,
                                  fmax,
                                  model,
                                  decoder,
                                  batch_size,
                                  device,
                                  pad)
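
# Example usage (a sketch; the file lists are hypothetical):
#
#     predict_from_files_to_files(
#         ['a.wav', 'b.wav'],
#         ['a_pitch.pt', 'b_pitch.pt'],
#         output_periodicity_files=['a_period.pt', 'b_period.pt'])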


###############################################################################
# Crepe pitch embedding
###############################################################################


def embed(audio,
          sample_rate,
          hop_length=None,
          model='full',
          batch_size=None,
          device='cpu',
          pad=True):
    """Embeds audio to the output of CREPE's fifth maxpool layer

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    results = []

    # Preprocess audio
    generator = preprocess(audio,
                           sample_rate,
                           hop_length,
                           batch_size,
                           device,
                           pad)
    for frames in generator:

        # Infer pitch embeddings
        embedding = infer(frames, model, device, embed=True)

        # shape=(batch, time / hop_length, 32, embedding_size)
        result = embedding.reshape(audio.size(0), frames.size(0), 32, -1)

        # Place on same device as audio to allow very long inputs
        results.append(result.to(audio.device))

    # Concatenate
    return torch.cat(results, 1)
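
# Example usage (a sketch; the file name and 10 ms hop are assumptions):
#
#     audio, sr = torchcrepe.load.audio('speech.wav')
#     embedding = embed(audio, sr, hop_length=int(sr / 100.))
#     # embedding has shape (1, frames, 32, embedding_size)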


def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Embeds audio from disk to the output of CREPE's fifth maxpool layer

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Load audio
    audio, sample_rate = torchcrepe.load.audio(audio_file)

    # Embed
    return embed(audio,
                 sample_rate,
                 hop_length,
                 model,
                 batch_size,
                 device,
                 pad)


def embed_from_file_to_file(audio_file,
                            output_file,
                            hop_length=None,
                            model='full',
                            batch_size=None,
                            device='cpu',
                            pad=True):
    """Embeds audio from disk and saves to disk

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        output_file (string)
            The file to save the embedding
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # No use computing gradients if we're just saving to disk
    with torch.no_grad():

        # Embed
        embedding = embed_from_file(audio_file,
                                    hop_length,
                                    model,
                                    batch_size,
                                    device,
                                    pad)

    # Save to disk
    torch.save(embedding.detach(), output_file)


def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds audio from disk and saves to disk without reloading model

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Setup iterator
    iterator = zip(audio_files, output_files)
    iterator = tqdm.tqdm(iterator, desc='torchcrepe', dynamic_ncols=True)
    for audio_file, output_file in iterator:

        # Embed a file
        embed_from_file_to_file(audio_file,
                                output_file,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)


###############################################################################
# Utilities
###############################################################################


def infer(frames, model='full', device='cpu', embed=False):
    """Forward pass through the model

    Arguments
        frames (torch.tensor [shape=(time / hop_length, 1024)])
            The network input
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        device (string)
            The device used to run inference
        embed (bool)
            Whether to stop inference at the intermediate embedding layer

    Returns
        logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR
        embedding (torch.tensor [shape=(1 + int(time // hop_length),
                                        embedding_size)])
    """
    # Load and cache the model on first use or when the capacity changes
    if (not hasattr(infer, 'model') or
            not hasattr(infer, 'capacity') or
            infer.capacity != model):
        torchcrepe.load.model(device, model)

    # Move model to correct device (no-op if devices are the same)
    infer.model = infer.model.to(device)

    # Apply model
    return infer.model(frames, embed=embed)
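
# A sketch of the caching behavior (random frames used purely for shape):
#
#     frames = torch.randn(128, WINDOW_SIZE)
#     infer(frames, model='tiny')  # first call loads and caches 'tiny'
#     infer(frames, model='tiny')  # reuses the cached model
#     infer(frames, model='full')  # capacity changed, so the model reloads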


def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=torchcrepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
    """Convert model output to F0 and periodicity

    Arguments
        probabilities (torch.tensor [shape=(1, 360, time / hop_length)])
            The probabilities for each pitch bin inferred by the network
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Sampling is non-differentiable, so remove from graph
    probabilities = probabilities.detach()

    # Convert the frequency range to pitch bin indices
    minidx = torchcrepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = torchcrepe.convert.frequency_to_bins(torch.tensor(fmax),
                                                  torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The torchcrepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)
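
# For example, to trade Viterbi smoothing for per-frame argmax decoding
# (both decoders are provided in torchcrepe.decode):
#
#     pitch = postprocess(probabilities, decoder=torchcrepe.decode.argmax)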


def preprocess(audio,
               sample_rate,
               hop_length=None,
               batch_size=None,
               device='cpu',
               pad=True):
    """Convert audio to model input

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)])
    """
    # Default hop length of 10 ms
    hop_length = sample_rate // 100 if hop_length is None else hop_length

    # Resample to the model's expected sample rate
    if sample_rate != SAMPLE_RATE:
        audio = resample(audio, sample_rate)
        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)

    # Get total number of frames, optionally padding so frames are centered
    if pad:
        total_frames = 1 + int(audio.size(1) // hop_length)
        audio = torch.nn.functional.pad(
            audio,
            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
    else:
        total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

    # Default to running all frames in a single batch
    batch_size = total_frames if batch_size is None else batch_size

    # Generate batches
    for i in range(0, total_frames, batch_size):

        # Batch indices
        start = max(0, i * hop_length)
        end = min(audio.size(1),
                  (i + batch_size - 1) * hop_length + WINDOW_SIZE)

        # Chunk
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, WINDOW_SIZE),
            stride=(1, hop_length))

        # shape=(batch, 1024)
        frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE)

        # Place on device
        frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        # Note: during silent frames, this produces very large values. But
        # this seems to be what the network expects.
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        yield frames
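
# Worked example of the frame arithmetic above (assumed numbers): one second
# of 16 kHz audio with the default 10 ms hop gives hop_length = 160 and,
# with padding, total_frames = 1 + 16000 // 160 = 101 frames of 1024 samples.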


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use the probability of the decoded pitch bin as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def resample(audio, sample_rate):
    """Resample audio"""
    # Store device for later placement
    device = audio.device

    # Move to cpu
    audio = audio.detach().cpu().numpy().squeeze(0)

    # Resample
    # We have to use resampy if we want the results to match Crepe
    audio = resampy.resample(audio, sample_rate, SAMPLE_RATE)

    # Convert to pytorch
    return torch.tensor(audio, device=device).unsqueeze(0)