Audio-Separator

Running

App Files Files Community

Audio-Separator / look2hear /layers /enc_dec.py

fffiloni

Migrated from GitHub

406f22d verified 23 days ago

raw

history blame contribute delete

17.6 kB

	import warnings
	from typing import Optional
	import torch
	from torch import nn
	from torch.nn import functional as F


	def make_enc_dec(
	    fb_name,
	    n_filters,
	    kernel_size,
	    stride=None,
	    sample_rate=8000.0,
	    who_is_pinv=None,
	    padding=0,
	    output_padding=0,
	    **kwargs,
	):
	    """Creates congruent encoder and decoder from the same filterbank family.

	    Args:
	        fb_name (str, className): Filterbank family from which to make encoder
	            and decoder. Either a name resolvable by :func:`get` (``'free'``
	            is defined in this module; other names may have been registered)
	            or directly a filterbank class / callable.
	        n_filters (int): Number of filters.
	        kernel_size (int): Length of the filters.
	        stride (int, optional): Stride of the convolution.
	            If None (default), set to ``kernel_size // 2``.
	        sample_rate (float): Sample rate of the expected audio.
	            Defaults to 8000.0.
	        who_is_pinv (str, optional): If ``'enc'`` / ``'encoder'``, the encoder
	            is the pseudo inverse of the decoder filterbank. If ``'dec'`` /
	            ``'decoder'``, the decoder is the pseudo inverse of the encoder
	            filterbank. Any other value (including `None`) builds two
	            independent filterbanks and no pseudo-inverse is used.
	        padding (int): Zero-padding added to both sides of the input.
	            Passed to Encoder and Decoder, including the pseudo-inverse one.
	        output_padding (int): Additional size added to one side of the output
	            shape. Passed to Decoder, including when it is the pseudo-inverse.
	        **kwargs: Arguments which will be passed to the filterbank class
	            additionally to the usual `n_filters`, `kernel_size` and `stride`.
	            Depends on the filterbank family.

	    Returns:
	        :class:`.Encoder`, :class:`.Decoder`
	    """
	    fb_class = get(fb_name)

	    def new_filterbank():
	        # Each call builds a fresh, independent filterbank instance.
	        return fb_class(
	            n_filters, kernel_size, stride=stride, sample_rate=sample_rate, **kwargs
	        )

	    if who_is_pinv in ["dec", "decoder"]:
	        fb = new_filterbank()
	        enc = Encoder(fb, padding=padding)
	        # Decoder filterbank is pseudo inverse of encoder filterbank.
	        # Built through the constructor so that `padding` and
	        # `output_padding` are not silently dropped.
	        dec = Decoder(
	            fb, is_pinv=True, padding=padding, output_padding=output_padding
	        )
	    elif who_is_pinv in ["enc", "encoder"]:
	        fb = new_filterbank()
	        dec = Decoder(fb, padding=padding, output_padding=output_padding)
	        # Encoder filterbank is pseudo inverse of decoder filterbank.
	        # `padding` is forwarded here as well, as documented.
	        enc = Encoder(fb, is_pinv=True, padding=padding)
	    else:
	        # Filters between encoder and decoder should not be shared, hence
	        # two separate filterbank instances.
	        enc = Encoder(new_filterbank(), padding=padding)
	        dec = Decoder(
	            new_filterbank(), padding=padding, output_padding=output_padding
	        )
	    return enc, dec


	def register_filterbank(custom_fb):
	    """Make a custom filterbank retrievable by name through :func:`get`.

	    The object is stored in this module's global namespace under its own
	    ``__name__``.

	    Args:
	        custom_fb: Custom filterbank to register.

	    Raises:
	        ValueError: If ``custom_fb.__name__`` or its lowercase form is
	            already a name in this module's globals.
	    """
	    namespace = globals()
	    fb_name = custom_fb.__name__
	    for candidate in (fb_name, fb_name.lower()):
	        if candidate in namespace:
	            raise ValueError(
	                f"Filterbank {custom_fb.__name__} already exists. Choose another name."
	            )
	    namespace[fb_name] = custom_fb


	def get(identifier):
	    """Resolve a filterbank identifier to the object it designates.

	    Args:
	        identifier (str or Callable or None): the filterbank identifier.
	            `None` and callables are returned unchanged; a string is looked
	            up in this module's global namespace.

	    Returns:
	        :class:`.Filterbank` or None

	    Raises:
	        ValueError: If the string is not a known global name (or maps to
	            `None`), or if the identifier has an unsupported type.
	    """
	    if identifier is None:
	        return None
	    if callable(identifier):
	        return identifier
	    if isinstance(identifier, str):
	        found = globals().get(identifier)
	        if found is not None:
	            return found
	    # Unknown names and unsupported types share the same error.
	    raise ValueError(f"Could not interpret filterbank identifier: {identifier!s}")


	class Filterbank(nn.Module):
	    """Common base class of all filterbanks.

	    Subclasses supply the actual filters by overriding :meth:`filters`. The
	    ``pre_*`` / ``post_*`` hooks are identity functions here and can be
	    overridden to transform the signal around the (transposed) convolution.

	    Args:
	        n_filters (int): Number of filters.
	        kernel_size (int): Length of the filters.
	        stride (int, optional): Stride of the conv or transposed conv (hop
	            size). Any falsy value (`None`, the default, or 0) is replaced
	            by ``kernel_size // 2``.
	        sample_rate (float): Sample rate of the expected audio.
	            Defaults to 8000.

	    Attributes:
	        n_feats_out (int): Number of output filters.
	    """

	    def __init__(self, n_filters, kernel_size, stride=None, sample_rate=8000.0):
	        super().__init__()
	        self.n_filters = n_filters
	        self.kernel_size = kernel_size
	        self.sample_rate = sample_rate
	        # Falsy stride falls back to half of the kernel length.
	        self.stride = stride or self.kernel_size // 2
	        # Unless a subclass overrides it, there is one output feature
	        # per requested filter.
	        self.n_feats_out = n_filters

	    def filters(self):
	        """Return the filters; must be implemented by subclasses."""
	        raise NotImplementedError

	    def pre_analysis(self, wav: torch.Tensor):
	        """Hook applied to the waveform before the encoder convolution."""
	        return wav

	    def post_analysis(self, spec: torch.Tensor):
	        """Hook applied to the output of the encoder convolution."""
	        return spec

	    def pre_synthesis(self, spec: torch.Tensor):
	        """Hook applied before the decoder transposed convolution."""
	        return spec

	    def post_synthesis(self, wav: torch.Tensor):
	        """Hook applied after the decoder transposed convolution."""
	        return wav

	    def get_config(self):
	        """Return the keyword arguments needed to re-create this filterbank.

	        Subclasses taking more arguments than ``n_filters``,
	        ``kernel_size``, ``stride`` and ``sample_rate`` need to extend it.
	        """
	        return dict(
	            fb_name=type(self).__name__,
	            n_filters=self.n_filters,
	            kernel_size=self.kernel_size,
	            stride=self.stride,
	            sample_rate=self.sample_rate,
	        )

	    def forward(self, waveform):
	        # A bare filterbank is not meant to be called directly.
	        raise NotImplementedError(
	            "Filterbanks must be wrapped with an Encoder or a Decoder."
	        )


	class _EncDec(nn.Module):
	"""Base private class for Encoder and Decoder.
	Common parameters and methods.
	Args:
	filterbank (:class:`Filterbank`): Filterbank instance. The filterbank
	to use as an encoder or a decoder.
	is_pinv (bool): Whether to be the pseudo inverse of filterbank.
	Attributes:
	filterbank (:class:`Filterbank`)
	stride (int)
	is_pinv (bool)
	"""

	def __init__(self, filterbank, is_pinv=False):
	super(_EncDec, self).__init__()
	self.filterbank = filterbank
	self.sample_rate = getattr(filterbank, "sample_rate", None)
	self.stride = self.filterbank.stride
	self.is_pinv = is_pinv

	def filters(self):
	return self.filterbank.filters()

	def compute_filter_pinv(self, filters):
	"""Computes pseudo inverse filterbank of given filters."""
	scale = self.filterbank.stride / self.filterbank.kernel_size
	shape = filters.shape
	ifilt = torch.pinverse(filters.squeeze()).transpose(-1, -2).view(shape)
	# Compensate for the overlap-add.
	return ifilt * scale

	def get_filters(self):
	"""Returns filters or pinv filters depending on `is_pinv` attribute"""
	if self.is_pinv:
	return self.compute_filter_pinv(self.filters())
	else:
	return self.filters()

	def get_config(self):
	"""Returns dictionary of arguments to re-instantiate the class."""
	config = {"is_pinv": self.is_pinv}
	base_config = self.filterbank.get_config()
	return dict(list(base_config.items()) + list(config.items()))


	class Encoder(_EncDec):
	    r"""Encoder class.

	    Add encoding methods to Filterbank classes.
	    Not intended to be subclassed.

	    Args:
	        filterbank (:class:`Filterbank`): The filterbank to use
	            as an encoder.
	        is_pinv (bool): Whether to be the pseudo inverse of filterbank.
	        as_conv1d (bool): Whether to behave like nn.Conv1d.
	            If True (default), forwarding input with shape :math:`(batch, 1, time)`
	            will output a tensor of shape :math:`(batch, freq, conv\_time)`.
	            If False, will output a tensor of shape :math:`(batch, 1, freq, conv\_time)`.
	        padding (int): Zero-padding added to both sides of the input.
	    """

	    def __init__(self, filterbank, is_pinv=False, as_conv1d=True, padding=0):
	        super().__init__(filterbank, is_pinv=is_pinv)
	        self.as_conv1d = as_conv1d
	        self.n_feats_out = self.filterbank.n_feats_out
	        self.kernel_size = self.filterbank.kernel_size
	        self.padding = padding

	    @classmethod
	    def pinv_of(cls, filterbank, **kwargs):
	        """Returns an :class:`~.Encoder`, pseudo inverse of a
	        :class:`~.Filterbank` or :class:`~.Decoder`.

	        Args:
	            filterbank (:class:`Filterbank` or :class:`Decoder`): Object to
	                take the pseudo inverse of. For a decoder, its wrapped
	                filterbank is used.
	            **kwargs: Forwarded to the :class:`Encoder` constructor
	                (e.g. ``as_conv1d``, ``padding``).

	        Raises:
	            TypeError: If ``filterbank`` is neither a :class:`Filterbank`
	                nor a :class:`Decoder`.
	        """
	        if isinstance(filterbank, Decoder):
	            filterbank = filterbank.filterbank
	        elif not isinstance(filterbank, Filterbank):
	            # Fail loudly instead of implicitly returning None, which would
	            # only surface later as an unrelated error in the caller.
	            raise TypeError(
	                "Cannot build a pseudo-inverse Encoder from "
	                f"{type(filterbank).__name__}; expected a Filterbank or a Decoder."
	            )
	        return cls(filterbank, is_pinv=True, **kwargs)

	    def forward(self, waveform):
	        """Convolve input waveform with the filters from a filterbank.

	        Args:
	            waveform (:class:`torch.Tensor`): any tensor with samples along the
	                last dimension, i.e. the waveform representation with any
	                batch/channel/etc. leading dimensions.

	        Returns:
	            :class:`torch.Tensor`: The corresponding TF domain signal.

	        Shapes (input -> output)::

	            (time, ) -> (freq, conv_time)
	            (batch, time) -> (batch, freq, conv_time)  # Avoid
	            if as_conv1d:
	                (batch, 1, time) -> (batch, freq, conv_time)
	                (batch, chan, time) -> (batch, chan, freq, conv_time)
	            else:
	                (batch, chan, time) -> (batch, chan, freq, conv_time)
	            (batch, any, dim, time) -> (batch, any, dim, freq, conv_time)
	        """
	        filters = self.get_filters()
	        # The filterbank may transform the signal before and after the
	        # convolution through its pre/post analysis hooks.
	        waveform = self.filterbank.pre_analysis(waveform)
	        spec = multishape_conv1d(
	            waveform,
	            filters=filters,
	            stride=self.stride,
	            padding=self.padding,
	            as_conv1d=self.as_conv1d,
	        )
	        return self.filterbank.post_analysis(spec)


	def multishape_conv1d(
	    waveform: torch.Tensor,
	    filters: torch.Tensor,
	    stride: int,
	    padding: int = 0,
	    as_conv1d: bool = True,
	) -> torch.Tensor:
	    """Apply ``F.conv1d`` to waveforms of any dimensionality.

	    Args:
	        waveform: Tensor with samples along the last dimension.
	        filters: Convolution filters, passed as the weight of ``F.conv1d``.
	        stride: Stride of the convolution.
	        padding: Zero-padding added to both sides of the input.
	        as_conv1d: If True, a ``(batch, 1, time)`` input yields
	            ``(batch, freq, conv_time)`` like ``nn.Conv1d``; otherwise the
	            channel dimension is kept.

	    Returns:
	        The convolved signal. Shapes (input -> output)::

	            (time,) -> (freq, conv_time)
	            (batch, time) -> (batch, freq, conv_time)  # emits a warning
	            (batch, 1, time) -> (batch, freq, conv_time) if as_conv1d
	            (batch, chan, time) -> (batch, chan, freq, conv_time)
	            (*, time) -> (*, freq, conv_time) for more than 3 dimensions
	    """
	    if waveform.ndim == 1:
	        # Assumes 1D input with shape (time,)
	        # Output will be (freq, conv_time). Only the artificial batch axis
	        # is removed: a bare ``squeeze()`` would also drop the freq or time
	        # axis when there is a single filter or a single frame.
	        return F.conv1d(
	            waveform[None, None], filters, stride=stride, padding=padding
	        ).squeeze(0)
	    if waveform.ndim == 2:
	        # Assume 2D input with shape (batch or channels, time)
	        # Output will be (batch or channels, freq, conv_time)
	        warnings.warn(
	            "Input tensor was 2D. Applying the corresponding "
	            "Decoder to the current output will result in a 3D "
	            "tensor. This behaviours was introduced to match "
	            "Conv1D and ConvTranspose1D, please use 3D inputs "
	            "to avoid it. For example, this can be done with "
	            "input_tensor.unsqueeze(1)."
	        )
	        return F.conv1d(waveform.unsqueeze(1), filters, stride=stride, padding=padding)
	    if waveform.ndim == 3 and waveform.shape[1] == 1 and as_conv1d:
	        # That's the common single channel case (batch, 1, time)
	        # Output will be (batch, freq, stft_time), behaves as Conv1D
	        return F.conv1d(waveform, filters, stride=stride, padding=padding)
	    # Remaining cases are batched convolutions:
	    # - 3D multichannel input, e.g. (batch, 3, time) -> (b, 3, f, conv_t);
	    #   with as_conv1d=False, (batch, 1, time) -> (batch, 1, freq, conv_time),
	    #   useful for consistency.
	    # - "multi"multichannel input with more than 3 dimensions:
	    #   (*, time) -> (*, freq, conv_time).
	    return batch_packed_1d_conv(waveform, filters, stride=stride, padding=padding)


	def batch_packed_1d_conv(
	    inp: torch.Tensor, filters: torch.Tensor, stride: int = 1, padding: int = 0
	) -> torch.Tensor:
	    """Convolve every 1D signal of a multi-dimensional tensor.

	    All leading dimensions are packed into the batch dimension, a single
	    ``F.conv1d`` is run, and the leading dimensions are then restored.

	    Args:
	        inp: Tensor of shape ``(*, time)``.
	        filters: Convolution filters, passed as the weight of ``F.conv1d``.
	        stride: Stride of the convolution.
	        padding: Zero-padding added to both sides of the input.

	    Returns:
	        Tensor of shape ``(*, freq, conv_time)``, e.g.
	        ``(batch, channels, freq, conv_time)`` for a 3D input.
	    """
	    # ``reshape`` rather than ``view``: ``view`` raises on non-contiguous
	    # inputs (e.g. after a transpose), ``reshape`` copies only when needed.
	    packed = inp.reshape(-1, 1, inp.shape[-1])
	    batched_conv = F.conv1d(packed, filters, stride=stride, padding=padding)
	    output_shape = inp.shape[:-1] + batched_conv.shape[-2:]
	    return batched_conv.reshape(output_shape)


	class Decoder(_EncDec):
	    """Decoder class.

	    Add decoding methods to Filterbank classes.
	    Not intended to be subclassed.

	    Args:
	        filterbank (:class:`Filterbank`): The filterbank to use as a decoder.
	        is_pinv (bool): Whether to be the pseudo inverse of filterbank.
	        padding (int): Zero-padding added to both sides of the input.
	        output_padding (int): Additional size added to one side of the
	            output shape.

	    .. note::
	        ``padding`` and ``output_padding`` arguments are directly passed to
	        ``F.conv_transpose1d``.
	    """

	    def __init__(self, filterbank, is_pinv=False, padding=0, output_padding=0):
	        super().__init__(filterbank, is_pinv=is_pinv)
	        self.padding = padding
	        self.output_padding = output_padding

	    @classmethod
	    def pinv_of(cls, filterbank, **kwargs):
	        """Returns a Decoder, pseudo inverse of a filterbank or Encoder.

	        Args:
	            filterbank (:class:`Filterbank` or :class:`Encoder`): Object to
	                take the pseudo inverse of. For an encoder, its wrapped
	                filterbank is used.
	            **kwargs: Forwarded to the :class:`Decoder` constructor
	                (e.g. ``padding``, ``output_padding``).

	        Raises:
	            TypeError: If ``filterbank`` is neither a :class:`Filterbank`
	                nor an :class:`Encoder`.
	        """
	        if isinstance(filterbank, Encoder):
	            filterbank = filterbank.filterbank
	        elif not isinstance(filterbank, Filterbank):
	            # Fail loudly instead of implicitly returning None, which would
	            # only surface later as an unrelated error in the caller.
	            raise TypeError(
	                "Cannot build a pseudo-inverse Decoder from "
	                f"{type(filterbank).__name__}; expected a Filterbank or an Encoder."
	            )
	        return cls(filterbank, is_pinv=True, **kwargs)

	    def forward(self, spec, length: Optional[int] = None) -> torch.Tensor:
	        """Applies transposed convolution to a TF representation.

	        This is equivalent to overlap-add.

	        Args:
	            spec (:class:`torch.Tensor`): 3D or 4D Tensor. The TF
	                representation. (Output of :func:`Encoder.forward`).
	            length: desired output length. The output is truncated to at
	                most ``length`` samples along the last dimension; it is
	                never padded when shorter.

	        Returns:
	            :class:`torch.Tensor`: The corresponding time domain signal.
	        """
	        filters = self.get_filters()
	        # The filterbank may transform the signal before and after the
	        # transposed convolution through its pre/post synthesis hooks.
	        spec = self.filterbank.pre_synthesis(spec)
	        wav = multishape_conv_transpose1d(
	            spec,
	            filters,
	            stride=self.stride,
	            padding=self.padding,
	            output_padding=self.output_padding,
	        )
	        wav = self.filterbank.post_synthesis(wav)
	        if length is not None:
	            length = min(length, wav.shape[-1])
	            return wav[..., :length]
	        return wav


	def multishape_conv_transpose1d(
	    spec: torch.Tensor,
	    filters: torch.Tensor,
	    stride: int = 1,
	    padding: int = 0,
	    output_padding: int = 0,
	) -> torch.Tensor:
	    """Apply ``F.conv_transpose1d`` to TF representations of any rank >= 2.

	    Args:
	        spec: Tensor whose last two dimensions are ``(freq, conv_time)``.
	        filters: Filters, passed as the weight of ``F.conv_transpose1d``.
	        stride: Stride of the transposed convolution.
	        padding: Passed to ``F.conv_transpose1d``.
	        output_padding: Passed to ``F.conv_transpose1d``.

	    Returns:
	        The time domain signal: all singleton dimensions squeezed out for a
	        2D input, the raw transposed convolution output for a 3D input,
	        and ``(*, time)`` for inputs of shape ``(*, freq, conv_time)`` with
	        more dimensions.
	    """
	    conv_kwargs = dict(stride=stride, padding=padding, output_padding=output_padding)
	    n_dims = spec.ndim
	    if n_dims == 2:
	        # Input is (freq, conv_time), output is (time)
	        batched = spec.unsqueeze(0)
	        return F.conv_transpose1d(batched, filters, **conv_kwargs).squeeze()
	    if n_dims == 3:
	        # Input is (batch, freq, conv_time), output is (batch, 1, time)
	        return F.conv_transpose1d(spec, filters, **conv_kwargs)
	    # Fold every leading dimension into the batch, run the transposed
	    # convolution once, then unfold the leading dimensions again.
	    leading_shape = spec.shape[:-2]
	    folded = spec.reshape((-1,) + spec.shape[-2:])
	    wav = F.conv_transpose1d(folded, filters, **conv_kwargs)
	    return wav.view(leading_shape + (-1,))


	class FreeFB(Filterbank):
	    """Unconstrained, fully learnable filterbank. Equivalent to
	    :class:`nn.Conv1d`.

	    The filters are a single learnable parameter of shape
	    ``(n_filters, 1, kernel_size)`` initialised with Xavier normal
	    initialisation.

	    Args:
	        n_filters (int): Number of filters.
	        kernel_size (int): Length of the filters.
	        stride (int, optional): Stride of the convolution.
	            If None (default), set to ``kernel_size // 2``.
	        sample_rate (float): Sample rate of the expected audio.
	            Defaults to 8000.
	        **kwargs: Accepted and ignored.

	    Attributes:
	        n_feats_out (int): Number of output filters.

	    References
	        [1] : "Filterbank design for end-to-end speech separation". ICASSP 2020.
	        Manuel Pariente, Samuele Cornell, Antoine Deleforge, Emmanuel Vincent.
	    """

	    def __init__(
	        self, n_filters, kernel_size, stride=None, sample_rate=8000.0, **kwargs
	    ):
	        super().__init__(n_filters, kernel_size, stride=stride, sample_rate=sample_rate)
	        initial_weights = torch.ones(n_filters, 1, kernel_size)
	        self._filters = nn.Parameter(initial_weights)
	        # The filters are the only parameter of this module at this point;
	        # overwrite the placeholder values with the Xavier normal init.
	        nn.init.xavier_normal_(self._filters)

	    def filters(self):
	        """Return the learnable filters parameter."""
	        return self._filters


	# Lowercase module-level alias: lets the string identifier ``"free"`` resolve
	# to FreeFB through ``get``, which looks names up in this module's globals.
	free = FreeFB