lj1995 committed
Commit 48ddb2e · 1 Parent(s): 6c168a1

Delete eres2net
eres2net/ERes2Net.py DELETED
@@ -1,260 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""
Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve performance.
Local feature fusion (LFF) fuses the features within a single residual block to extract the local signal.
Global feature fusion (GFF) takes acoustic features of different scales as input to aggregate the global signal.
"""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import pooling_layers as pooling_layers
from fusion import AFF


class ReLU(nn.Hardtanh):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        inplace_str = 'inplace' if self.inplace else ''
        return self.__class__.__name__ + ' (' + inplace_str + ')'


class BasicBlockERes2Net(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockERes2Net, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        # Split channels into `scale` groups and process them hierarchically.
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class BasicBlockERes2Net_diff_AFF(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockERes2Net_diff_AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        fuse_models = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        for j in range(self.nums - 1):
            fuse_models.append(AFF(channels=width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.fuse_models = nn.ModuleList(fuse_models)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                # Fuse the running branch output with the next split via AFF
                # instead of plain addition (the LFF of the paper).
                sp = self.fuse_models[i - 1](sp, spx[i])
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class ERes2Net(nn.Module):
    def __init__(self,
                 block=BasicBlockERes2Net,
                 block_fuse=BasicBlockERes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=32,
                 feat_dim=80,
                 embedding_size=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)

        # Downsampling module for each layer
        self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False)
        self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
        self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)

        # Bottom-up fusion module (the GFF of the paper)
        self.fuse_mode12 = AFF(channels=m_channels * 4)
        self.fuse_mode123 = AFF(channels=m_channels * 8)
        self.fuse_mode1234 = AFF(channels=m_channels * 16)

        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a

    def forward3(self, x):
        # Like forward(), but skips pooling/projection and returns the
        # frame-averaged fused feature map instead.
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
        return fuse_out1234


if __name__ == '__main__':
    x = torch.zeros(10, 300, 80)
    model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
    model.eval()
    out = model(x)
    print(out.shape)  # torch.Size([10, 192])

    num_params = sum(param.numel() for param in model.parameters())
    print("{} M".format(num_params / 1e6))  # 6.61M
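For reference, a quick sanity check of the deleted model is to feed it random fbank-shaped input. This is a sketch, not part of the commit: it assumes it is run from inside the deleted eres2net/ directory so that pooling_layers and fusion resolve, and the printed shapes follow from the default configuration above.

    import torch
    from ERes2Net import ERes2Net  # the deleted module above

    model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
    model.eval()
    feats = torch.randn(1, 300, 80)  # (batch, frames, mel bins)
    with torch.no_grad():
        emb = model(feats)             # utterance embedding, (1, 192)
        fused = model.forward3(feats)  # frame-averaged fused features, (1, 5120)
    print(emb.shape, fused.shape)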
eres2net/ERes2NetV2.py DELETED
@@ -1,292 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""
To further improve the short-duration feature extraction capability of ERes2Net, we expand the channel dimension
within each stage. However, this modification also increases the number of model parameters and the computational
complexity. To alleviate this problem, we propose an improved ERes2NetV2 that prunes redundant structures,
ultimately reducing both the model parameters and the computational cost.
"""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import pooling_layers as pooling_layers
from fusion import AFF


class ReLU(nn.Hardtanh):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        inplace_str = 'inplace' if self.inplace else ''
        return self.__class__.__name__ + ' (' + inplace_str + ')'


class BasicBlockERes2NetV2(nn.Module):

    def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
        super(BasicBlockERes2NetV2, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale
        self.expansion = expansion

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class BasicBlockERes2NetV2AFF(nn.Module):

    def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
        super(BasicBlockERes2NetV2AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale
        self.expansion = expansion

        convs = []
        fuse_models = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        for j in range(self.nums - 1):
            fuse_models.append(AFF(channels=width, r=4))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.fuse_models = nn.ModuleList(fuse_models)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = self.fuse_models[i - 1](sp, spx[i])
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class ERes2NetV2(nn.Module):
    def __init__(self,
                 block=BasicBlockERes2NetV2,
                 block_fuse=BasicBlockERes2NetV2AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=64,
                 feat_dim=80,
                 embedding_size=192,
                 baseWidth=26,
                 scale=2,
                 expansion=2,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2NetV2, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer
        self.baseWidth = baseWidth
        self.scale = scale
        self.expansion = expansion

        self.conv1 = nn.Conv2d(1,
                               m_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(block,
                                       m_channels,
                                       num_blocks[0],
                                       stride=1)
        self.layer2 = self._make_layer(block,
                                       m_channels * 2,
                                       num_blocks[1],
                                       stride=2)
        self.layer3 = self._make_layer(block_fuse,
                                       m_channels * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block_fuse,
                                       m_channels * 8,
                                       num_blocks[3],
                                       stride=2)

        # Downsampling module (only layer3 -> layer4 is fused in V2)
        self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * self.expansion, kernel_size=3,
                                   padding=1, stride=2, bias=False)

        # Bottom-up fusion module
        self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)

        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * self.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion))
            self.in_planes = planes * self.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)
        out3_ds = self.layer3_ds(out3)
        fuse_out34 = self.fuse34(out4, out3_ds)
        stats = self.pool(fuse_out34)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a

    def forward3(self, x):
        # Returns the frame-averaged fused feature map instead of an embedding.
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)
        out3_ds = self.layer3_ds(out3)
        fuse_out34 = self.fuse34(out4, out3_ds)
        return fuse_out34.flatten(start_dim=1, end_dim=2).mean(-1)


if __name__ == '__main__':
    from thop import profile  # needed for the MACs/params measurement below (pip install thop)

    x = torch.randn(1, 300, 80)
    model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
    model.eval()
    y = model(x)
    print(y.size())
    macs, num_params = profile(model, inputs=(x,))
    print("Params: {} M".format(num_params / 1e6))  # 17.86 M
    print("MACs: {} G".format(macs / 1e9))  # 12.69 G
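A hedged shape check of the V2 variant, under the same assumption that the deleted files are on the import path; the expected shape follows from the defaults above (one 192-dim embedding per utterance):

    import torch
    from ERes2NetV2 import ERes2NetV2  # the deleted module above

    model = ERes2NetV2(feat_dim=80, embedding_size=192).eval()
    with torch.no_grad():
        emb = model(torch.randn(2, 200, 80))  # (batch, frames, mel bins)
    assert emb.shape == (2, 192)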
eres2net/ERes2Net_huge.py DELETED
@@ -1,286 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve performance.
Local feature fusion (LFF) fuses the features within a single residual block to extract the local signal.
Global feature fusion (GFF) takes acoustic features of different scales as input to aggregate the global signal.
ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
recognition performance. The expansion, baseWidth, and scale parameters can be modified to obtain optimal performance.
"""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import pooling_layers as pooling_layers
from fusion import AFF


class ReLU(nn.Hardtanh):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        inplace_str = 'inplace' if self.inplace else ''
        return self.__class__.__name__ + ' (' + inplace_str + ')'


class BasicBlockERes2Net(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
        super(BasicBlockERes2Net, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class BasicBlockERes2Net_diff_AFF(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
        super(BasicBlockERes2Net_diff_AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        fuse_models = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
            bns.append(nn.BatchNorm2d(width))
        for j in range(self.nums - 1):
            fuse_models.append(AFF(channels=width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.fuse_models = nn.ModuleList(fuse_models)
        self.relu = ReLU(inplace=True)

        self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = self.fuse_models[i - 1](sp, spx[i])
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class ERes2Net(nn.Module):
    def __init__(self,
                 block=BasicBlockERes2Net,
                 block_fuse=BasicBlockERes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=64,
                 feat_dim=80,
                 embedding_size=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)

        self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)

        self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
        self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
        self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False)

        self.fuse_mode12 = AFF(channels=m_channels * 8)
        self.fuse_mode123 = AFF(channels=m_channels * 16)
        self.fuse_mode1234 = AFF(channels=m_channels * 32)

        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a

    def forward2(self, x, if_mean):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2)  # (bs, 20480, T)
        if not if_mean:
            mean = fuse_out1234[0].transpose(1, 0)  # (T, 20480), bs = T
        else:
            mean = fuse_out1234.mean(2)  # (bs, 20480)
        # Zero-fill the std half of the pooled statistics before projecting.
        mean_std = torch.cat([mean, torch.zeros_like(mean)], 1)
        return self.seg_1(mean_std)  # (T, 192)

    def forward3(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2).mean(-1)
        return fuse_out1234
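forward2() is the one method unique to this variant: with if_mean=False it projects each frame separately (the code indexes batch element 0, so batch size should be 1), and with if_mean=True it projects the utterance-level mean. A hedged usage sketch under the same import-path assumption as above:

    import torch
    from ERes2Net_huge import ERes2Net  # the deleted module above

    model = ERes2Net(feat_dim=80, embedding_size=192).eval()
    feats = torch.randn(1, 300, 80)  # (batch, frames, mel bins)
    with torch.no_grad():
        per_frame = model.forward2(feats, if_mean=False)  # (T', 192), one row per frame
        per_utt = model.forward2(feats, if_mean=True)     # (1, 192)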
eres2net/fusion.py DELETED
@@ -1,29 +0,0 @@
# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import torch
import torch.nn as nn


class AFF(nn.Module):

    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x, ds_y):
        xa = torch.cat((x, ds_y), dim=1)
        x_att = self.local_att(xa)
        x_att = 1.0 + torch.tanh(x_att)
        xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)

        return xo
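AFF computes a per-position gate a = 1 + tanh(·) in (0, 2) from the concatenated inputs and blends them as x·a + ds_y·(2−a), so the two weights always sum to 2. A minimal smoke test of the module above (a sketch; the tensor shapes are arbitrary):

    import torch
    from fusion import AFF  # the deleted module above

    aff = AFF(channels=64, r=4)
    x = torch.randn(2, 64, 10, 25)     # features at the current scale
    ds_y = torch.randn(2, 64, 10, 25)  # downsampled features from an earlier stage
    out = aff(x, ds_y)                 # attention-weighted blend, same shape as x
    print(out.shape)  # torch.Size([2, 64, 10, 25])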
eres2net/kaldi.py DELETED
@@ -1,819 +0,0 @@
1
- import math
2
- from typing import Tuple
3
-
4
- import torch
5
- import torchaudio
6
- from torch import Tensor
7
-
8
- __all__ = [
9
- "get_mel_banks",
10
- "inverse_mel_scale",
11
- "inverse_mel_scale_scalar",
12
- "mel_scale",
13
- "mel_scale_scalar",
14
- "spectrogram",
15
- "fbank",
16
- "mfcc",
17
- "vtln_warp_freq",
18
- "vtln_warp_mel_freq",
19
- ]
20
-
21
- # numeric_limits<float>::epsilon() 1.1920928955078125e-07
22
- EPSILON = torch.tensor(torch.finfo(torch.float).eps)
23
- # 1 milliseconds = 0.001 seconds
24
- MILLISECONDS_TO_SECONDS = 0.001
25
-
26
- # window types
27
- HAMMING = "hamming"
28
- HANNING = "hanning"
29
- POVEY = "povey"
30
- RECTANGULAR = "rectangular"
31
- BLACKMAN = "blackman"
32
- WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN]
33
-
34
-
35
- def _get_epsilon(device, dtype):
36
- return EPSILON.to(device=device, dtype=dtype)
37
-
38
-
39
- def _next_power_of_2(x: int) -> int:
40
- r"""Returns the smallest power of 2 that is greater than x"""
41
- return 1 if x == 0 else 2 ** (x - 1).bit_length()
42
-
43
-
44
- def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor:
45
- r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``)
46
- representing how the window is shifted along the waveform. Each row is a frame.
47
-
48
- Args:
49
- waveform (Tensor): Tensor of size ``num_samples``
50
- window_size (int): Frame length
51
- window_shift (int): Frame shift
52
- snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
53
- in the file, and the number of frames depends on the frame_length. If False, the number of frames
54
- depends only on the frame_shift, and we reflect the data at the ends.
55
-
56
- Returns:
57
- Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame
58
- """
59
- assert waveform.dim() == 1
60
- num_samples = waveform.size(0)
61
- strides = (window_shift * waveform.stride(0), waveform.stride(0))
62
-
63
- if snip_edges:
64
- if num_samples < window_size:
65
- return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device)
66
- else:
67
- m = 1 + (num_samples - window_size) // window_shift
68
- else:
69
- reversed_waveform = torch.flip(waveform, [0])
70
- m = (num_samples + (window_shift // 2)) // window_shift
71
- pad = window_size // 2 - window_shift // 2
72
- pad_right = reversed_waveform
73
- if pad > 0:
74
- # torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect'
75
- # but we want [2, 1, 0, 0, 1, 2]
76
- pad_left = reversed_waveform[-pad:]
77
- waveform = torch.cat((pad_left, waveform, pad_right), dim=0)
78
- else:
79
- # pad is negative so we want to trim the waveform at the front
80
- waveform = torch.cat((waveform[-pad:], pad_right), dim=0)
81
-
82
- sizes = (m, window_size)
83
- return waveform.as_strided(sizes, strides)
84
-
85
-
86
- def _feature_window_function(
87
- window_type: str,
88
- window_size: int,
89
- blackman_coeff: float,
90
- device: torch.device,
91
- dtype: int,
92
- ) -> Tensor:
93
- r"""Returns a window function with the given type and size"""
94
- if window_type == HANNING:
95
- return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
96
- elif window_type == HAMMING:
97
- return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
98
- elif window_type == POVEY:
99
- # like hanning but goes to zero at edges
100
- return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
101
- elif window_type == RECTANGULAR:
102
- return torch.ones(window_size, device=device, dtype=dtype)
103
- elif window_type == BLACKMAN:
104
- a = 2 * math.pi / (window_size - 1)
105
- window_function = torch.arange(window_size, device=device, dtype=dtype)
106
- # can't use torch.blackman_window as they use different coefficients
107
- return (
108
- blackman_coeff
109
- - 0.5 * torch.cos(a * window_function)
110
- + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)
111
- ).to(device=device, dtype=dtype)
112
- else:
113
- raise Exception("Invalid window type " + window_type)
114
-
115
-
116
- def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor:
117
- r"""Returns the log energy of size (m) for a strided_input (m,*)"""
118
- device, dtype = strided_input.device, strided_input.dtype
119
- log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m)
120
- if energy_floor == 0.0:
121
- return log_energy
122
- return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype))
123
-
124
-
125
- def _get_waveform_and_window_properties(
126
- waveform: Tensor,
127
- channel: int,
128
- sample_frequency: float,
129
- frame_shift: float,
130
- frame_length: float,
131
- round_to_power_of_two: bool,
132
- preemphasis_coefficient: float,
133
- ) -> Tuple[Tensor, int, int, int]:
134
- r"""Gets the waveform and window properties"""
135
- channel = max(channel, 0)
136
- assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0))
137
- waveform = waveform[channel, :] # size (n)
138
- window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
139
- window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
140
- padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
141
-
142
- assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
143
- window_size, len(waveform)
144
- )
145
- assert 0 < window_shift, "`window_shift` must be greater than 0"
146
- assert padded_window_size % 2 == 0, (
147
- "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
148
- )
149
- assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
150
- assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
151
- return waveform, window_shift, window_size, padded_window_size
152
-
153
-
154
- def _get_window(
155
- waveform: Tensor,
156
- padded_window_size: int,
157
- window_size: int,
158
- window_shift: int,
159
- window_type: str,
160
- blackman_coeff: float,
161
- snip_edges: bool,
162
- raw_energy: bool,
163
- energy_floor: float,
164
- dither: float,
165
- remove_dc_offset: bool,
166
- preemphasis_coefficient: float,
167
- ) -> Tuple[Tensor, Tensor]:
168
- r"""Gets a window and its log energy
169
-
170
- Returns:
171
- (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m)
172
- """
173
- device, dtype = waveform.device, waveform.dtype
174
- epsilon = _get_epsilon(device, dtype)
175
-
176
- # size (m, window_size)
177
- strided_input = _get_strided(waveform, window_size, window_shift, snip_edges)
178
-
179
- if dither != 0.0:
180
- rand_gauss = torch.randn(strided_input.shape, device=device, dtype=dtype)
181
- strided_input = strided_input + rand_gauss * dither
182
-
183
- if remove_dc_offset:
184
- # Subtract each row/frame by its mean
185
- row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1)
186
- strided_input = strided_input - row_means
187
-
188
- if raw_energy:
189
- # Compute the log energy of each row/frame before applying preemphasis and
190
- # window function
191
- signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
192
-
193
- if preemphasis_coefficient != 0.0:
194
- # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
195
- offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze(
196
- 0
197
- ) # size (m, window_size + 1)
198
- strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]
199
-
200
- # Apply window_function to each row/frame
201
- window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze(
202
- 0
203
- ) # size (1, window_size)
204
- strided_input = strided_input * window_function # size (m, window_size)
205
-
206
- # Pad columns with zero until we reach size (m, padded_window_size)
207
- if padded_window_size != window_size:
208
- padding_right = padded_window_size - window_size
209
- strided_input = torch.nn.functional.pad(
210
- strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0
211
- ).squeeze(0)
212
-
213
- # Compute energy after window function (not the raw one)
214
- if not raw_energy:
215
- signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
216
-
217
- return strided_input, signal_log_energy
218
-
219
-
220
- def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
221
- # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
222
- # it returns size (m, n)
223
- if subtract_mean:
224
- col_means = torch.mean(tensor, dim=0).unsqueeze(0)
225
- tensor = tensor - col_means
226
- return tensor
227
-
228
-
229
- def spectrogram(
230
- waveform: Tensor,
231
- blackman_coeff: float = 0.42,
232
- channel: int = -1,
233
- dither: float = 0.0,
234
- energy_floor: float = 1.0,
235
- frame_length: float = 25.0,
236
- frame_shift: float = 10.0,
237
- min_duration: float = 0.0,
238
- preemphasis_coefficient: float = 0.97,
239
- raw_energy: bool = True,
240
- remove_dc_offset: bool = True,
241
- round_to_power_of_two: bool = True,
242
- sample_frequency: float = 16000.0,
243
- snip_edges: bool = True,
244
- subtract_mean: bool = False,
245
- window_type: str = POVEY,
246
- ) -> Tensor:
247
- r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
248
- compute-spectrogram-feats.
249
-
250
- Args:
251
- waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
252
- blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
253
- channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
254
- dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
255
- the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
256
- energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
257
- this floor is applied to the zeroth component, representing the total signal energy. The floor on the
258
- individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
259
- frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
260
- frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
261
- min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
262
- preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
263
- raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
264
- remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
265
- round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
266
- to FFT. (Default: ``True``)
267
- sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
268
- specified there) (Default: ``16000.0``)
269
- snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
270
- in the file, and the number of frames depends on the frame_length. If False, the number of frames
271
- depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
272
- subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
273
- it this way. (Default: ``False``)
274
- window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
275
- (Default: ``'povey'``)
276
-
277
- Returns:
278
- Tensor: A spectrogram identical to what Kaldi would output. The shape is
279
- (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided
280
- """
281
- device, dtype = waveform.device, waveform.dtype
282
- epsilon = _get_epsilon(device, dtype)
283
-
284
- waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
285
- waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
286
- )
287
-
288
- if len(waveform) < min_duration * sample_frequency:
289
- # signal is too short
290
- return torch.empty(0)
291
-
292
- strided_input, signal_log_energy = _get_window(
293
- waveform,
294
- padded_window_size,
295
- window_size,
296
- window_shift,
297
- window_type,
298
- blackman_coeff,
299
- snip_edges,
300
- raw_energy,
301
- energy_floor,
302
- dither,
303
- remove_dc_offset,
304
- preemphasis_coefficient,
305
- )
306
-
307
- # size (m, padded_window_size // 2 + 1, 2)
308
- fft = torch.fft.rfft(strided_input)
309
-
310
- # Convert the FFT into a power spectrum
311
- power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1)
312
- power_spectrum[:, 0] = signal_log_energy
313
-
314
- power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
315
- return power_spectrum
316
-
317
-
318
- def inverse_mel_scale_scalar(mel_freq: float) -> float:
319
- return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
320
-
321
-
322
- def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
323
- return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
324
-
325
-
326
- def mel_scale_scalar(freq: float) -> float:
327
- return 1127.0 * math.log(1.0 + freq / 700.0)
328
-
329
-
330
- def mel_scale(freq: Tensor) -> Tensor:
331
- return 1127.0 * (1.0 + freq / 700.0).log()
332
-
333
-
334
- def vtln_warp_freq(
335
- vtln_low_cutoff: float,
336
- vtln_high_cutoff: float,
337
- low_freq: float,
338
- high_freq: float,
339
- vtln_warp_factor: float,
340
- freq: Tensor,
341
- ) -> Tensor:
342
- r"""This computes a VTLN warping function that is not the same as HTK's one,
343
- but has similar inputs (this function has the advantage of never producing
344
- empty bins).
345
-
346
- This function computes a warp function F(freq), defined between low_freq
347
- and high_freq inclusive, with the following properties:
348
- F(low_freq) == low_freq
349
- F(high_freq) == high_freq
350
- The function is continuous and piecewise linear with two inflection
351
- points.
352
- The lower inflection point (measured in terms of the unwarped
353
- frequency) is at frequency l, determined as described below.
354
- The higher inflection point is at a frequency h, determined as
355
- described below.
356
- If l <= f <= h, then F(f) = f/vtln_warp_factor.
357
- If the higher inflection point (measured in terms of the unwarped
358
- frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
359
- Since (by the last point) F(h) == h/vtln_warp_factor, then
360
- max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
361
- h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
362
- = vtln_high_cutoff * min(1, vtln_warp_factor).
363
- If the lower inflection point (measured in terms of the unwarped
364
- frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
365
- This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
366
- = vtln_low_cutoff * max(1, vtln_warp_factor)
367
- Args:
368
- vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
369
- vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
370
- low_freq (float): Lower frequency cutoffs in mel computation
371
- high_freq (float): Upper frequency cutoffs in mel computation
372
- vtln_warp_factor (float): Vtln warp factor
373
- freq (Tensor): given frequency in Hz
374
-
375
- Returns:
376
- Tensor: Freq after vtln warp
377
- """
378
- assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq"
379
- assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]"
380
- l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
381
- h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
382
- scale = 1.0 / vtln_warp_factor
383
- Fl = scale * l # F(l)
384
- Fh = scale * h # F(h)
385
- assert l > low_freq and h < high_freq
386
- # slope of left part of the 3-piece linear function
387
- scale_left = (Fl - low_freq) / (l - low_freq)
388
- # [slope of center part is just "scale"]
389
-
390
- # slope of right part of the 3-piece linear function
391
- scale_right = (high_freq - Fh) / (high_freq - h)
392
-
393
- res = torch.empty_like(freq)
394
-
395
- outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq
396
- before_l = torch.lt(freq, l) # freq < l
397
- before_h = torch.lt(freq, h) # freq < h
398
- after_h = torch.ge(freq, h) # freq >= h
399
-
400
- # order of operations matter here (since there is overlapping frequency regions)
401
- res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
402
- res[before_h] = scale * freq[before_h]
403
- res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
404
- res[outside_low_high_freq] = freq[outside_low_high_freq]
405
-
406
- return res
407
-
408
-
409
- def vtln_warp_mel_freq(
410
- vtln_low_cutoff: float,
411
- vtln_high_cutoff: float,
412
- low_freq,
413
- high_freq: float,
414
- vtln_warp_factor: float,
415
- mel_freq: Tensor,
416
- ) -> Tensor:
417
- r"""
418
- Args:
419
- vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
420
- vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
421
- low_freq (float): Lower frequency cutoffs in mel computation
422
- high_freq (float): Upper frequency cutoffs in mel computation
423
- vtln_warp_factor (float): Vtln warp factor
424
- mel_freq (Tensor): Given frequency in Mel
425
-
426
- Returns:
427
- Tensor: ``mel_freq`` after vtln warp
428
- """
429
- return mel_scale(
430
- vtln_warp_freq(
431
- vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq)
432
- )
433
- )
434
-
435
-
436
- def get_mel_banks(
437
- num_bins: int,
438
- window_length_padded: int,
439
- sample_freq: float,
440
- low_freq: float,
441
- high_freq: float,
442
- vtln_low: float,
443
- vtln_high: float,
444
- vtln_warp_factor: float,device=None,dtype=None
445
- ) -> Tuple[Tensor, Tensor]:
446
- """
447
- Returns:
448
- (Tensor, Tensor): The tuple consists of ``bins`` (which is
449
- melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is
450
- center frequencies of bins of size (``num_bins``)).
451
- """
452
- assert num_bins > 3, "Must have at least 3 mel bins"
453
- assert window_length_padded % 2 == 0
454
- num_fft_bins = window_length_padded / 2
455
- nyquist = 0.5 * sample_freq
456
-
457
- if high_freq <= 0.0:
458
- high_freq += nyquist
459
-
460
- assert (
461
- (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
462
- ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
463
-
464
- # fft-bin width [think of it as Nyquist-freq / half-window-length]
465
- fft_bin_width = sample_freq / window_length_padded
466
- mel_low_freq = mel_scale_scalar(low_freq)
467
- mel_high_freq = mel_scale_scalar(high_freq)
468
-
469
- # divide by num_bins+1 in next line because of end-effects where the bins
470
- # spread out to the sides.
471
- mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
472
-
473
- if vtln_high < 0.0:
474
- vtln_high += nyquist
475
-
476
- assert vtln_warp_factor == 1.0 or (
477
- (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
478
- ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format(
479
- vtln_low, vtln_high, low_freq, high_freq
480
- )
481
-
482
- bin = torch.arange(num_bins).unsqueeze(1)
483
- left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1)
484
- center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # size(num_bins, 1)
485
- right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1)
486
-
487
- if vtln_warp_factor != 1.0:
488
- left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
489
- center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
490
- right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)
491
-
492
- # center_freqs = inverse_mel_scale(center_mel) # size (num_bins)
493
- # size(1, num_fft_bins)
494
- mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0)
495
-
496
- # size (num_bins, num_fft_bins)
497
- up_slope = (mel - left_mel) / (center_mel - left_mel)
498
- down_slope = (right_mel - mel) / (right_mel - center_mel)
499
-
500
- if vtln_warp_factor == 1.0:
501
- # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
502
- bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
503
- else:
504
- # warping can move the order of left_mel, center_mel, right_mel anywhere
505
- bins = torch.zeros_like(up_slope)
506
- up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel) # left_mel < mel <= center_mel
507
- down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel) # center_mel < mel < right_mel
508
- bins[up_idx] = up_slope[up_idx]
509
- bins[down_idx] = down_slope[down_idx]
510
-
511
- return bins.to(device=device,dtype=dtype)#, center_freqs
512
-
- cache = {}
-
-
- def fbank(
-     waveform: Tensor,
-     blackman_coeff: float = 0.42,
-     channel: int = -1,
-     dither: float = 0.0,
-     energy_floor: float = 1.0,
-     frame_length: float = 25.0,
-     frame_shift: float = 10.0,
-     high_freq: float = 0.0,
-     htk_compat: bool = False,
-     low_freq: float = 20.0,
-     min_duration: float = 0.0,
-     num_mel_bins: int = 23,
-     preemphasis_coefficient: float = 0.97,
-     raw_energy: bool = True,
-     remove_dc_offset: bool = True,
-     round_to_power_of_two: bool = True,
-     sample_frequency: float = 16000.0,
-     snip_edges: bool = True,
-     subtract_mean: bool = False,
-     use_energy: bool = False,
-     use_log_fbank: bool = True,
-     use_power: bool = True,
-     vtln_high: float = -500.0,
-     vtln_low: float = 100.0,
-     vtln_warp: float = 1.0,
-     window_type: str = POVEY,
- ) -> Tensor:
-     r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
-     compute-fbank-feats.
-
-     Args:
-         waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
-         blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
-         channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
-         dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
-             the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
-         energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
-             this floor is applied to the zeroth component, representing the total signal energy. The floor on the
-             individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
-         frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
-         frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
-         high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
-             (Default: ``0.0``)
-         htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features
-             (need to change other parameters). (Default: ``False``)
-         low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
-         min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
-         num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
-         preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
-         raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
-         remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
-         round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
-             to FFT. (Default: ``True``)
-         sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
-             specified there) (Default: ``16000.0``)
-         snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
-             in the file, and the number of frames depends on the frame_length. If False, the number of frames
-             depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
-         subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
-             it this way. (Default: ``False``)
-         use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
-         use_log_fbank (bool, optional): If true, produce log-filterbank, else produce linear. (Default: ``True``)
-         use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``)
-         vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
-             negative, offset from high-mel-freq) (Default: ``-500.0``)
-         vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
-         vtln_warp (float, optional): VTLN warp factor (only applicable if vtln_map is not specified) (Default: ``1.0``)
-         window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
-             (Default: ``'povey'``)
-
-     Returns:
-         Tensor: A fbank identical to what Kaldi would output. The shape is (m, ``num_mel_bins + use_energy``),
-         where m is calculated in _get_strided.
-     """
-     device, dtype = waveform.device, waveform.dtype
-
-     waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
-         waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
-     )
-
-     if len(waveform) < min_duration * sample_frequency:
-         # signal is too short
-         return torch.empty(0, device=device, dtype=dtype)
-
-     # strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
-     strided_input, signal_log_energy = _get_window(
-         waveform,
-         padded_window_size,
-         window_size,
-         window_shift,
-         window_type,
-         blackman_coeff,
-         snip_edges,
-         raw_energy,
-         energy_floor,
-         dither,
-         remove_dc_offset,
-         preemphasis_coefficient,
-     )
-
-     # size (m, padded_window_size // 2 + 1)
-     spectrum = torch.fft.rfft(strided_input).abs()
-     if use_power:
-         spectrum = spectrum.pow(2.0)
-
-     # size (num_mel_bins, padded_window_size // 2)
-     # the mel banks depend only on these parameters, so they are memoized per device/dtype
-     cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
-         num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp, device, dtype
-     )
-     if cache_key not in cache:
-         mel_energies = get_mel_banks(
-             num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp, device, dtype
-         )
-         cache[cache_key] = mel_energies
-     else:
-         mel_energies = cache[cache_key]
-
-     # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
-     mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)
-
-     # sum with mel filterbanks over the power spectrum, size (m, num_mel_bins)
-     mel_energies = torch.mm(spectrum, mel_energies.T)
-     if use_log_fbank:
-         # avoid log of zero (which should be prevented anyway by dithering)
-         mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log()
-
-     # if use_energy then add it as the last column for htk_compat == true else first column
-     if use_energy:
-         signal_log_energy = signal_log_energy.unsqueeze(1)  # size (m, 1)
-         # returns size (m, num_mel_bins + 1)
-         if htk_compat:
-             mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1)
-         else:
-             mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)
-
-     mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
-     return mel_energies
-
-
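A minimal usage sketch for fbank, assuming the rest of this module (the POVEY constant, _get_window, get_mel_banks, etc.) is intact; the import path eres2net.kaldi is a guess for illustration, and the signature mirrors torchaudio.compliance.kaldi.fbank, from which this code is adapted:

import torch
from eres2net import kaldi  # hypothetical import path for this deleted module

# 1 second of mono 16 kHz audio, shape (c, n) with c == 1
waveform = torch.randn(1, 16000)
feats = kaldi.fbank(waveform, num_mel_bins=80, sample_frequency=16000.0, dither=0.0)
print(feats.shape)  # torch.Size([98, 80]): 25 ms windows, 10 ms shift, snip_edges=True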
- def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
-     # returns a dct matrix of size (num_mel_bins, num_ceps)
-     # size (num_mel_bins, num_mel_bins)
-     dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho")
-     # Kaldi expects the first cepstral coefficient to be a weighted sum with weight sqrt(1/num_mel_bins).
-     # This is the first column of the dct_matrix for torchaudio, since it expects a
-     # right multiply (and it would be the first row of Kaldi's dct_matrix, as Kaldi
-     # expects a left multiply, e.g. dct_matrix * vector).
-     dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins))
-     dct_matrix = dct_matrix[:, :num_ceps]
-     return dct_matrix
-
-
- def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
-     # returns size (num_ceps)
-     # Compute liftering coefficients (scaling on cepstral coeffs)
-     # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
-     i = torch.arange(num_ceps)
-     return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)
-
-
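The coefficients above implement the standard sinusoidal cepstral lifter (as in Kaldi). Writing L for cepstral_lifter, the i-th cepstral coefficient is scaled as

$$ c_i' = \Big(1 + \frac{L}{2}\sin\frac{\pi i}{L}\Big)\, c_i $$

so C0 (i = 0) passes through unscaled and the boost peaks at a factor of 1 + L/2 near i = L/2 (a factor of 12 at i = 11 for the default L = 22).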
- def mfcc(
-     waveform: Tensor,
-     blackman_coeff: float = 0.42,
-     cepstral_lifter: float = 22.0,
-     channel: int = -1,
-     dither: float = 0.0,
-     energy_floor: float = 1.0,
-     frame_length: float = 25.0,
-     frame_shift: float = 10.0,
-     high_freq: float = 0.0,
-     htk_compat: bool = False,
-     low_freq: float = 20.0,
-     num_ceps: int = 13,
-     min_duration: float = 0.0,
-     num_mel_bins: int = 23,
-     preemphasis_coefficient: float = 0.97,
-     raw_energy: bool = True,
-     remove_dc_offset: bool = True,
-     round_to_power_of_two: bool = True,
-     sample_frequency: float = 16000.0,
-     snip_edges: bool = True,
-     subtract_mean: bool = False,
-     use_energy: bool = False,
-     vtln_high: float = -500.0,
-     vtln_low: float = 100.0,
-     vtln_warp: float = 1.0,
-     window_type: str = POVEY,
- ) -> Tensor:
-     r"""Create an mfcc from a raw audio signal. This matches the input/output of Kaldi's
-     compute-mfcc-feats.
-
-     Args:
-         waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
-         blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
-         cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``)
-         channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
-         dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
-             the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
-         energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
-             this floor is applied to the zeroth component, representing the total signal energy. The floor on the
-             individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
-         frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
-         frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
-         high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
-             (Default: ``0.0``)
-         htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible
-             features (need to change other parameters). (Default: ``False``)
-         low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
-         num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
-         min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
-         num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
-         preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
-         raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
-         remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
-         round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
-             to FFT. (Default: ``True``)
-         sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
-             specified there) (Default: ``16000.0``)
-         snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
-             in the file, and the number of frames depends on the frame_length. If False, the number of frames
-             depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
-         subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
-             it this way. (Default: ``False``)
-         use_energy (bool, optional): Add an extra dimension with energy to the MFCC output. (Default: ``False``)
-         vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
-             negative, offset from high-mel-freq) (Default: ``-500.0``)
-         vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
-         vtln_warp (float, optional): VTLN warp factor (only applicable if vtln_map is not specified) (Default: ``1.0``)
-         window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
-             (Default: ``"povey"``)
-
-     Returns:
-         Tensor: An mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``),
-         where m is calculated in _get_strided.
-     """
-     assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins)
-
-     device, dtype = waveform.device, waveform.dtype
-
-     # fbank must be computed with use_power=True, subtract_mean=False and
-     # use_log_fbank=True to match Kaldi's MFCC pipeline.
-     # size (m, num_mel_bins + use_energy)
-     feature = fbank(
-         waveform=waveform,
-         blackman_coeff=blackman_coeff,
-         channel=channel,
-         dither=dither,
-         energy_floor=energy_floor,
-         frame_length=frame_length,
-         frame_shift=frame_shift,
-         high_freq=high_freq,
-         htk_compat=htk_compat,
-         low_freq=low_freq,
-         min_duration=min_duration,
-         num_mel_bins=num_mel_bins,
-         preemphasis_coefficient=preemphasis_coefficient,
-         raw_energy=raw_energy,
-         remove_dc_offset=remove_dc_offset,
-         round_to_power_of_two=round_to_power_of_two,
-         sample_frequency=sample_frequency,
-         snip_edges=snip_edges,
-         subtract_mean=False,
-         use_energy=use_energy,
-         use_log_fbank=True,
-         use_power=True,
-         vtln_high=vtln_high,
-         vtln_low=vtln_low,
-         vtln_warp=vtln_warp,
-         window_type=window_type,
-     )
-
-     if use_energy:
-         # size (m)
-         signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
-         # offset is 0 if htk_compat==True else 1
-         mel_offset = int(not htk_compat)
-         feature = feature[:, mel_offset : (num_mel_bins + mel_offset)]
-
-     # size (num_mel_bins, num_ceps)
-     dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device)
-
-     # size (m, num_ceps)
-     feature = feature.matmul(dct_matrix)
-
-     if cepstral_lifter != 0.0:
-         # size (1, num_ceps)
-         lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
-         feature *= lifter_coeffs.to(device=device, dtype=dtype)
-
-     # if use_energy then replace the last column for htk_compat == true else first column
-     if use_energy:
-         feature[:, 0] = signal_log_energy
-
-     if htk_compat:
-         energy = feature[:, 0].unsqueeze(1)  # size (m, 1)
-         feature = feature[:, 1:]  # size (m, num_ceps - 1)
-         if not use_energy:
-             # scale on C0 (actually removing a scale we previously added that's
-             # part of one common definition of the cosine transform.)
-             energy *= math.sqrt(2)
-
-         feature = torch.cat((feature, energy), dim=1)
-
-     feature = _subtract_column_mean(feature, subtract_mean)
-     return feature
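
Under the same assumptions as the fbank sketch above (hypothetical eres2net.kaldi import path; signature mirrors torchaudio.compliance.kaldi.mfcc), mfcc can be exercised like this; note that use_energy=True keeps the shape (m, num_ceps) but replaces C0 with the frame log-energy:

import torch
from eres2net import kaldi  # hypothetical import path for this deleted module

waveform = torch.randn(1, 16000)  # (c, n) mono audio at 16 kHz
mfccs = kaldi.mfcc(waveform, num_ceps=13, num_mel_bins=23, dither=0.0)
print(mfccs.shape)  # torch.Size([98, 13])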
eres2net/pooling_layers.py DELETED
@@ -1,104 +0,0 @@
- # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
- # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
- """ This implementation is adapted from https://github.com/wenet-e2e/wespeaker."""
-
- import torch
- import torch.nn as nn
-
-
- class TAP(nn.Module):
-     """
-     Temporal average pooling: only the first-order mean is considered.
-     """
-     def __init__(self, **kwargs):
-         super(TAP, self).__init__()
-
-     def forward(self, x):
-         pooling_mean = x.mean(dim=-1)
-         # To be compatible with 2D input
-         pooling_mean = pooling_mean.flatten(start_dim=1)
-         return pooling_mean
-
-
- class TSDP(nn.Module):
-     """
-     Temporal standard deviation pooling: only the second-order std is considered.
-     """
-     def __init__(self, **kwargs):
-         super(TSDP, self).__init__()
-
-     def forward(self, x):
-         # The last dimension is the temporal axis
-         pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
-         pooling_std = pooling_std.flatten(start_dim=1)
-         return pooling_std
-
-
- class TSTP(nn.Module):
-     """
-     Temporal statistics pooling: concatenates the mean and std, as used in
-     the x-vector architecture.
-     Comment: simple concatenation cannot make full use of both statistics.
-     """
-     def __init__(self, **kwargs):
-         super(TSTP, self).__init__()
-
-     def forward(self, x):
-         # The last dimension is the temporal axis
-         pooling_mean = x.mean(dim=-1)
-         pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
-         pooling_mean = pooling_mean.flatten(start_dim=1)
-         pooling_std = pooling_std.flatten(start_dim=1)
-
-         stats = torch.cat((pooling_mean, pooling_std), 1)
-         return stats
-
-
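A quick shape check for the three statistics poolings, a sketch assuming the classes above are in scope and a ResNet-style 4-D feature map of shape (B, C, F, T):

import torch

x = torch.randn(4, 64, 10, 200)  # (batch, channels, freq, time)
print(TAP()(x).shape)   # torch.Size([4, 640]): mean over time, (C, F) flattened
print(TSDP()(x).shape)  # torch.Size([4, 640]): std over time
print(TSTP()(x).shape)  # torch.Size([4, 1280]): mean and std concatenated
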
- class ASTP(nn.Module):
-     """ Attentive statistics pooling: channel- and context-dependent
-     statistics pooling, first used in ECAPA-TDNN.
-     """
-     def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
-         super(ASTP, self).__init__()
-         self.global_context_att = global_context_att
-
-         # Use Conv1d with stride == 1 rather than Linear, so we don't
-         # need to transpose inputs.
-         if global_context_att:
-             self.linear1 = nn.Conv1d(
-                 in_dim * 3, bottleneck_dim,
-                 kernel_size=1)  # equals W and b in the paper
-         else:
-             self.linear1 = nn.Conv1d(
-                 in_dim, bottleneck_dim,
-                 kernel_size=1)  # equals W and b in the paper
-         self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
-                                  kernel_size=1)  # equals V and k in the paper
-
-     def forward(self, x):
-         """
-         x: a 3-dimensional tensor in TDNN-based architectures (B, F, T)
-         or a 4-dimensional tensor in ResNet architectures (B, C, F, T);
-         0-dim: batch dimension, last dim: time (frame) dimension
-         """
-         if len(x.shape) == 4:
-             x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
-         assert len(x.shape) == 3
-
-         if self.global_context_att:
-             context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
-             context_std = torch.sqrt(
-                 torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
-             x_in = torch.cat((x, context_mean, context_std), dim=1)
-         else:
-             x_in = x
-
-         # Don't use ReLU here: with ReLU the attention can be hard to train to convergence.
-         alpha = torch.tanh(
-             self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
-         alpha = torch.softmax(self.linear2(alpha), dim=2)
-         mean = torch.sum(alpha * x, dim=2)
-         var = torch.sum(alpha * (x**2), dim=2) - mean**2
-         std = torch.sqrt(var.clamp(min=1e-10))
-         return torch.cat([mean, std], dim=1)
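
A usage sketch for ASTP; per the reshape in forward, in_dim must equal C * F when feeding ResNet-style 4-D features (the concrete sizes below are illustrative):

import torch

pool = ASTP(in_dim=64 * 10, bottleneck_dim=128)
x = torch.randn(4, 64, 10, 200)  # (B, C, F, T)
emb = pool(x)                    # attention-weighted mean and std over time
print(emb.shape)                 # torch.Size([4, 1280]) == (B, 2 * in_dim)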