import torch.nn as nn


class MusicCNN(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.3, device="cuda"):
        super(MusicCNN, self).__init__()
        self.device = device
        # Convolutional blocks
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.conv_block3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.fc_layers = None  # Fully connected layers will be initialized later
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        # Flatten dynamically
        x = x.view(x.size(0), -1)
        # Initialize FC layers dynamically
        if self.fc_layers is None:
            fc_input_size = x.size(1)
            self.fc_layers = nn.Sequential(
                nn.Linear(fc_input_size, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, self.num_classes)
            ).to(self.device)
        x = self.fc_layers(x)
        return x
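
# Illustrative usage sketch (assumption: single-channel spectrograms of size
# 128x128; adjust to the actual feature pipeline). The FC head is sized on the
# first forward pass, so build any optimizer only after that call:
#     model = MusicCNN(num_classes=10, device="cpu")
#     logits = model(torch.randn(4, 1, 128, 128))  # -> shape (4, 10)
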
class MusicCRNN2D(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.1, gru_hidden_size=32, device="cuda"):
        super(MusicCRNN2D, self).__init__()
        self.device = device
        # Input batch normalization
        self.input_bn = nn.BatchNorm2d(1).to(device)
        # Convolutional blocks
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ELU(),
            nn.MaxPool2d((2, 2)),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ELU(),
            nn.MaxPool2d((4, 2)),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.conv_block3 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ELU(),
            nn.MaxPool2d((4, 2)),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.conv_block4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ELU(),
            nn.MaxPool2d((4, 2)),
            nn.Dropout2d(dropout_rate)
        ).to(device)
        self.gru_stack = None  # GRU layers will be initialized later
        self.classifier = None
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.gru_hidden_size = gru_hidden_size

    def forward(self, x):
        x = self.input_bn(x)
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        # Reshape for GRU
        batch_size, _, freq, time = x.shape
        x = x.permute(0, 3, 1, 2)  # (batch, time, channels, freq)
        x = x.reshape(batch_size, time, -1)
        # Initialize GRU dynamically
        if self.gru_stack is None:
            gru_input_size = x.size(2)
            self.gru_stack = nn.GRU(
                input_size=gru_input_size,
                hidden_size=self.gru_hidden_size,
                batch_first=True,
                bidirectional=True,
            ).to(self.device)
            self.classifier = nn.Sequential(
                nn.Dropout(self.dropout_rate * 3),
                nn.Linear(self.gru_hidden_size * 2, self.num_classes)  # * 2 for bidirectional
            ).to(self.device)
        x, _ = self.gru_stack(x)
        # Take the last time step
        x = x[:, -1, :]
        # Classification
        x = self.classifier(x)
        return x
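

# --- Minimal smoke test (illustrative sketch, not part of the original module) ---
# Assumes mel-spectrogram-style inputs of shape (batch, 1, 128, 128); the four
# pooling stages reduce the frequency axis to 1 and the time axis to 8 for this
# size, after which the bidirectional GRU and classifier are built lazily.
# Because those layers only exist after the first forward pass, construct the
# optimizer after running one dummy batch through the model.
if __name__ == "__main__":
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dummy = torch.randn(4, 1, 128, 128, device=device)

    crnn = MusicCRNN2D(num_classes=10, device=device)
    logits = crnn(dummy)
    print("MusicCRNN2D logits:", logits.shape)  # expected: torch.Size([4, 10])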