import torch.nn as nn import torch from torchvision import models import numpy as np class EncodingBackbone(nn.Module): def __init__(self, encoding_size=256): super(EncodingBackbone, self).__init__() # Load the pretrained ResNet-50 backbone self.backbone = models.resnet50(pretrained=True) # Remove the fully connected layers (classification head) and average pooling layer from the pretrained ResNet-50 self.backbone = nn.Sequential(*list(self.backbone.children())[:-2]) # Add a global average pooling layer to reduce spatial dimensions self.global_avg_pooling = nn.AdaptiveAvgPool2d((1, 1)) # Linear layer for final encoding (to reduce the dimensionality of the output) self.encoding_layer = nn.Linear(2048, encoding_size) # Set requires_grad to False for all parameters for param in self.parameters(): param.requires_grad = False def forward(self, x): # Forward pass through the backbone x = self.backbone(x) # Global average pooling x = self.global_avg_pooling(x) # Reshape for the linear layer x = x.view(x.size(0), -1) # Linear layer for final encoding encoding = self.encoding_layer(x) return encoding