"""
LoRA (Low-Rank Adaptation) implementation for convolutional layers.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
class LoRALayer(nn.Module):
    """
    LoRA (Low-Rank Adaptation) wrapper for convolutional layers.

    Args:
        original_layer: The Conv2d layer to adapt
        rank: LoRA rank (default=8)
            - Lower rank (4): Fewer parameters, less overfitting risk, less capacity
            - Medium rank (8-16): Balanced trade-off (recommended for most tasks)
            - Higher rank (32+): More capacity but approaches full fine-tuning

    For small datasets (<1000 images), rank=8 provides sufficient adaptation
    capacity while keeping parameters low (~2% of the original layer).
    """
    def __init__(self, original_layer, rank=8):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank

        # Get dimensions from the original layer
        out_channels = original_layer.out_channels
        in_channels = original_layer.in_channels
        kernel_size = original_layer.kernel_size

        # LoRA matrices: A (down-projection) and B (up-projection)
        # A reduces dimensions: in_channels -> rank
        # Initialized with small random values to break symmetry
        self.lora_A = nn.Parameter(
            torch.randn(rank, in_channels, *kernel_size) * 0.01
        )
        # B expands dimensions: rank -> out_channels
        # Initialized to zeros so the LoRA update starts at zero and the layer
        # initially reproduces the frozen pretrained output
        # This initialization strategy follows the original LoRA paper
        self.lora_B = nn.Parameter(
            torch.zeros(out_channels, rank, 1, 1)
        )

        # Freeze original weights (preserve ImageNet knowledge)
        self.original_layer.weight.requires_grad = False
        if self.original_layer.bias is not None:
            self.original_layer.bias.requires_grad = False
    def forward(self, x):
        """
        Forward pass combining original frozen weights with LoRA adaptation.

        Mathematical formulation:
            output = W_frozen * x + (B * (A * x))
        where * denotes the convolution operation.
        """
        # Original forward pass (frozen pretrained weights)
        original_output = self.original_layer(x)

        # LoRA adaptation pathway (low-rank decomposition)
        # Step 1: Down-project with A (in_channels -> rank)
        lora_output = F.conv2d(
            x,
            self.lora_A,
            stride=self.original_layer.stride,
            padding=self.original_layer.padding
        )
        # Step 2: Up-project with B (rank -> out_channels)
        # These two sequential convolutions approximate a low-rank adaptation
        lora_output = F.conv2d(lora_output, self.lora_B)

        # Combine: W*x + (B*(A*x)) where * denotes convolution
        return original_output + lora_output
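
# --- Illustrative usage (added sketch, not part of the training pipeline) ---
# A minimal sanity check of LoRALayer in isolation. The channel sizes and
# input resolution below are assumptions chosen for illustration, not values
# taken from a specific ResNet34 block.
def _lora_layer_example():
    conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
    lora_conv = LoRALayer(conv, rank=8)
    x = torch.randn(1, 64, 56, 56)
    y = lora_conv(x)
    # Output shape matches the wrapped conv: (1, 128, 56, 56)
    assert y.shape == conv(x).shape
    # Only the LoRA matrices remain trainable; the wrapped conv is frozen
    trainable = [name for name, p in lora_conv.named_parameters() if p.requires_grad]
    assert sorted(trainable) == ['lora_A', 'lora_B']
    return y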
def get_model(num_classes=2, pretrained=True):
    """
    Load ResNet34 with optional pretrained weights.

    Args:
        num_classes: Number of output classes
        pretrained: Whether to load ImageNet pretrained weights

    Returns:
        ResNet34 model
    """
    if pretrained:
        model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)
    else:
        model = models.resnet34(weights=None)

    # Modify the last layer for classification
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, num_classes)
    return model
def apply_lora_to_model(model, target_layers=['layer3', 'layer4'], rank=8):
    """
    Apply LoRA adapters to specific layers in ResNet34.

    Strategy: we target layer3 and layer4 (high-level feature extractors) because:
    - layer1 & layer2: Extract low-level features (edges, textures) that are
      universal across tasks -> keep frozen, no adaptation needed
    - layer3 & layer4: Extract high-level semantic features (objects, contexts)
      that are task-specific -> need slight adaptation for smoking detection
    - fc: Brand new classifier head -> fully trainable

    This approach gives us the sweet spot:
    - Full fine-tuning: 21.8M params (overfitting risk with small datasets)
    - Only fc training: ~1K params (may underfit, features not adapted)
    - LoRA on layer3+layer4: ~465K params (2.14% of the model, balanced approach)

    Args:
        model: ResNet34 model
        target_layers: List of layer names to apply LoRA to
        rank: LoRA rank (default=8, adds ~2% params per adapted layer)

    Returns:
        The model with LoRA adapters applied in place
    """
    # Freeze ALL layers first (preserve ImageNet features)
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze only the new classification head
    for param in model.fc.parameters():
        param.requires_grad = True
    # Wrap conv1 and conv2 of every residual block in each requested layer
    # (layer3 and layer4 by default)
    for layer_name in target_layers:
        for block in getattr(model, layer_name):
            if hasattr(block, 'conv1'):
                block.conv1 = LoRALayer(block.conv1, rank=rank)
            if hasattr(block, 'conv2'):
                block.conv2 = LoRALayer(block.conv2, rank=rank)

    return model
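
# --- Illustrative helper (added sketch, not part of the original file) ---
# After apply_lora_to_model, only the LoRA matrices and the fc head require
# gradients. A convenient way to pass just those parameters to an optimizer,
# e.g. torch.optim.Adam(get_trainable_parameters(model), lr=1e-4):
def get_trainable_parameters(model):
    """Return the parameters left trainable by apply_lora_to_model."""
    return [p for p in model.parameters() if p.requires_grad]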
def count_parameters(model):
    """
    Count total and trainable parameters in the model.

    Returns:
        tuple: (total_params, trainable_params, trainable_percentage)
    """
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    trainable_pct = 100. * trainable_params / total_params
    return total_params, trainable_params, trainable_pct
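
# --- Illustrative end-to-end check (added sketch) ---
# Ties together the functions defined above; the printed parameter counts
# should roughly match the ~465K / 2.14% figures claimed in the
# apply_lora_to_model docstring, and pretrained=True will download the
# torchvision ImageNet weights on first run.
if __name__ == "__main__":
    model = get_model(num_classes=2, pretrained=True)
    model = apply_lora_to_model(model, target_layers=['layer3', 'layer4'], rank=8)

    total, trainable, pct = count_parameters(model)
    print(f"Total parameters:     {total:,}")
    print(f"Trainable parameters: {trainable:,} ({pct:.2f}%)")

    # Quick forward pass with a dummy batch (224x224 is the standard
    # ImageNet input size assumed here)
    dummy = torch.randn(2, 3, 224, 224)
    logits = model(dummy)
    print(f"Output shape: {tuple(logits.shape)}")  # expected: (2, 2)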