"""
LoRA (Low-Rank Adaptation) implementation for convolutional layers.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models


class LoRALayer(nn.Module):
    """
    LoRA (Low-Rank Adaptation) wrapper for convolutional layers.
    
    Args:
        original_layer: The Conv2d layer to adapt
        rank: LoRA rank (default=8)
              - Lower rank (4): Fewer parameters, less overfitting risk, less capacity
              - Medium rank (8-16): Balanced trade-off (recommended for most tasks)
              - Higher rank (32+): More capacity but approaches full fine-tuning
              
              For small datasets (<1000 images), rank=8 provides sufficient
              adaptation capacity while adding only a few percent of each
              adapted layer's original parameter count (a short demonstration
              follows the class definition).
    """
    
    def __init__(self, original_layer, rank=8):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank
        
        # Get dimensions from original layer
        out_channels = original_layer.out_channels
        in_channels = original_layer.in_channels
        kernel_size = original_layer.kernel_size
        
        # LoRA matrices: A (down-projection) and B (up-projection)
        # A reduces dimensions: in_channels -> rank
        # A is initialized with small random values; if both A and B started
        # at zero, the adapter would receive no gradient and never update
        self.lora_A = nn.Parameter(
            torch.randn(rank, in_channels, *kernel_size) * 0.01
        )
        
        # B expands dimensions: rank -> out_channels
        # Initialized to zeros so the LoRA branch contributes nothing at first
        # (the wrapped layer initially matches the pretrained layer exactly)
        # This initialization strategy follows the original LoRA paper
        self.lora_B = nn.Parameter(
            torch.zeros(out_channels, rank, 1, 1)
        )
        
        # Freeze original weights (preserve ImageNet knowledge)
        self.original_layer.weight.requires_grad = False
        if self.original_layer.bias is not None:
            self.original_layer.bias.requires_grad = False
    
    def forward(self, x):
        """
        Forward pass combining original frozen weights with LoRA adaptation.
        
        Mathematical formulation:
        output = W_frozen * x + (B * (A * x))
        
        where * denotes the convolution operation.
        """
        # Original forward pass (frozen pretrained weights)
        original_output = self.original_layer(x)
        
        # LoRA adaptation pathway (low-rank decomposition)
        # Step 1: Down-project with A (in_channels -> rank), mirroring the
        # original layer's stride/padding/dilation so spatial sizes match
        lora_output = F.conv2d(
            x,
            self.lora_A,
            stride=self.original_layer.stride,
            padding=self.original_layer.padding,
            dilation=self.original_layer.dilation
        )
        
        # Step 2: Up-project with B (rank -> out_channels)
        # These two sequential convolutions approximate a low-rank adaptation
        lora_output = F.conv2d(lora_output, self.lora_B)
        
        # Combine: W*x + (B*(A*x)) where * denotes convolution
        return original_output + lora_output
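

# Illustrative sketch (an assumed example, not used elsewhere in this module):
# wrapping a single Conv2d demonstrates the two properties described above.
# Because lora_B starts at zero, the wrapped layer reproduces the frozen
# layer's output exactly at initialization, and only the two LoRA tensors
# remain trainable. The channel sizes, rank, and input shape are arbitrary.
def _demo_lora_layer(rank=8):
    conv = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False)
    wrapped = LoRALayer(conv, rank=rank)

    x = torch.randn(1, 64, 32, 32)
    with torch.no_grad():
        # The LoRA branch contributes zero at init, so outputs match
        assert torch.allclose(wrapped(x), conv(x))

    # Only the adapter tensors are trainable; the pretrained weight is frozen
    trainable = sorted(n for n, p in wrapped.named_parameters() if p.requires_grad)
    assert trainable == ['lora_A', 'lora_B']
    return wrapped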


def get_model(num_classes=2, pretrained=True):
    """
    Load ResNet34 with optional pretrained weights.
    
    Args:
        num_classes: Number of output classes
        pretrained: Whether to load ImageNet pretrained weights
    
    Returns:
        ResNet34 model
    """
    if pretrained:
        model = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)
    else:
        model = models.resnet34(weights=None)
    
    # Modify last layer for classification
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, num_classes)
    
    return model


def apply_lora_to_model(model, target_layers=['layer3', 'layer4'], rank=8):
    """
    Apply LoRA adapters to specific layers in ResNet34.
    
    Strategy: We target layer3 and layer4 (high-level feature extractors) because:
    - layer1 & layer2: Extract low-level features (edges, textures) that are 
      universal across tasks -> keep frozen, no adaptation needed
    - layer3 & layer4: Extract high-level semantic features (objects, contexts)
      that are task-specific -> need slight adaptation for smoking detection
    - fc: Brand new classifier head -> fully trainable
    
    This approach gives us the sweet spot:
    - Full fine-tuning: 21.8M params (overfitting risk with small datasets)
    - Only fc training: ~1K params (may underfit, features not adapted)
    - LoRA on layer3+layer4: ~465K params (2.14% of model, balanced approach)
    
    Args:
        model: ResNet34 model
        target_layers: List of layer names to apply LoRA to
        rank: LoRA rank (default=8; adds a few percent of each adapted layer's params)
    
    Returns:
        The model with LoRA adapters attached (modified in place and returned)
    """
    # Freeze ALL layers first (preserve ImageNet features)
    for param in model.parameters():
        param.requires_grad = False
    
    # Unfreeze only the new classification head
    for param in model.fc.parameters():
        param.requires_grad = True

    # Wrap conv1/conv2 of every block in the requested layer groups
    # (default: layer3 and layer4) with LoRA adapters
    for layer_name in target_layers:
        for block in getattr(model, layer_name):
            if hasattr(block, 'conv1'):
                block.conv1 = LoRALayer(block.conv1, rank=rank)
            if hasattr(block, 'conv2'):
                block.conv2 = LoRALayer(block.conv2, rank=rank)
    
    return model
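

# Small helper (illustrative addition): counts how many convolutions were
# wrapped. With the default targets on ResNet34 (layer3 has 6 BasicBlocks,
# layer4 has 3, each contributing conv1 and conv2), this should return 18.
def count_lora_layers(model):
    """Return the number of LoRALayer adapters attached to the model."""
    return sum(1 for module in model.modules() if isinstance(module, LoRALayer))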


def count_parameters(model):
    """
    Count total and trainable parameters in the model.
    
    Returns:
        tuple: (total_params, trainable_params, trainable_percentage)
    """
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    trainable_pct = 100. * trainable_params / total_params
    
    return total_params, trainable_params, trainable_pct
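

# Example usage (a sketch under the assumptions in the docstrings above:
# binary classification with LoRA on layer3/layer4 at rank 8). pretrained=False
# is used here only to avoid downloading weights; the printed trainable share
# should land near the ~465K / ~2.14% figure quoted in apply_lora_to_model.
if __name__ == "__main__":
    model = get_model(num_classes=2, pretrained=False)
    model = apply_lora_to_model(model, rank=8)

    total, trainable, pct = count_parameters(model)
    print(f"Total params:     {total:,}")
    print(f"Trainable params: {trainable:,} ({pct:.2f}%)")
    print(f"LoRA adapters:    {count_lora_layers(model)}")

    # Smoke test: one forward pass on a dummy ImageNet-sized batch
    dummy = torch.randn(2, 3, 224, 224)
    with torch.no_grad():
        logits = model(dummy)
    print(f"Output shape:     {tuple(logits.shape)}")  # expected: (2, 2)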