|
|
|
""" |
|
🚨 EMERGENCY OVERFITTING FIX 🚨

Tiny GraphMamba designed specifically for 140 training samples
|
""" |
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch_geometric.nn import GCNConv |
|
from torch_geometric.datasets import Planetoid |
|
from torch_geometric.transforms import NormalizeFeatures |
|
from torch_geometric.utils import to_undirected, add_self_loops |
|
import torch.optim as optim |
|
import time |
|
|
|
def get_device():
    """Pick the compute device, announce the choice on stdout, and return it.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    # Guard clause: the CPU fallback needs no further setup.
    if not torch.cuda.is_available():
        cpu = torch.device('cpu')
        print("π» Using CPU")
        return cpu

    gpu = torch.device('cuda')
    print(f"π Using GPU: {torch.cuda.get_device_name()}")
    # Release cached allocator blocks before any model is built.
    torch.cuda.empty_cache()
    return gpu
|
|
|
class EmergencyTinyMamba(nn.Module):
    """Very small node classifier intended for a 140-sample training split.

    Pipeline: a two-layer MLP shrinks the input features, one ``GCNConv``
    mixes them over the graph, and a Linear -> Tanh -> Dropout branch is added
    back onto the MLP output with a 0.1 weight before the linear classifier.
    Despite the class name, no state-space layer is used in this block.

    Args:
        input_dim: size of the last dimension of ``x`` expected by ``forward``.
        hidden_dim: width of the hidden representation.
        num_classes: number of logits produced per node.
    """

    def __init__(self, input_dim=1433, hidden_dim=8, num_classes=7):
        super().__init__()

        # Sub-modules are created in this exact order so that seeded weight
        # initialisation is reproducible.
        reducer_layers = [
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.9),
            nn.Linear(32, hidden_dim),
        ]
        self.feature_reduce = nn.Sequential(*reducer_layers)

        self.gcn = GCNConv(hidden_dim, hidden_dim)

        branch_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Dropout(0.9),
        ]
        self.temporal = nn.Sequential(*branch_layers)

        head_layers = [
            nn.Dropout(0.95),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        n_params = sum(p.numel() for p in self.parameters())
        print(f"π¦Ύ Emergency Model - Parameters: {n_params:,}")

    def forward(self, x, edge_index):
        """Return per-node class logits for features ``x`` and graph ``edge_index``."""
        reduced = self.feature_reduce(x)

        # Graph convolution followed by ReLU feeds the side branch only.
        graph_feat = torch.relu(self.gcn(reduced, edge_index))
        branch = self.temporal(graph_feat)

        # Damped residual: the graph branch contributes at 10% strength.
        combined = reduced + branch * 0.1
        return self.classifier(combined)
|
|
|
class MicroMamba(nn.Module):
    """Smaller variant: MLP feature reducer, one graph convolution, linear head.

    Args:
        input_dim: size of the last dimension of ``x`` expected by ``forward``.
        hidden_dim: width of the hidden representation.
        num_classes: number of logits produced per node.
    """

    def __init__(self, input_dim=1433, hidden_dim=4, num_classes=7):
        super().__init__()

        # Creation order is kept fixed so seeded initialisation is reproducible.
        self.features = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.95),
            nn.Linear(16, hidden_dim),
        )

        # Used purely as an ordered container: the convolution needs
        # ``edge_index`` as a second argument, so the Sequential itself is
        # never called.
        self.process = nn.Sequential(
            GCNConv(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.9),
        )

        self.classify = nn.Sequential(
            nn.Dropout(0.95),
            nn.Linear(hidden_dim, num_classes),
        )

        n_params = sum(p.numel() for p in self.parameters())
        print(f"π€ Micro Model - Parameters: {n_params:,}")

    def forward(self, x, edge_index):
        """Return per-node class logits for features ``x`` and graph ``edge_index``."""
        conv, activation, dropout = self.process
        hidden = self.features(x)
        hidden = dropout(activation(conv(hidden, edge_index)))
        return self.classify(hidden)
|
|
|
class NanoMamba(nn.Module):
    """Minimal variant that blends a direct linear path with a graph path.

    The output is ``0.7 * direct + 0.3 * graph``, where ``direct`` is a linear
    layer (followed by dropout) on the raw features and ``graph`` is a
    ``GCNConv`` applied to an 8-dimensional linear projection of them.

    Args:
        input_dim: size of the last dimension of ``x`` expected by ``forward``.
        num_classes: number of logits produced per node.
    """

    def __init__(self, input_dim=1433, num_classes=7):
        super().__init__()

        # Creation order is kept fixed so seeded initialisation is reproducible.
        self.direct = nn.Sequential(
            nn.Linear(input_dim, num_classes),
            nn.Dropout(0.8),
        )

        self.gcn_path = nn.Sequential(
            nn.Linear(input_dim, 8),
            nn.Dropout(0.9),
        )
        self.gcn = GCNConv(8, num_classes)

        n_params = sum(p.numel() for p in self.parameters())
        print(f"βοΈ Nano Model - Parameters: {n_params:,}")

    def forward(self, x, edge_index):
        """Return per-node class logits for features ``x`` and graph ``edge_index``."""
        # The direct path runs first so dropout masks are drawn in the same order.
        logits_direct = self.direct(x)

        projected = self.gcn_path(x)
        logits_graph = self.gcn(projected, edge_index)

        # Fixed 70/30 blend of the two paths.
        return logits_direct * 0.7 + logits_graph * 0.3
|
|
|
def _split_accuracies(model, data):
    """Return ``(train_acc, val_acc, test_acc)`` for ``model`` in eval mode."""
    model.eval()
    with torch.no_grad():
        predictions = model(data.x, data.edge_index).argmax(dim=1)

    def _accuracy(mask):
        return (predictions[mask] == data.y[mask]).float().mean().item()

    return (
        _accuracy(data.train_mask),
        _accuracy(data.val_mask),
        _accuracy(data.test_mask),
    )


def emergency_train(model, data, device, epochs=2000, *, lr=0.001, momentum=0.9,
                    weight_decay=0.5, label_smoothing=0.5, max_grad_norm=0.1,
                    eval_every=100, patience_limit=500):
    """Train ``model`` full-batch with heavy regularisation and report accuracies.

    Uses SGD with momentum and weight decay, label-smoothed cross-entropy and
    gradient-norm clipping. Every ``eval_every`` epochs the train/val/test
    accuracies are printed; training stops early once validation accuracy has
    not improved for ``patience_limit`` epochs.

    The returned metrics are computed from the weights at the moment training
    stops (the last epoch run), not from the best-validation epoch.

    Args:
        model: module called as ``model(data.x, data.edge_index)``.
        data: object exposing ``x``, ``edge_index``, ``y``, ``train_mask``,
            ``val_mask``, ``test_mask`` and a ``to(device)`` method.
        device: device that ``model`` and ``data`` are moved to.
        epochs: maximum number of optimisation steps (one per epoch).
        lr: SGD learning rate.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.
        label_smoothing: smoothing factor for ``nn.CrossEntropyLoss``.
        max_grad_norm: threshold passed to ``clip_grad_norm_``.
        eval_every: evaluation interval in epochs; must be >= 1.
        patience_limit: epochs without validation improvement before stopping.

    Returns:
        dict with keys ``'train_acc'``, ``'val_acc'``, ``'test_acc'`` and
        ``'gap'`` (train accuracy minus validation accuracy).

    Raises:
        ValueError: if ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError(f"eval_every must be >= 1, got {eval_every}")

    model = model.to(device)
    data = data.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    n_params = sum(p.numel() for p in model.parameters())
    # Derive the sample count from the mask instead of assuming 140.
    n_train = int(data.train_mask.sum())
    per_sample = n_params / n_train if n_train else float('nan')

    print("π¨ Emergency Training Protocol")
    print(f" Parameters: {n_params:,}")
    print(f" Per sample: {per_sample:.1f}")
    print(f" Epochs: {epochs}")
    # Report the values actually in use so the banner cannot drift from them.
    print(f" Learning rate: {lr}")
    print(f" Weight decay: {weight_decay}")
    print(f" Label smoothing: {label_smoothing}")

    best_val_acc = 0
    patience = 0

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        if (epoch + 1) % eval_every != 0:
            continue

        train_acc, val_acc, test_acc = _split_accuracies(model, data)
        gap = train_acc - val_acc

        print(f" Epoch {epoch+1:4d}: Train {train_acc:.3f} | Val {val_acc:.3f} | "
              f"Test {test_acc:.3f} | Gap {gap:.3f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience = 0
        else:
            # Patience is measured in epochs, so advance by one eval interval.
            patience += eval_every

        if patience >= patience_limit:
            print(f" Early stopping at epoch {epoch+1}")
            break

    train_acc, val_acc, test_acc = _split_accuracies(model, data)

    return {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc,
        'gap': train_acc - val_acc,
    }
|
|
|
def run_emergency_fix():
    """Train the three tiny models on Cora and print a generalisation report.

    Loads the Planetoid Cora dataset (cached under ``/tmp/Cora``), makes the
    graph undirected with self-loops, then for each model prints its parameter
    budget, runs a forward sanity check, trains it with ``emergency_train``
    and reports the train/validation gap. A failure in one model is printed
    and does not stop the remaining models.

    Returns:
        dict mapping model display name to the metrics dict returned by
        ``emergency_train``; models that failed are absent.
    """
    # NOTE(review): several banner strings below look mis-encoded (emoji read
    # with the wrong codec). They are kept verbatim except for the three that
    # were split across lines and broke the syntax; those use the check mark.
    print("π¨π¨π¨ EMERGENCY OVERFITTING FIX π¨π¨π¨")
    print("π©Ή Ultra-Tiny Models for 140 Training Samples")
    print("=" * 60)

    device = get_device()

    print("\nπ Loading Cora dataset...")
    dataset = Planetoid(root='/tmp/Cora', name='Cora', transform=NormalizeFeatures())
    data = dataset[0].to(device)
    data.edge_index = to_undirected(data.edge_index)
    data.edge_index, _ = add_self_loops(data.edge_index, num_nodes=data.x.size(0))

    # Use the real size of the training split rather than a literal 140.
    n_train = int(data.train_mask.sum())

    print(f"✅ Dataset: {data.num_nodes} nodes, Train: {n_train} samples")
    print("π― Target: <50 parameters per sample = <7,000 total parameters")

    models = {
        'Emergency Tiny (8D)': EmergencyTinyMamba(hidden_dim=8),
        'Micro (4D)': MicroMamba(hidden_dim=4),
        'Nano (Direct)': NanoMamba(),
    }

    results = {}

    for name, model in models.items():
        print(f"\nποΈ Testing {name}...")

        total_params = sum(p.numel() for p in model.parameters())
        params_per_sample = total_params / n_train if n_train else float('inf')

        print(f" Parameters: {total_params:,} ({params_per_sample:.1f} per sample)")

        if params_per_sample < 50:
            print(" ✅ EXCELLENT parameter ratio!")
        elif params_per_sample < 100:
            print(" π Good parameter ratio!")
        else:
            print(" β οΈ Still might overfit")

        try:
            # The data already lives on `device`; move the model there before
            # the sanity forward pass, otherwise a CUDA run fails with a
            # device-mismatch error.
            model = model.to(device)
            with torch.no_grad():
                out = model(data.x, data.edge_index)
            print(f" Forward: {data.x.shape} -> {out.shape} ✅")

            result = emergency_train(model, data, device)
            results[name] = result

            print(" π― Final Results:")
            print(f" Test Accuracy: {result['test_acc']:.3f} ({result['test_acc']*100:.1f}%)")
            print(f" Train Accuracy: {result['train_acc']:.3f}")
            print(f" Overfitting Gap: {result['gap']:.3f}")

            if result['gap'] < 0.1:
                print(" π OVERFITTING SOLVED!")
            elif result['gap'] < 0.2:
                print(" π Much better generalization!")
            elif result['gap'] < 0.3:
                print(" π Improved generalization")
            else:
                print(" β οΈ Still overfitting")

        except Exception as e:
            # Top-level boundary: report the failure and continue with the
            # next model.
            print(f" β Training failed: {e}")

    print(f"\n{'='*60}")
    print("π¨ EMERGENCY RESULTS SUMMARY")
    print(f"{'='*60}")

    best_gap = float('inf')
    best_model = None

    for name, result in results.items():
        print(f"π {name}:")
        print(f" Test: {result['test_acc']:.3f} | Gap: {result['gap']:.3f}")

        # The "best" model is the one with the smallest train/val gap.
        if result['gap'] < best_gap:
            best_gap = result['gap']
            best_model = name

    if best_model:
        print(f"\nπ Best Generalization: {best_model} (Gap: {best_gap:.3f})")

        if best_gap < 0.1:
            print("π MISSION ACCOMPLISHED! Overfitting crisis resolved!")
        elif best_gap < 0.2:
            print("π Significant improvement in generalization!")
        else:
            print("π Progress made, but still work to do...")

    print("\nπ Comparison:")
    print(" Your model: 194K params, Gap ~0.5")
    if best_model and best_gap < 0.3:
        # 0.5 is the reference gap quoted for the larger model above.
        improvement = 0.5 - best_gap
        print(f" Best tiny model: Gap {best_gap:.3f} (Improvement: {improvement:.3f})")
        print(f" π― {improvement/0.5*100:.0f}% reduction in overfitting!")

    print("\nπ‘ Key Lesson: With only 140 samples, bigger β  better!")
    print("π§  Tiny models can achieve competitive performance with much better generalization.")

    return results
|
|
|
if __name__ == "__main__":
    # Script entry point: run the experiment once, then idle.
    results = run_emergency_fix()

    print(f"\nπ Emergency fix complete. Process staying alive...")

    # Block forever in one-minute naps; Ctrl+C ends the process cleanly.
    keep_alive_seconds = 60
    try:
        while True:
            time.sleep(keep_alive_seconds)
    except KeyboardInterrupt:
        print("\nπ Emergency protocol terminated.")