#!/usr/bin/env python3
"""
Stage 4: SigLIP v2 Multi-Head Classifier Training
Trains a SigLIP v2-based multi-head classifier on pseudo-labeled data
"""

import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import SiglipModel, AutoProcessor
import numpy as np
from PIL import Image
from pathlib import Path
import logging
from typing import Dict, List, Any
import pickle
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import LambdaLR

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# NOTE(review): this id looks like a first-generation SigLIP checkpoint, while
# the docstrings and artifact names in this file say "SigLIP v2" - confirm
# which checkpoint is actually intended.
CKPT = "google/siglip-base-patch16-256"


def load_task_config(config_path: str = './task_config.json') -> Dict[str, Any]:
    """Load the task configuration from a JSON file.

    Args:
        config_path: Path to the JSON task configuration.

    Returns:
        The parsed configuration. The rest of this file reads its ``'tasks'``
        list, whose entries are accessed via the keys ``'key'``, ``'name'``,
        ``'type'``, ``'labels'`` and ``'description'``.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Task configuration not found: {config_path}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    logger.info(f"Loaded task configuration with {len(config['tasks'])} tasks")
    return config


class MultiHeadDataset(Dataset):
    """Dataset for multi-head classification with configurable tasks.

    Reads every ``metadata/meta_*_stage2.json`` file below ``data_dir`` and
    keeps the entries that are marked ``stage2_complete``, carry a usable
    value for every configured task, and whose image file exists on disk.
    """

    def __init__(self, data_dir: str, processor, task_config: Dict):
        """Collect all valid stage 2 samples.

        Args:
            data_dir: Directory that contains the ``metadata`` sub-directory.
            processor: Image processor; called as
                ``processor(images=..., return_tensors="pt")``.
            task_config: Task configuration as returned by ``load_task_config``.

        Raises:
            FileNotFoundError: If the metadata directory is missing or holds
                no stage 2 metadata files.
        """
        self.data_dir = Path(data_dir)
        self.processor = processor
        self.task_config = task_config

        # Load all metadata files from stage 2 (with _stage2 suffix)
        metadata_dir = self.data_dir / 'metadata'
        if not metadata_dir.exists():
            raise FileNotFoundError("Metadata directory not found. Run stages 1 and 2 first.")

        # Sorted so that the sample order does not depend on the filesystem's
        # directory enumeration order.
        metadata_files = sorted(metadata_dir.glob('meta_*_stage2.json'))
        if not metadata_files:
            raise FileNotFoundError("No stage 2 metadata files found. Run stage 2 first.")

        # Load all samples
        self.samples = []
        skipped_incomplete = 0

        for meta_file in metadata_files:
            # Deliberately broad: one malformed metadata file must not abort
            # loading of the remaining files; it is logged and counted.
            try:
                with open(meta_file, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)

                # Check if classification is complete
                if not metadata.get('stage2_complete', False):
                    logger.warning(f"Skipping {meta_file} - classification not complete")
                    skipped_incomplete += 1
                    continue

                # Check if classification contains incomplete data (empty or "..." values)
                classification = metadata.get('classification', {})
                if not classification or self._is_incomplete_classification(classification):
                    logger.warning(f"Skipping {meta_file} - incomplete classification data")
                    skipped_incomplete += 1
                    continue

                # Check if image exists
                image_path = metadata['image_path']
                if not os.path.exists(image_path):
                    logger.warning(f"Image not found: {image_path}")
                    skipped_incomplete += 1
                    continue

                self.samples.append(metadata)

            except Exception as e:
                logger.error(f"Error loading {meta_file}: {e}")
                skipped_incomplete += 1

        # Create label mappings (label string -> class index) from task config
        self.label_mappings = {
            task['key']: {label: idx for idx, label in enumerate(task['labels'])}
            for task in self.task_config['tasks']
            if task['type'] == 'multi_class'
        }

        if skipped_incomplete > 0:
            logger.warning(f"Skipped {skipped_incomplete} incomplete samples")

        logger.info(f"Loaded {len(self.samples)} valid samples for training")

    def _is_incomplete_classification(self, classification: Dict) -> bool:
        """Return True if any configured task is missing or has a placeholder value.

        A value counts as incomplete when it is falsy (covers ``""`` and
        ``None``) or equals the literal ``"..."`` marker.
        """
        for task in self.task_config['tasks']:
            task_key = task['key']
            if task_key not in classification:
                return True

            value = classification[task_key]
            if not value or value == "...":
                return True

        return False

    def __len__(self) -> int:
        """Return the number of valid samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return pixel values, per-task integer labels and metadata for one sample."""
        sample = self.samples[idx]

        # Load image. The context manager closes the underlying file as soon
        # as the RGB copy has been made instead of leaving it to the GC.
        with Image.open(sample['image_path']) as img:
            image = img.convert('RGB')

        # Process image only
        inputs = self.processor(
            images=image,
            return_tensors="pt"
        )

        # Convert classifications to labels based on task config
        classification = sample['classification']
        labels = {}

        for task in self.task_config['tasks']:
            task_key = task['key']

            if task['type'] == 'binary':
                # Binary tasks: convert yes/no to 1/0
                labels[task_key] = 1 if classification[task_key] == 'yes' else 0
            elif task['type'] == 'multi_class':
                # Multi-class tasks: convert to index.
                # NOTE: labels not listed in the task config fall back to the
                # first class (index 0).
                label_str = classification[task_key]
                labels[task_key] = self.label_mappings[task_key].get(label_str, 0)

        return {
            # The processor returns a batch of one; drop that batch dimension.
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'labels': labels,
            'metadata': {
                'idx': sample['idx'],
                'caption': sample['caption'],
                'image_path': sample['image_path']
            }
        }


class MultiHeadSiglipClassifier(nn.Module):
    """SigLIP-based multi-head classifier with configurable tasks.

    A frozen SigLIP backbone produces image features; one linear head per
    configured task maps those features to class logits.
    """

    def __init__(self, task_config: Dict, model_name: str = CKPT):
        """Build the backbone and one linear head per task.

        Args:
            task_config: Task configuration; one head is created per entry of
                ``task_config['tasks']`` with ``len(task['labels'])`` outputs.
            model_name: Checkpoint passed to ``SiglipModel.from_pretrained``.
        """
        super().__init__()

        self.task_config = task_config
        self.siglip = SiglipModel.from_pretrained(model_name)

        # Freeze SigLIP parameters initially
        for param in self.siglip.parameters():
            param.requires_grad = False

        # Create classification heads dynamically based on task config
        hidden_size = self.siglip.config.vision_config.hidden_size
        self.classification_heads = nn.ModuleDict()

        for task in task_config['tasks']:
            task_key = task['key']
            num_classes = len(task['labels'])

            # Create linear layer for this task
            head = nn.Linear(hidden_size, num_classes)

            # Initialize with zeros
            head.weight.data.zero_()
            head.bias.data.zero_()

            self.classification_heads[task_key] = head

        logger.info(f"Created {len(self.classification_heads)} classification heads")

    def forward(self, pixel_values):
        """Return a dict mapping each task key to that head's logits."""
        # Get SigLIP image embeddings only
        combined_embeds = self.siglip.get_image_features(pixel_values=pixel_values)

        # Apply all classification heads
        return {
            task_key: head(combined_embeds)
            for task_key, head in self.classification_heads.items()
        }


def calculate_accuracy(predictions, labels):
    """Calculate accuracy for binary/multi-class predictions.

    Args:
        predictions: Logits; the predicted class is the argmax over dim 1.
        labels: Class indices, compared element-wise with the predictions.

    Returns:
        Fraction of correct predictions as a Python float.
    """
    pred_classes = torch.argmax(predictions, dim=1)
    correct = (pred_classes == labels).float()
    return correct.mean().item()


def plot_validation_accuracies(history, task_config, save_path='./checkpoints/validation_accuracies.png'):
    """Create and save validation accuracy plots.

    Draws one subplot per task (three per row) from
    ``history['val_accuracy'][task_key]`` and writes the figure to
    ``save_path``.

    Args:
        history: Training history holding a ``'val_accuracy'`` dict that maps
            each task key to its list of per-epoch accuracies.
        task_config: Task configuration providing ``'key'`` and ``'name'``
            for every task.
        save_path: Output image path; its parent directory is created if
            needed.

    Returns:
        ``(best_accs, final_accs)``: two lists in task order with the maximum
        and the last recorded validation accuracy of each task. Both lists
        are empty when no tasks are configured.
    """
    tasks = [task['key'] for task in task_config['tasks']]
    task_names = [task['name'] for task in task_config['tasks']]

    n_tasks = len(tasks)
    if n_tasks == 0:
        # Nothing to plot; a zero-row subplot grid cannot be created.
        logger.warning("No tasks configured - skipping validation accuracy plots")
        return [], []

    # Calculate grid size
    n_cols = 3
    n_rows = (n_tasks + n_cols - 1) // n_cols  # Ceiling division

    # squeeze=False always yields a 2-D axes array, so flattening works for
    # any grid shape without special-casing a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    fig.suptitle('Training Progress Dashboard', fontsize=16, fontweight='bold')
    axes = axes.flatten()

    epochs = range(1, len(history['val_accuracy'][tasks[0]]) + 1)
    colors = plt.cm.Set1(np.linspace(0, 1, n_tasks))

    # Plot individual validation accuracies
    for ax, task_key, task_name, color in zip(axes, tasks, task_names, colors):
        ax.plot(epochs, history['val_accuracy'][task_key],
                label=task_name, marker='o', color=color, linewidth=2, markersize=4)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Validation Accuracy')
        ax.set_title(f'{task_name} Validation Accuracy')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 1)

    # Hide unused subplots
    for ax in axes[n_tasks:]:
        ax.set_visible(False)

    plt.tight_layout()
    # Make sure the target directory exists so savefig cannot fail on it.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

    logger.info(f"Validation accuracy plots saved to {save_path}")

    # Calculate summary statistics
    best_accs = [max(history['val_accuracy'][task]) for task in tasks]
    final_accs = [history['val_accuracy'][task][-1] for task in tasks]

    return best_accs, final_accs


def _evaluate(model, loader, criterion, task_config, device):
    """Return (mean loss, {task_key: accuracy}) over all samples of ``loader``.

    Per-batch values are weighted by batch size so that a smaller final batch
    does not get the same weight as a full one.
    """
    task_keys = [task['key'] for task in task_config['tasks']]
    loss_sum = 0.0
    correct = {key: 0.0 for key in task_keys}
    seen = 0

    model.eval()
    with torch.no_grad():
        for batch in loader:
            pixel_values = batch['pixel_values'].to(device)
            outputs = model(pixel_values)
            batch_size = pixel_values.size(0)

            batch_loss = 0.0
            for key in task_keys:
                labels = batch['labels'][key].to(device)
                batch_loss += criterion(outputs[key], labels).item()
                correct[key] += calculate_accuracy(outputs[key], labels) * batch_size

            # criterion averages over the batch, so scale back to a sum.
            loss_sum += batch_loss * batch_size
            seen += batch_size

    if seen == 0:
        raise ValueError("Validation loader produced no samples")

    accuracies = {key: float(correct[key] / seen) for key in task_keys}
    return loss_sum / seen, accuracies


def train_multi_head_classifier(data_dir: str, task_config_path: str = './task_config.json',
                                epochs: int = 30, batch_size: int = 4):
    """Train the multi-head SigLIP v2 classifier.

    Loads the task configuration and dataset, trains the classification heads
    (the backbone stays frozen) on a random 80/20 train/validation split, and
    writes the model weights, a full checkpoint, the processor, the training
    history, accuracy plots and JSON summaries to ``./checkpoints``.

    Args:
        data_dir: Directory containing the stage 2 ``metadata`` folder.
        task_config_path: Path to the JSON task configuration.
        epochs: Number of training epochs; must be at least 1.
        batch_size: Batch size for both the training and validation loaders.

    Returns:
        None. Returns early (after logging an error) when fewer than two
        valid samples are available.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
        FileNotFoundError: If the task configuration or metadata is missing.
    """
    if epochs < 1:
        # The LR schedule divides by ``epochs`` and the summaries need at
        # least one finished epoch.
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    logger.info("Starting multi-head classifier training...")

    # Load task configuration
    task_config = load_task_config(task_config_path)
    task_keys = [task['key'] for task in task_config['tasks']]

    # Create checkpoints directory
    checkpoint_dir = Path('./checkpoints')
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Checkpoints will be saved to: {checkpoint_dir}")

    # Save task config to checkpoints for inference
    with open(checkpoint_dir / 'task_config.json', 'w') as f:
        json.dump(task_config, f, indent=2)

    # Load processor and model
    processor = AutoProcessor.from_pretrained(CKPT)
    model = MultiHeadSiglipClassifier(task_config, model_name=CKPT)

    # Dataset and dataloader
    dataset = MultiHeadDataset(data_dir, processor, task_config)

    if len(dataset) == 0:
        logger.error("No training data found!")
        return

    if len(dataset) < 2:
        # A single sample would leave the training split empty.
        logger.error("Need at least 2 valid samples to build a train/validation split")
        return

    # Split dataset (simple train/val split)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")
    model.to(device)

    # Optimizer and loss functions
    # Only parameters that require gradients (the classification heads)
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.AdamW(params, lr=1e-2)

    # Linear cooldown LR scheduler: factor falls linearly per epoch, floored at 0.1
    def linear_cooldown(epoch):
        return max(0.1, 1.0 - (epoch / epochs))

    scheduler = LambdaLR(optimizer, lr_lambda=linear_cooldown)

    criterion = nn.CrossEntropyLoss()

    # Initialize training history
    history = {
        'train_loss': [],
        'val_loss': [],
        'learning_rates': [],
        'val_accuracy': {key: [] for key in task_keys},
        'epoch_val_accuracy': []
    }

    # Training loop
    for epoch in range(epochs):
        # Training phase
        model.train()
        total_train_loss = 0.0

        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()

            # Move to device
            pixel_values = batch['pixel_values'].to(device)

            # Forward pass
            outputs = model(pixel_values)

            # Calculate losses for each task
            losses = [
                criterion(outputs[key], batch['labels'][key].to(device))
                for key in task_keys
            ]

            # Total loss
            total_batch_loss = sum(losses)
            total_batch_loss.backward()
            optimizer.step()

            total_train_loss += total_batch_loss.item()

            if batch_idx % 10 == 0:
                logger.info(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {total_batch_loss.item():.4f}")

        avg_train_loss = total_train_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Record learning rate (the one used during this epoch)
        current_lr = optimizer.param_groups[0]['lr']
        history['learning_rates'].append(current_lr)

        # Validation phase
        avg_val_loss, epoch_accuracies = _evaluate(model, val_loader, criterion, task_config, device)
        history['val_loss'].append(avg_val_loss)

        for key in task_keys:
            history['val_accuracy'][key].append(epoch_accuracies[key])
        history['epoch_val_accuracy'].append(epoch_accuracies.copy())

        logger.info(f"Epoch {epoch+1}/{epochs}")
        logger.info(f"  Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        logger.info(f"  Learning Rate: {current_lr:.6f}")
        logger.info(f"  Val Accuracies: {epoch_accuracies}")

        # Step the learning rate scheduler
        scheduler.step()

    # Create comprehensive checkpoint
    checkpoint = {
        'epoch': epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'history': history,
        'final_accuracies': epoch_accuracies,
        'task_config': task_config
    }

    # Save the trained model and checkpoint
    torch.save(model.state_dict(), checkpoint_dir / 'multi_head_siglip2_classifier.pth')
    torch.save(checkpoint, checkpoint_dir / 'training_checkpoint.pth')
    logger.info(f"Model saved to {checkpoint_dir / 'multi_head_siglip2_classifier.pth'}")
    logger.info(f"Full checkpoint saved to {checkpoint_dir / 'training_checkpoint.pth'}")

    # Save processor for inference
    processor.save_pretrained(checkpoint_dir / 'siglip2_processor')
    logger.info(f"Processor saved to {checkpoint_dir / 'siglip2_processor'}")

    # Save training history as JSON (cast every number to a plain float)
    json_history = {}
    for key, value in history.items():
        if key == 'val_accuracy':
            json_history[key] = {task: [float(acc) for acc in accs] for task, accs in value.items()}
        elif key == 'epoch_val_accuracy':
            json_history[key] = [{task: float(acc) for task, acc in entry.items()} for entry in value]
        else:
            json_history[key] = [float(x) for x in value]

    with open(checkpoint_dir / 'training_history.json', 'w') as f:
        json.dump(json_history, f, indent=2)
    logger.info(f"Training history saved to {checkpoint_dir / 'training_history.json'}")

    # Generate and save validation accuracy plots
    best_accs, final_accs = plot_validation_accuracies(history, task_config, checkpoint_dir / 'validation_accuracies.png')

    # Save detailed validation accuracy summary
    val_summary = {
        'best_accuracies': {
            key: float(max(history['val_accuracy'][key]))
            for key in task_keys
        },
        'final_accuracies': {task: float(acc) for task, acc in epoch_accuracies.items()},
        'average_best_accuracy': float(np.mean(best_accs)),
        'average_final_accuracy': float(np.mean(final_accs)),
        'improvement_per_task': {
            key: float(history['val_accuracy'][key][-1] - history['val_accuracy'][key][0])
            for key in task_keys
        }
    }

    with open(checkpoint_dir / 'validation_summary.json', 'w') as f:
        json.dump(val_summary, f, indent=2)
    logger.info(f"Validation summary saved to {checkpoint_dir / 'validation_summary.json'}")

    # Save final training summary
    final_summary = {
        "model_type": "SigLIP2 Multi-Head Classifier",
        "training_samples": len(train_dataset),
        "validation_samples": len(val_dataset),
        "epochs": epochs,
        "final_train_loss": avg_train_loss,
        "final_val_loss": avg_val_loss,
        "final_accuracies": epoch_accuracies,
        "task_config": task_config,
        "classification_heads": {
            task['key']: f"{task['type']} - {task['description']}"
            for task in task_config['tasks']
        }
    }

    with open(checkpoint_dir / 'stage4_summary.json', 'w') as f:
        json.dump(final_summary, f, indent=2)
    logger.info(f"Stage 4 summary saved to {checkpoint_dir / 'stage4_summary.json'}")

    # Log summary of saved artifacts
    logger.info("="*60)
    logger.info("TRAINING COMPLETE - ARTIFACTS SAVED:")
    logger.info(f"📁 Checkpoint Directory: {checkpoint_dir}")
    logger.info("🤖 Model Weights: multi_head_siglip2_classifier.pth")
    logger.info("💾 Full Checkpoint: training_checkpoint.pth")
    logger.info("🔧 Processor: siglip2_processor/")
    logger.info("⚙️ Task Config: task_config.json")
    logger.info("📊 Training History: training_history.json")
    logger.info("📈 Validation Plots: validation_accuracies.png")
    logger.info("📋 Validation Summary: validation_summary.json")
    logger.info("📄 Stage Summary: stage4_summary.json")
    logger.info("="*60)


def main():
    """Main execution for Stage 4"""
    logger.info("Starting Stage 4: SigLIP v2 Multi-Head Training...")

    # Train classifier
    train_multi_head_classifier('./data', epochs=10, batch_size=2)

    logger.info("Stage 4 completed successfully!")
    logger.info("🎉 Complete pipeline finished! Check ./checkpoints/ for all training artifacts.")


if __name__ == "__main__":
    main()