Spaces:

kfoughali
/

serpent

Running

File size: 51,059 Bytes
"""
Core compression algorithms for Enhanced SPG.
Contains EnhancedSlidingPrecisionGradient and QuantizedKVCache implementations.
STRICT COMPLIANCE: No estimations, only measured values.
"""

import torch
import torch.nn.functional as F
import numpy as np
from typing import Tuple, Optional, Dict, Any, List
import logging
from dataclasses import replace

from config import (
    CompressionConfig, CompressionType, EnhancedSPGConfig,
    ResearchConstants, logger
)


class EnhancedSlidingPrecisionGradient:
    """
    Research-grade Enhanced SPG with RocketKV-style 450x compression capability.
    NO ESTIMATIONS OR HARDCODED VALUES - all parameters from validated config.
    """
    
    def __init__(self, config: EnhancedSPGConfig):
        self.config = config
        self.constants = ResearchConstants()
        self.layer_decay_rates: Optional[List[float]] = None
        self.compression_stats: List[Dict[str, Any]] = []
        
        # Progressive compression state
        self.current_compression_ratio = config.initial_compression_ratio if config.enable_progressive else None
        self.progressive_step = 0
        self.quality_history: List[float] = []
        
        # Adaptive state
        self.adaptive_enabled = config.enable_adaptive
        self.decay_adjustment_rate = config.decay_adjustment_rate
        self.target_perplexity_delta = config.target_perplexity_delta
        
        # RocketKV-style adaptive decomposition
        self.use_adaptive_decomposition = config.use_adaptive_decomposition
        self.use_hybrid_sparse_attention = config.use_hybrid_sparse_attention
        self.target_compression_ratio = config.target_compression_ratio
        
        logger.info(f"Enhanced SPG initialized with {config.magnitude_threshold_mode} magnitude thresholds")
        if self.use_hybrid_sparse_attention:
            logger.info("RocketKV-style Hybrid Sparse Attention enabled")
        
    def initialize_layer_decay_rates(self, n_layers: int) -> None:
        """Initialize per-layer decay rates with validation."""
        if not self.constants.MIN_LAYERS <= n_layers <= self.constants.MAX_LAYERS:
            logger.warning(f"n_layers {n_layers} outside typical range [{self.constants.MIN_LAYERS}, {self.constants.MAX_LAYERS}]")
        
        if self.config.per_layer_decay:
            self.layer_decay_rates = [self.config.base_decay_rate] * n_layers
        else:
            self.layer_decay_rates = [self.config.base_decay_rate] * n_layers
        
        self.n_layers = n_layers
        logger.info(f"Initialized decay rates for {n_layers} layers")
    
    def update_decay_rate(self, layer_idx: int, quality_metric: float, target_quality: float) -> None:
        """Update decay rate for adaptive SPG with proper validation."""
        if not self.adaptive_enabled or self.layer_decay_rates is None:
            return
        
        if not 0 <= layer_idx < len(self.layer_decay_rates):
            logger.error(f"Invalid layer_idx {layer_idx}, valid range: [0, {len(self.layer_decay_rates)})")
            return
        
        # Validate and clamp inputs
        quality_metric = max(0.1, min(1000.0, float(quality_metric)))
        target_quality = max(0.1, min(1000.0, float(target_quality)))
        
        # Compute adjustment
        quality_delta = quality_metric - target_quality
        
        if quality_delta > 0:  # Quality worse than target
            adjustment = -self.decay_adjustment_rate * (quality_delta / target_quality)
        else:  # Quality better than target
            adjustment = self.decay_adjustment_rate * (abs(quality_delta) / target_quality)
        
        # Apply with bounds
        old_rate = self.layer_decay_rates[layer_idx]
        new_rate = max(0.8, min(0.99, old_rate + adjustment))
        self.layer_decay_rates[layer_idx] = new_rate
        
        logger.debug(f"Adaptive SPG Layer {layer_idx}: quality={quality_metric:.3f}, "
                    f"target={target_quality:.3f}, decay_rate: {old_rate:.3f} → {new_rate:.3f}")
    
    def compute_magnitude_importance(self, keys: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
        """
        Compute importance scores based on magnitude statistics.
        This is an EXPLICIT magnitude-based proxy, not an estimation.
        """
        try:
            # Compute L2 norm across head dimension for each token
            k_norms = keys.norm(dim=-1).mean(dim=1).mean(dim=0)  # [seq_len]
            v_norms = values.norm(dim=-1).mean(dim=1).mean(dim=0)  # [seq_len]
            
            # Combine key and value magnitudes (explicit formula)
            importance_scores = (k_norms + v_norms) / 2.0
            
            # Normalize to [0, 1] range for consistent thresholding
            score_min = importance_scores.min()
            score_max = importance_scores.max()
            
            if score_max > score_min:
                importance_scores = (importance_scores - score_min) / (score_max - score_min)
            else:
                importance_scores = torch.ones_like(importance_scores)
            
            logger.debug(f"Computed magnitude importance: min={score_min:.6f}, max={score_max:.6f}")
            return importance_scores
            
        except Exception as e:
            logger.error(f"Error computing magnitude importance: {e}")
            raise

    def estimate_attention_sparsity(self, keys: torch.Tensor, values: torch.Tensor) -> float:
        """Estimate attention pattern sparsity for adaptive decomposition. FAIL FAST on error."""
        try:
            # Compute approximate attention patterns using key-key similarity
            k_norm = F.normalize(keys.float(), p=2, dim=-1)
            attention_approx = torch.matmul(k_norm, k_norm.transpose(-2, -1))
            
            # Measure sparsity as fraction of near-zero attention weights
            # Use configurable threshold from constants
            threshold = self.constants.ATTENTION_SPARSITY_THRESHOLD
            sparse_fraction = (attention_approx.abs() < threshold).float().mean().item()
            
            return sparse_fraction
            
        except Exception as e:
            # FAIL FAST - NO FALLBACK VALUES
            logger.error(f"Failed to estimate attention sparsity: {e}")
            raise RuntimeError(f"Cannot measure attention sparsity: {e}")
    
    def adaptive_stage_split(self, target_ratio: float, seq_len: int, sparsity: float) -> Tuple[float, float]:
        """RocketKV-style adaptive compression decomposition with explicit parameters."""
        # Use explicit formulas from research constants
        if sparsity > self.constants.SPARSITY_HIGH_THRESHOLD:
            stage1_power = self.constants.SPARSE_STAGE1_POWER
        elif sparsity > self.constants.SPARSITY_MEDIUM_THRESHOLD:
            stage1_power = self.constants.BALANCED_STAGE1_POWER
        else:
            stage1_power = self.constants.DENSE_STAGE1_POWER
        
        stage1_ratio = target_ratio ** stage1_power
        stage2_ratio = target_ratio / stage1_ratio
        
        # Bounds checking with explicit limits from config
        stage1_ratio = max(self.config.stage_compression_min, min(self.config.stage_compression_max, stage1_ratio))
        stage2_ratio = max(self.config.stage_compression_min, min(self.config.stage_compression_max, stage2_ratio))
        
        logger.debug(f"Adaptive split: sparsity={sparsity:.3f}, stage1={stage1_ratio:.1f}x, stage2={stage2_ratio:.1f}x")
        return stage1_ratio, stage2_ratio
    
    def snapkv_plus_plus(self, keys: torch.Tensor, values: torch.Tensor, 
                        compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
        """SnapKV++ with GQA support and adaptive pooling - no hardcoded values."""
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        # Adaptive kernel size based on sequence length (from config)
        kernel_size = self.config.get_adaptive_kernel_size(seq_len)
        
        # Compute importance scores with adaptive pooling
        key_norms = keys.norm(dim=-1)  # [batch, heads, seq]
        value_norms = values.norm(dim=-1)
        combined_importance = (key_norms + value_norms) / 2.0
        
        # Multi-head aggregation with adaptive pooling
        if kernel_size > 1:
            # Apply 1D pooling along sequence dimension
            pooled_importance = F.avg_pool1d(
                combined_importance.mean(dim=1).unsqueeze(1),  # [batch, 1, seq]
                kernel_size=kernel_size,
                stride=1,
                padding=kernel_size // 2
            ).squeeze(1)  # [batch, seq]
            # Ensure pooled output matches original sequence length
            if pooled_importance.shape[-1] != seq_len:
                pooled_importance = pooled_importance[:, :seq_len]
        else:
            pooled_importance = combined_importance.mean(dim=1)
        
        # Aggregate across batch
        final_importance = pooled_importance.mean(dim=0)  # [seq]
        
        # Ensure importance tensor matches sequence length
        if final_importance.shape[0] != seq_len:
            final_importance = final_importance[:seq_len]
        
        # Preserve sink and recent tokens
        preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
        preserve_mask[:min(self.config.sink_tokens, seq_len)] = True
        preserve_mask[-min(self.config.recent_window, seq_len):] = True
        
        # Top-k selection for remaining tokens
        n_keep = max(self.config.sink_tokens + self.config.recent_window,
                    int(seq_len / compression_ratio))
        n_keep = min(n_keep, seq_len)  # Ensure we don't exceed sequence length
        remaining_slots = n_keep - preserve_mask.sum().item()
        
        if remaining_slots > 0:
            masked_importance = final_importance.clone()
            masked_importance[preserve_mask] = -float('inf')
            
            available_indices = (~preserve_mask).nonzero(as_tuple=True)[0]
            if len(available_indices) > 0:
                k = min(remaining_slots, len(available_indices))
                if k > 0:
                    _, relative_top_indices = torch.topk(masked_importance[available_indices], k)
                    absolute_top_indices = available_indices[relative_top_indices]
                    preserve_mask[absolute_top_indices] = True
        
        # Extract retained tokens with bounds checking
        retained_indices = torch.where(preserve_mask)[0]
        retained_indices = retained_indices[retained_indices < seq_len]  # Safety check
        
        keys_compressed = keys[:, :, retained_indices, :]
        values_compressed = values[:, :, retained_indices, :]
        
        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
        logger.debug(f"SnapKV++: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
        
        return keys_compressed, values_compressed, retained_indices.tolist()
    
    def hybrid_sparse_attention(self, keys: torch.Tensor, values: torch.Tensor, 
                               head_budget: int, seq_budget: int) -> Dict[str, Any]:
        """RocketKV-style Hybrid Sparse Attention for Stage 2 - no hardcoded values."""
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        # 1. Head-wise importance scoring
        head_importance = (
            keys.float().pow(2).sum(dim=(-1, -2)).sum(dim=0) +  # Sum over batch, seq, hidden
            values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
        )  # [n_heads]
        
        # Select top heads
        actual_head_budget = min(head_budget, n_heads)
        _, top_head_indices = torch.topk(head_importance, actual_head_budget)
        
        compressed_data = {
            'keys': {},
            'values': {},
            'metadata': {
                'head_selection': top_head_indices.tolist(),
                'original_shape': keys.shape,
                'compression_type': 'hybrid_sparse_attention'
            }
        }
        
        # 2. Sequence-wise top-k selection per selected head
        for head_idx in top_head_indices:
            head_keys = keys[:, head_idx:head_idx+1, :, :]  # Keep head dimension
            head_values = values[:, head_idx:head_idx+1, :, :]
            
            # Compute sequence importance for this head
            seq_importance = (
                head_keys.norm(dim=-1).squeeze(1).mean(dim=0) +  # [seq]
                head_values.norm(dim=-1).squeeze(1).mean(dim=0)
            ) / 2.0
            
            # Apply position-based boost (from research constants)
            position_boost = torch.ones_like(seq_importance)
            position_boost[:self.config.sink_tokens] *= self.constants.POSITION_BOOST_SINK
            position_boost[-self.config.recent_window:] *= self.constants.POSITION_BOOST_RECENT
            boosted_importance = seq_importance * position_boost
            
            # Select top tokens for this head
            actual_seq_budget = min(seq_budget, seq_len)
            _, top_token_indices = torch.topk(boosted_importance, actual_seq_budget)
            
            # Store compressed data
            head_key = f'head_{head_idx.item()}'
            compressed_data['keys'][head_key] = {
                'data': head_keys[:, :, top_token_indices, :].clone(),
                'indices': top_token_indices.tolist()
            }
            compressed_data['values'][head_key] = {
                'data': head_values[:, :, top_token_indices, :].clone(),
                'indices': top_token_indices.tolist()
            }
        
        return compressed_data
    
    def stage1_permanent_eviction(self, keys: torch.Tensor, values: torch.Tensor, 
                                 layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
        """
        Stage 1: RocketKV-style permanent eviction with SnapKV++ or magnitude-guided approach.
        """
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        if self.use_adaptive_decomposition:
            # Use adaptive compression split
            sparsity = self.estimate_attention_sparsity(keys, values)  # May raise if fails
            stage1_ratio, _ = self.adaptive_stage_split(self.target_compression_ratio, seq_len, sparsity)
        else:
            stage1_ratio = self.config.stage1_compression_ratio
        
        # Choose compression method based on configuration
        if self.config.use_snapkv_plus_plus:
            return self.snapkv_plus_plus(keys, values, stage1_ratio)
        else:
            # Original magnitude-guided approach
            return self._magnitude_guided_stage1(keys, values, layer_idx, stage1_ratio)
    
    def _magnitude_guided_stage1(self, keys: torch.Tensor, values: torch.Tensor,
                                layer_idx: int, compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
        """Original magnitude-guided Stage 1 eviction with explicit parameters."""
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        # Calculate retention based on compression ratio
        retention_ratio = 1.0 / compression_ratio
        min_retain = self.config.sink_tokens + self.config.recent_window
        n_retain = max(min_retain, int(seq_len * retention_ratio))
        
        # Apply layer-specific constraints (from research constants)
        layer_position = layer_idx / max(getattr(self, 'n_layers', 12) - 1, 1)
        if layer_position <= 0.5:  # Early layers
            max_retain = int(seq_len * self.constants.EARLY_LAYER_MAX_RETENTION)
        else:  # Late layers
            max_retain = int(seq_len * self.constants.LATE_LAYER_MAX_RETENTION)
        
        n_retain = min(n_retain, max_retain)
        
        # Compute magnitude-based importance
        importance_scores = self.compute_magnitude_importance(keys, values)
        
        # Quality preservation: boost recent tokens (explicit formula from config)
        recent_boost = torch.zeros_like(importance_scores)
        if self.config.recent_window > 0:
            recent_boost[-self.config.recent_window:] = importance_scores.max() * self.config.recent_boost_factor
        importance_scores = importance_scores + recent_boost
        
        # Initialize preservation mask
        preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
        preserve_mask[:self.config.sink_tokens] = True
        preserve_mask[-self.config.recent_window:] = True
        
        # Select additional tokens based on importance
        remaining_slots = n_retain - preserve_mask.sum().item()
        if remaining_slots > 0:
            masked_importance = importance_scores.clone()
            masked_importance[preserve_mask] = -float('inf')
            
            # Use configured threshold (not hardcoded)
            magnitude_threshold = torch.quantile(
                importance_scores.float(), 
                self.config.get_magnitude_threshold()
            )
            
            below_threshold = masked_importance < magnitude_threshold
            masked_importance[below_threshold] = -float('inf')
            
            available = (masked_importance > -float('inf')).sum().item()
            k = min(remaining_slots, available)
            if k > 0:
                _, top_indices = torch.topk(masked_importance, k)
                preserve_mask[top_indices] = True
        
        # Extract retained tokens
        retained_indices = torch.where(preserve_mask)[0]
        keys_stage1 = keys[:, :, retained_indices, :]
        values_stage1 = values[:, :, retained_indices, :]
        
        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
        logger.debug(f"Stage 1 Layer {layer_idx}: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
        
        return keys_stage1, values_stage1, retained_indices.tolist()
    
    def stage2_multi_dimensional_compression(self, keys: torch.Tensor, values: torch.Tensor,
                                           layer_idx: int, retained_indices: List[int]) -> Dict[str, Any]:
        """
        Stage 2: RocketKV-style Hybrid Sparse Attention compression.
        Uses dynamic top-k selection with head and sequence reductions.
        """
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        if self.use_hybrid_sparse_attention:
            # RocketKV-style compression with adaptive budgets
            sparsity = self.estimate_attention_sparsity(keys, values)  # May raise if fails
            
            if self.use_adaptive_decomposition:
                _, stage2_ratio = self.adaptive_stage_split(
                    self.target_compression_ratio, seq_len, sparsity
                )
            else:
                stage2_ratio = self.config.stage2_compression_ratio
            
            # Dynamic budgets based on compression target (from config)
            head_retention_ratio = self.config.get_head_retention_ratio()
            head_budget = max(1, int(n_heads * head_retention_ratio))
            seq_budget = max(self.config.min_tokens_for_stability, int(seq_len / stage2_ratio))
            
            # Use hybrid sparse attention
            compressed_data = self.hybrid_sparse_attention(keys, values, head_budget, seq_budget)
            
            # Add metadata
            compressed_data['metadata'].update({
                'stage1_retained_indices': retained_indices,
                'original_shape_after_stage1': keys.shape,
                'original_dtype': keys.dtype,
                'layer_idx': layer_idx,
                'sparsity_estimate': sparsity,
                'stage2_compression_ratio': stage2_ratio,
                'head_budget': head_budget,
                'seq_budget': seq_budget,
                'head_retention_ratio': head_retention_ratio
            })
            
            return compressed_data
        
        # Fallback to original multi-dimensional compression
        return self._original_stage2_compression(keys, values, layer_idx, retained_indices)
    
    def _original_stage2_compression(self, keys: torch.Tensor, values: torch.Tensor,
                                   layer_idx: int, retained_indices: List[int]) -> Dict[str, Any]:
        """Original Stage 2 implementation for comparison."""
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        # Compute importance for remaining tokens
        importance_scores = self.compute_magnitude_importance(keys, values)
        
        # Combine with position-based decay (explicit formula)
        decay_rate = self.layer_decay_rates[layer_idx] if self.layer_decay_rates else self.config.base_decay_rate
        position_scores = torch.pow(
            decay_rate, 
            torch.arange(seq_len, device=keys.device).float() / self.config.decay_normalization
        )
        
        combined_importance = importance_scores * position_scores
        
        compressed_data = {
            'keys': {},
            'values': {},
            'metadata': {
                'stage1_retained_indices': retained_indices,
                'importance_scores': combined_importance,
                'original_shape_after_stage1': keys.shape,
                'original_dtype': keys.dtype,
                'layer_idx': layer_idx,
                'magnitude_threshold_mode': self.config.magnitude_threshold_mode,
                'compression_type': 'original_multi_dimensional'
            }
        }
        
        # Head dimension compression with explicit parameters
        if self.config.enable_head_compression:
            n_important_heads = max(1, int(n_heads * self.config.head_compression_ratio))
            
            # UPDATED: Always reserve top head_fp16_reserve heads at full precision
            n_reserved_heads = min(getattr(self.config, 'head_fp16_reserve', 2), n_heads)
            n_important_heads = max(n_reserved_heads, n_important_heads)
            
            # Compute head importance (explicit calculation)
            head_importance = (
                keys.float().pow(2).sum(dim=(-1, -2)).sum(dim=0) +
                values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
            )
            
            _, important_head_indices = torch.topk(head_importance, n_important_heads)
            other_head_indices = torch.tensor(
                [h for h in range(n_heads) if h not in important_head_indices.tolist()],
                device=keys.device, dtype=torch.long
            )
            
            # Store important heads at full precision
            compressed_data['keys']['heads_fp16'] = {
                'data': keys[:, important_head_indices, :, :].clone(),
                'indices': important_head_indices.tolist()
            }
            compressed_data['values']['heads_fp16'] = {
                'data': values[:, important_head_indices, :, :].clone(),
                'indices': important_head_indices.tolist()
            }
            
            if other_head_indices.numel() == 0:
                return compressed_data
            
            seq_keys = keys[:, other_head_indices, :, :]
            seq_values = values[:, other_head_indices, :, :]
        else:
            seq_keys = keys
            seq_values = values
        
        # Sequence dimension compression with explicit ratios
        levels = self.config.precision_levels
        
        # Explicit top-K selection for FP16
        keep_fp16 = max(0, int(seq_len * self.config.sequence_compression_ratio))
        top_fp16 = torch.topk(combined_importance, k=keep_fp16).indices if keep_fp16 > 0 else torch.empty(0, dtype=torch.long, device=keys.device)
        is_fp16 = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
        if keep_fp16 > 0:
            is_fp16[top_fp16] = True
        
        # Vectorized token binning
        thresh = torch.tensor([pl.threshold for pl in levels], device=keys.device)
        thresh_sorted, order = torch.sort(thresh, descending=True)
        level_ids = torch.bucketize(combined_importance, thresh_sorted, right=False)
        
        # Assign tokens to precision levels
        for i in range(seq_len):
            if is_fp16[i]:
                precision_key = 'seq_fp16'
            else:
                level_idx = min(level_ids[i].item(), len(levels) - 1)
                level = levels[order[level_idx]]
                
                if level.bits is not None:
                    precision_key = f'seq_{level.bits}bit'
                else:
                    precision_key = f'seq_{level.name}'
            
            if precision_key not in compressed_data['keys']:
                compressed_data['keys'][precision_key] = {
                    'indices': [], 'data': None, 'scale': None, 'zero': None
                }
                compressed_data['values'][precision_key] = {
                    'indices': [], 'data': None, 'scale': None, 'zero': None
                }
            
            compressed_data['keys'][precision_key]['indices'].append(i)
            compressed_data['values'][precision_key]['indices'].append(i)
        
        # Store data with aggressive precision (FP16 for most important tokens)
        keys_to_delete = []
        for precision_key in list(compressed_data['keys'].keys()):
            if not precision_key.startswith('seq_'):
                continue
            
            indices = compressed_data['keys'][precision_key]['indices']
            if not indices:
                keys_to_delete.append(precision_key)
                continue
            
            if precision_key == 'seq_discard':
                keys_to_delete.append(precision_key)
                continue
            
            idx_tensor = torch.tensor(indices, device=keys.device, dtype=torch.long)
            k_slice = seq_keys.index_select(2, idx_tensor)
            v_slice = seq_values.index_select(2, idx_tensor)
            
            # Store with aggressive precision - only FP16 for ultra-selective tokens
            compressed_data['keys'][precision_key]['data'] = k_slice.clone()
            compressed_data['values'][precision_key]['data'] = v_slice.clone()
        
        # Clean up empty keys
        for pk in keys_to_delete:
            compressed_data['keys'].pop(pk, None)
            compressed_data['values'].pop(pk, None)
        
        return compressed_data
    
    def compress_with_enhanced_gradient(self, keys: torch.Tensor, values: torch.Tensor,
                                       layer_idx: int, current_position: int) -> Dict[str, Any]:
        """
        Main compression function with explicit two-stage approach.
        """
        if not self.config.enable_two_stage:
            return self._fallback_to_original_spg(keys, values, layer_idx, current_position)
        
        try:
            # Record original shape
            orig_shape_full = keys.shape
            
            # Stage 1: Permanent eviction
            keys_stage1, values_stage1, retained_indices = self.stage1_permanent_eviction(
                keys, values, layer_idx
            )
            
            # Stage 2: Multi-dimensional compression
            compressed_data = self.stage2_multi_dimensional_compression(
                keys_stage1, values_stage1, layer_idx, retained_indices
            )
            
            # Add metadata
            compressed_data['metadata']['original_full_shape'] = orig_shape_full
            
            # Progressive compression
            if self.config.enable_progressive:
                compressed_data = self._apply_progressive_compression(compressed_data, layer_idx)
            
            return compressed_data
            
        except Exception as e:
            logger.error(f"Error in enhanced compression for layer {layer_idx}: {e}")
            raise
    
    def _fallback_to_original_spg(self, keys: torch.Tensor, values: torch.Tensor,
                                 layer_idx: int, current_position: Optional[int]) -> Dict[str, Any]:
        """Fallback to original SPG implementation with actual data storage."""
        batch_size, n_heads, seq_len, head_dim = keys.shape
        
        # Original position-based precision computation
        device = keys.device
        precision_scores = torch.zeros(seq_len, device=device)
        
        decay_rate = self.layer_decay_rates[layer_idx] if self.layer_decay_rates else self.config.base_decay_rate
        
        positions = torch.arange(seq_len, device=device)
        if current_position is None or not isinstance(current_position, (int, float)):
            current_position = seq_len
        current_position = int(current_position)
        distances = torch.tensor(current_position, device=device, dtype=positions.dtype) - positions
        
        precision_scores = torch.pow(decay_rate, distances.float() / self.config.decay_normalization)
        precision_scores[:self.config.sink_tokens] = 1.0
        
        recent_mask = distances < self.config.recent_window
        precision_scores[recent_mask] = torch.maximum(
            precision_scores[recent_mask],
            torch.tensor(self.config.recent_min_precision, device=device)
        )
        
        # Apply precision levels with actual data storage
        compressed_data = {
            'keys': {},
            'values': {},
            'metadata': {
                'precision_scores': precision_scores,
                'original_shape': keys.shape,
                'original_dtype': keys.dtype,
                'layer_idx': layer_idx,
                'compression_type': 'original_spg'
            }
        }
        
        # Exclusive binning for precision levels
        levels = self.config.precision_levels
        for i, score in enumerate(precision_scores):
            for j, level in enumerate(levels):
                lo = level.threshold
                hi = levels[j-1].threshold if j > 0 else float('inf')
                
                if lo <= score < hi:
                    if level.bits is not None:
                        precision_key = f'{level.bits}bit'
                    else:
                        precision_key = level.name
                    
                    if precision_key not in compressed_data['keys']:
                        compressed_data['keys'][precision_key] = {
                            'indices': [], 'data': None, 'scale': None, 'zero': None
                        }
                        compressed_data['values'][precision_key] = {
                            'indices': [], 'data': None, 'scale': None, 'zero': None
                        }
                    
                    compressed_data['keys'][precision_key]['indices'].append(i)
                    compressed_data['values'][precision_key]['indices'].append(i)
                    break
        
        # Process data
        keys_to_delete = []
        for precision_key in list(compressed_data['keys'].keys()):
            indices = compressed_data['keys'][precision_key]['indices']
            if not indices:
                keys_to_delete.append(precision_key)
                continue
                
            if precision_key == 'discard':
                keys_to_delete.append(precision_key)
                continue
                
            level_indices = torch.tensor(indices, device=device, dtype=torch.long)
            k_slice = keys.index_select(2, level_indices)
            v_slice = values.index_select(2, level_indices)
            
            # Store with FP16 precision (simplified for original SPG)
            compressed_data['keys'][precision_key]['data'] = k_slice.clone()
            compressed_data['values'][precision_key]['data'] = v_slice.clone()
        
        # Clean up empty keys
        for pk in keys_to_delete:
            compressed_data['keys'].pop(pk, None)
            compressed_data['values'].pop(pk, None)
        
        return compressed_data
    
    def _apply_progressive_compression(self, compressed_data: Dict, layer_idx: int) -> Dict:
        """Apply progressive compression with relative quality change detection."""
        if len(self.quality_history) >= self.constants.PROGRESSIVE_QUALITY_WINDOW:
            recent = float(np.mean(self.quality_history[-self.constants.PROGRESSIVE_RECENT_WINDOW:]))
            prev = float(np.mean(self.quality_history[-self.constants.PROGRESSIVE_QUALITY_WINDOW:-self.constants.PROGRESSIVE_RECENT_WINDOW]))
            rel_delta = (recent - prev) / max(prev, 1e-9)
            
            if rel_delta <= self.config.quality_threshold:
                old_ratio = self.current_compression_ratio or self.config.initial_compression_ratio
                new_ratio = min(old_ratio * self.config.progression_factor, self.config.max_compression_ratio)
                
                if new_ratio > old_ratio:
                    self.current_compression_ratio = new_ratio
                    compression_factor = new_ratio / old_ratio
                    
                    # Tighten compression ratios (use configurable minimum from config)
                    self.config.head_compression_ratio = max(self.config.progressive_min_ratio, 
                        self.config.head_compression_ratio / compression_factor)
                    self.config.sequence_compression_ratio = max(self.config.progressive_min_ratio,
                        self.config.sequence_compression_ratio / compression_factor)
                    
                    self.progressive_step += 1
                    
                    logger.info(f"Progressive step {self.progressive_step}: rel_delta={rel_delta:.4f}, new_ratio={new_ratio:.1f}x")
        
        compressed_data['metadata']['progressive_compression_ratio'] = self.current_compression_ratio
        compressed_data['metadata']['progressive_step'] = self.progressive_step
        
        return compressed_data
    
    def decompress(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decompress enhanced SPG compressed data."""
        metadata = compressed_data['metadata']
        
        if metadata.get('compression_type') == 'original_spg':
            return self._decompress_original_spg(compressed_data)
        
        return self._decompress_enhanced_spg(compressed_data)
    
    def _decompress_enhanced_spg(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decompress enhanced multi-stage compressed data with HSA support."""
        metadata = compressed_data['metadata']
        
        # Get device from first available tensor
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        for storage_type in ['keys', 'values']:
            for key, data in compressed_data[storage_type].items():
                if isinstance(data, dict) and 'data' in data and isinstance(data['data'], torch.Tensor):
                    device = data['data'].device
                    break
            if device != torch.device('cuda' if torch.cuda.is_available() else 'cpu'):
                break
        
        # Handle hybrid sparse attention format
        if metadata.get('compression_type') == 'hybrid_sparse_attention':
            return self._decompress_hybrid_sparse_attention(compressed_data)
        
        # Original enhanced SPG decompression
        original_shape = metadata['original_shape_after_stage1']
        original_dtype = metadata['original_dtype']
        
        keys_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
        values_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
        
        # Decompress head dimension data first
        if 'heads_fp16' in compressed_data['keys']:
            head_indices = compressed_data['keys']['heads_fp16']['indices']
            head_idx_tensor = torch.tensor(head_indices, device=device, dtype=torch.long)
            keys_full[:, head_idx_tensor, :, :] = compressed_data['keys']['heads_fp16']['data']
            values_full[:, head_idx_tensor, :, :] = compressed_data['values']['heads_fp16']['data']
            
            if self.config.enable_head_compression:
                n_heads = original_shape[1]
                other_head_indices = torch.tensor([h for h in range(n_heads) if h not in head_indices],
                                                 device=device, dtype=torch.long)
            else:
                other_head_indices = head_idx_tensor
        else:
            other_head_indices = torch.arange(original_shape[1], device=device, dtype=torch.long)
        
        # Decompress sequence dimension data
        for precision_key in [k for k in compressed_data['keys'].keys() if k.startswith('seq_')]:
            if 'data' not in compressed_data['keys'][precision_key]:
                continue
                
            indices = compressed_data['keys'][precision_key]['indices']
            idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
            
            # All data stored as FP16 in this simplified version
            keys_full[:, other_head_indices, :, :].index_copy_(2, idx_tensor, 
                compressed_data['keys'][precision_key]['data'])
            values_full[:, other_head_indices, :, :].index_copy_(2, idx_tensor,
                compressed_data['values'][precision_key]['data'])
        
        return keys_full, values_full
    
    def _decompress_hybrid_sparse_attention(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decompress RocketKV-style hybrid sparse attention data."""
        metadata = compressed_data['metadata']
        original_shape = metadata['original_shape']
        
        # Get device from first available tensor
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        for head_key in compressed_data['keys'].keys():
            if head_key.startswith('head_'):
                device = compressed_data['keys'][head_key]['data'].device
                break
        
        # Initialize full tensors
        keys_full = torch.zeros(original_shape, dtype=torch.float16, device=device)
        values_full = torch.zeros(original_shape, dtype=torch.float16, device=device)
        
        # Reconstruct selected heads with their tokens
        for head_key in compressed_data['keys'].keys():
            if not head_key.startswith('head_'):
                continue
                
            head_idx = int(head_key.split('_')[1])
            head_data_k = compressed_data['keys'][head_key]
            head_data_v = compressed_data['values'][head_key]
            
            token_indices = head_data_k['indices']
            
            # Place data in the correct head and token positions
            keys_full[:, head_idx:head_idx+1, token_indices, :] = head_data_k['data']
            values_full[:, head_idx:head_idx+1, token_indices, :] = head_data_v['data']
        
        return keys_full, values_full
    
    def _decompress_original_spg(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decompress original SPG data."""
        metadata = compressed_data['metadata']
        original_shape = metadata['original_shape']
        original_dtype = metadata['original_dtype']
        device = metadata['precision_scores'].device
        
        keys_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
        values_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
        
        for precision_key in compressed_data['keys']:
            data_dict = compressed_data['keys'][precision_key]
            if 'data' in data_dict and 'indices' in data_dict:
                indices = data_dict['indices']
                idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
                
                # All data stored as original precision
                keys_full.index_copy_(2, idx_tensor, data_dict['data'])
                values_full.index_copy_(2, idx_tensor, compressed_data['values'][precision_key]['data'])
        
        return keys_full, values_full
    
    def get_memory_footprint(self, compressed_data: Dict[str, Any]) -> int:
        """
        Calculate ACTUAL memory usage - NO ESTIMATES.
        Every byte is accounted for explicitly.
        """
        total_bytes = 0
        
        try:
            # Count all stored tensors
            for storage_type in ['keys', 'values']:
                for key, data in compressed_data[storage_type].items():
                    if isinstance(data, dict):
                        # Data tensors
                        if 'data' in data and isinstance(data['data'], torch.Tensor):
                            total_bytes += data['data'].nelement() * data['data'].element_size()
                        
                        # Scale/zero tensors
                        if 'scale' in data and isinstance(data['scale'], torch.Tensor):
                            total_bytes += data['scale'].nelement() * data['scale'].element_size()
                        if 'zero' in data and isinstance(data['zero'], torch.Tensor):
                            total_bytes += data['zero'].nelement() * data['zero'].element_size()
                        
                        # Levels tensor for bit-packed data
                        if 'levels' in data and isinstance(data['levels'], torch.Tensor):
                            total_bytes += data['levels'].nelement() * data['levels'].element_size()
                        
                        # Metadata overhead (measured, not estimated)
                        if 'meta' in data and isinstance(data['meta'], dict):
                            total_bytes += self.constants.INT2_METADATA_BYTES
                        
                        # Indices (count only once under keys to avoid double counting)
                        if storage_type == 'keys' and 'indices' in data and data['indices']:
                            total_bytes += len(data['indices']) * self.constants.INDEX_SIZE_BYTES
            
            # Metadata overhead
            total_bytes += self.constants.METADATA_OVERHEAD_BYTES
            
            logger.debug(f"Measured memory footprint: {total_bytes} bytes ({total_bytes/1024/1024:.2f} MB)")
            return total_bytes
            
        except Exception as e:
            logger.error(f"Error calculating memory footprint: {e}")
            raise
    
    def update_quality_feedback(self, layer_idx: int, quality_metric: float):
        """Update quality feedback for progressive compression."""
        self.quality_history.append(quality_metric)
        
        # Keep only recent history
        if len(self.quality_history) > self.constants.QUALITY_HISTORY_MAX_SIZE:
            self.quality_history = self.quality_history[-self.constants.QUALITY_HISTORY_MAX_SIZE:]


class QuantizedKVCache:
    """Enhanced quantized KV cache with working multi-stage SPG support."""
    
    def __init__(self, config: CompressionConfig):
        self.config = config
        self.compressed_data = {}
        self.dtypes = {}
        
        # Initialize enhanced SPG with RocketKV features
        if config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG]:
            spg_config = replace(config.enhanced_spg_config, 
                               enable_two_stage=False,
                               enable_adaptive=(config.compression_type == CompressionType.ADAPTIVE_SPG))
            self.spg = EnhancedSlidingPrecisionGradient(spg_config)
        elif config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
            enhanced_config = config.enhanced_spg_config
            if config.compression_type == CompressionType.PROGRESSIVE_SPG:
                enhanced_config.enable_progressive = True
            self.spg = EnhancedSlidingPrecisionGradient(enhanced_config)
        else:
            self.spg = None
        
        self.current_position = 0
        self.quality_history = []
        self.n_layers = None
        
    def compress_and_store(self, layer_idx: int, keys: torch.Tensor, values: torch.Tensor):
        """Compress and store KV pairs with enhanced SPG support."""
        key_dtype = keys.dtype
        value_dtype = values.dtype
        
        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
                                           CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
            if self.spg.layer_decay_rates is None:
                if self.n_layers is None:
                    raise ValueError("Model layer count not set - call detect_model_layers first")
                self.spg.initialize_layer_decay_rates(self.n_layers)
            
            if self.config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
                compressed_data = self.spg.compress_with_enhanced_gradient(
                    keys, values, layer_idx, self.current_position
                )
            else:
                compressed_data = self.spg._fallback_to_original_spg(
                    keys, values, layer_idx, self.current_position
                )
            
            self.compressed_data[layer_idx] = compressed_data
            self.dtypes[layer_idx] = {'keys': key_dtype, 'values': value_dtype}
        else:
            # No compression - store original tensors
            self.compressed_data[layer_idx] = {
                'keys': {'original': {'data': keys.clone(), 'indices': list(range(keys.shape[2]))}},
                'values': {'original': {'data': values.clone(), 'indices': list(range(values.shape[2]))}},
                'metadata': {
                    'compression_type': 'none',
                    'original_shape': keys.shape,
                    'original_dtype': keys.dtype
                }
            }
            self.dtypes[layer_idx] = {'keys': key_dtype, 'values': value_dtype}
    
    def get_decompressed(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get decompressed KV pairs with enhanced SPG support."""
        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
                                           CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
            if layer_idx in self.compressed_data:
                return self.spg.decompress(self.compressed_data[layer_idx])
            return None, None
        else:
            # No compression - return original tensors
            if layer_idx in self.compressed_data:
                data = self.compressed_data[layer_idx]
                return data['keys']['original']['data'], data['values']['original']['data']
            return None, None
    
    def get_memory_footprint(self) -> int:
        """Calculate actual memory usage with enhanced SPG support."""
        total_bytes = 0
        constants = ResearchConstants()
        
        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
                                           CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
            for layer_idx in self.compressed_data:
                total_bytes += self.spg.get_memory_footprint(self.compressed_data[layer_idx])
        else:
            # No compression - calculate uncompressed memory
            for layer_idx in self.compressed_data:
                data = self.compressed_data[layer_idx]
                keys_data = data['keys']['original']['data']
                values_data = data['values']['original']['data']
                total_bytes += keys_data.nelement() * keys_data.element_size()
                total_bytes += values_data.nelement() * values_data.element_size()
                total_bytes += constants.METADATA_OVERHEAD_BYTES
        
        return total_bytes
    
    def update_position(self, new_position: int):
        """Update current generation position."""
        self.current_position = new_position
    
    def update_quality_feedback(self, layer_idx: int, quality_metric: float):
        """Provide quality feedback for adaptive methods."""
        if self.config.compression_type == CompressionType.ADAPTIVE_SPG and hasattr(self.spg, 'update_decay_rate'):
            target_quality = self.config.enhanced_spg_config.target_perplexity_delta
            self.spg.update_decay_rate(layer_idx, quality_metric, target_quality)
            self.quality_history.append((layer_idx, quality_metric))
        elif self.config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
            self.spg.update_quality_feedback(layer_idx, quality_metric)


def detect_model_layers(model) -> int:
    """Detect the number of transformer layers with comprehensive validation."""
    config_attrs = [
        'num_hidden_layers',
        'n_layer',
        'num_layers',
        'n_layers',
        'decoder_layers',
        'n_head_layers',
    ]
    
    for attr in config_attrs:
        if hasattr(model.config, attr):
            n_layers = getattr(model.config, attr)
            if isinstance(n_layers, int) and n_layers > 0:
                logger.info(f"Detected {n_layers} layers from config.{attr}")
                return n_layers
    
    layer_patterns = [
        'layer', 'layers', 'h', 'blocks', 'decoder.layers', 'transformer_blocks', 'decoderLayer',
    ]
    
    for module_name, module in model.named_modules():
        for pattern in layer_patterns:
            if pattern in module_name.lower():
                if hasattr(module, '__len__'):
                    n_layers = len(module)
                    if n_layers > 0:
                        logger.info(f"Detected {n_layers} layers by counting {module_name}")
                        return n_layers
    
    decoder_layer_types = [
        'TransformerBlock', 'DecoderLayer', 'EncoderLayer', 'Block', 'Layer',
        'GPT2Block', 'LlamaDecoderLayer', 'MistralDecoderLayer', 'OPTDecoderLayer',
    ]
    
    layers = []
    for module in model.modules():
        module_type = type(module).__name__
        if any(layer_type in module_type for layer_type in decoder_layer_types):
            layers.append(module)
    
    if layers:
        n_layers = len(set(layers))
        if n_layers > 0:
            logger.info(f"Detected {n_layers} layers by module type matching")
            return n_layers
    
    # Fail fast if cannot detect layers
    raise ValueError(
        f"Could not automatically detect the number of layers for model {type(model).__name__}. "
        "Please check the model architecture and update the detection logic."
    )