File size: 7,331 Bytes
bc6498b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
#!/usr/bin/env python3
"""Real LLM compression and energy test - Phase 4"""
import time
import json
import torch
import numpy as np
import psutil
import os
def get_model_size(model):
    """Return the in-memory size of ``model``'s tensors in megabytes.

    The size is the sum of ``nelement() * element_size()`` over:

    * every tensor yielded by ``model.parameters()``,
    * every tensor yielded by ``model.buffers()``,
    * every tensor returned by a sub-module's ``weight()`` / ``bias()``
      *methods*.

    The last group exists because quantized modules (such as the Linear
    layers produced by ``torch.quantization.quantize_dynamic``) keep their
    weights in packed storage that is registered neither as a parameter nor
    as a buffer.  Without it, quantized layers would be counted as zero
    bytes and any compression ratio derived from this function would be
    inflated.  Quantized module types that do not expose ``weight()`` /
    ``bias()`` accessors are still not counted.

    Args:
        model: A ``torch.nn.Module``.

    Returns:
        float: Total number of bytes divided by 1024 * 1024.
    """
    def tensor_bytes(tensor):
        return tensor.nelement() * tensor.element_size()

    param_size = sum(tensor_bytes(param) for param in model.parameters())
    buffer_size = sum(tensor_bytes(buffer) for buffer in model.buffers())

    packed_size = 0
    for module in model.modules():
        for accessor_name in ('weight', 'bias'):
            accessor = getattr(module, accessor_name, None)
            # On regular modules ``weight`` / ``bias`` are tensors (or None)
            # and are already covered by parameters(); tensors are not
            # callable, so only accessor *methods* get past this check.
            if not callable(accessor):
                continue
            tensor = accessor()
            if isinstance(tensor, torch.Tensor):
                packed_size += tensor_bytes(tensor)

    return (param_size + buffer_size + packed_size) / 1024 / 1024  # MB
def measure_inference_speed(model, tokenizer, prompts, device='cpu', max_new_tokens=20):
    """Time greedy generation over ``prompts`` and report throughput.

    The model is put in eval mode and each prompt is tokenized and generated
    one at a time under ``torch.no_grad()``.  Tokenization is part of the
    timed region.  The model itself is NOT moved; the caller must already
    have placed it on ``device`` (only the tokenized inputs are moved).

    Args:
        model: Object exposing ``eval()`` and a Hugging Face style
            ``generate(**inputs, max_new_tokens=..., do_sample=...,
            pad_token_id=...)`` that returns a 2-D tensor.
        tokenizer: Callable returning a mapping with a ``.to(device)``
            method; must expose ``pad_token_id``.
        prompts: Iterable of prompt strings.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Generation budget per prompt (default 20).

    Returns:
        dict: ``total_tokens`` (sum of ``outputs.shape[1]`` over all
        prompts), ``time_seconds`` (elapsed time) and ``tokens_per_second``
        (0.0 when no time elapsed).

    NOTE(review): ``outputs.shape[1]`` is the full output sequence length;
    for decoder-only models that presumably includes the prompt tokens, so
    the throughput is not "generated tokens only" -- confirm before
    comparing against external numbers.
    """
    def _sync():
        # CUDA kernels run asynchronously; wait for them so the timer
        # brackets the real work rather than just the kernel launches.
        if str(device).startswith('cuda') and torch.cuda.is_available():
            torch.cuda.synchronize()

    model.eval()
    total_tokens = 0

    _sync()
    # perf_counter is monotonic and high resolution, unlike time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors='pt', padding=True).to(device)
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )
            total_tokens += outputs.shape[1]
    _sync()
    inference_time = time.perf_counter() - start_time

    # Guard against a zero-length interval (e.g. empty ``prompts`` on a
    # coarse clock) instead of raising ZeroDivisionError.
    tokens_per_second = total_tokens / inference_time if inference_time > 0 else 0.0

    return {
        'total_tokens': total_tokens,
        'time_seconds': inference_time,
        'tokens_per_second': tokens_per_second,
    }
def _benchmark_variant(model, dtype_label, tokenizer, prompts, device):
    """Measure one model variant: tensor size first, then generation speed."""
    size_mb = get_model_size(model)
    speed = measure_inference_speed(model, tokenizer, prompts, device)
    return {
        'size_mb': size_mb,
        'dtype': dtype_label,
        # Recorded so consumers can tell which variants shared hardware.
        'device': device,
        **speed,
    }


def _release_gpu_cache(device):
    """Release cached CUDA memory after a model on ``device`` was dropped."""
    if device == 'cuda':
        torch.cuda.empty_cache()


def run_real_compression_test():
    """Benchmark distilgpt2 in FP32, FP16 and dynamically quantized INT8.

    For each variant the model is loaded, its size is measured with
    ``get_model_size`` and its generation speed with
    ``measure_inference_speed`` on a fixed set of five prompts; the model is
    released before the next variant is loaded.

    FP32 and FP16 run on CUDA when available, otherwise on CPU.  The INT8
    variant is produced with ``torch.quantization.quantize_dynamic`` and is
    always run on CPU, so on a CUDA machine the INT8 speed and energy
    figures are NOT on the same hardware as the FP32 baseline.  Each
    variant's entry carries a ``device`` key so this is visible in the
    results.

    The "energy" numbers are a proxy (elapsed time multiplied by model
    size), not a measurement.

    Returns:
        dict: Keys ``fp32``, ``fp16``, ``int8`` (per-variant measurements),
        ``compression_ratios``, ``speedup_ratios``, ``energy_estimates`` and
        ``acceptance_criteria``.  ``criteria_met`` is true when either the
        4x compression or the 40% energy-reduction criterion passes.
    """
    print("="*70)
    print(" "*20 + "REAL LLM COMPRESSION TEST")
    print("="*70)

    # Use a smaller model that will actually download
    from transformers import AutoTokenizer, AutoModelForCausalLM

    model_name = "distilgpt2"  # 82M params, ~320MB
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"\n📥 Loading {model_name} model...")
    print(f"Device: {device}")

    # Test prompts
    test_prompts = [
        "The future of artificial intelligence is",
        "Quantum computers will revolutionize",
        "Energy efficiency in computing means",
        "Machine learning algorithms can",
        "The next breakthrough in technology",
    ]

    results = {}

    # 1. Baseline FP32 Model
    print("\n🔵 Testing FP32 baseline model...")
    model_fp32 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The tokenizer is given the EOS token as its padding token.
    tokenizer.pad_token = tokenizer.eos_token
    results['fp32'] = _benchmark_variant(
        model_fp32, 'float32', tokenizer, test_prompts, device
    )
    del model_fp32
    _release_gpu_cache(device)

    # 2. FP16 Model
    print("\n🟢 Testing FP16 model...")
    model_fp16 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    ).to(device)
    results['fp16'] = _benchmark_variant(
        model_fp16, 'float16', tokenizer, test_prompts, device
    )
    del model_fp16
    _release_gpu_cache(device)

    # 3. INT8 Quantization (dynamic quantization of nn.Linear layers).
    # The model is deliberately left on CPU for this variant.
    print("\n🟡 Testing INT8 quantized model...")
    model_int8 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
    )
    model_int8 = torch.quantization.quantize_dynamic(
        model_int8,
        {torch.nn.Linear},
        dtype=torch.qint8,
    )
    results['int8'] = _benchmark_variant(
        model_int8, 'int8', tokenizer, test_prompts, 'cpu'
    )
    # Release the last model too, so it does not outlive the benchmark.
    del model_int8

    fp32, fp16, int8 = results['fp32'], results['fp16'], results['int8']

    # Calculate improvements
    results['compression_ratios'] = {
        'fp32_to_fp16': fp32['size_mb'] / fp16['size_mb'],
        'fp32_to_int8': fp32['size_mb'] / int8['size_mb'],
        'fp16_to_int8': fp16['size_mb'] / int8['size_mb'],
    }
    results['speedup_ratios'] = {
        'fp16_vs_fp32': fp16['tokens_per_second'] / fp32['tokens_per_second'],
        'int8_vs_fp32': int8['tokens_per_second'] / fp32['tokens_per_second'],
    }

    # Energy estimation (based on time and model size)
    # Simplified: Energy ∝ time × model_size
    baseline_energy = fp32['time_seconds'] * fp32['size_mb']
    fp16_energy = fp16['time_seconds'] * fp16['size_mb']
    int8_energy = int8['time_seconds'] * int8['size_mb']
    fp16_relative = fp16_energy / baseline_energy
    int8_relative = int8_energy / baseline_energy
    fp16_reduction = (1 - fp16_relative) * 100
    int8_reduction = (1 - int8_relative) * 100
    results['energy_estimates'] = {
        'fp32_relative': 1.0,
        'fp16_relative': fp16_relative,
        'int8_relative': int8_relative,
        'fp16_reduction_percent': fp16_reduction,
        'int8_reduction_percent': int8_reduction,
    }

    # Check acceptance criteria: passing either one is enough.
    compression_4x = max(results['compression_ratios'].values()) >= 4.0
    energy_reduction_40 = max(fp16_reduction, int8_reduction) >= 40.0
    results['acceptance_criteria'] = {
        'compression_4x': compression_4x,
        'energy_reduction_40': energy_reduction_40,
        'criteria_met': compression_4x or energy_reduction_40,
    }

    return results
if __name__ == "__main__":
    # Script entry point: run the benchmark, print a summary, and persist
    # the raw results as JSON.
    print("\n🔬 Starting REAL LLM Compression Test...")

    # Run the test
    results = run_real_compression_test()

    # Display results
    print("\n" + "="*70)
    print(" "*25 + "RESULTS")
    print("="*70)

    variants = ['fp32', 'fp16', 'int8']

    print("\n📊 Model Sizes:")
    for variant in variants:
        if variant in results:
            print(f" {variant:5}: {results[variant]['size_mb']:>8.1f} MB")

    print("\n⚡ Inference Speed:")
    for variant in variants:
        if variant in results:
            print(f" {variant:5}: {results[variant]['tokens_per_second']:>8.1f} tokens/sec")

    print("\n📉 Compression Ratios:")
    for key, value in results['compression_ratios'].items():
        print(f" {key}: {value:.2f}x")

    energy = results['energy_estimates']
    print("\n🔋 Energy Reduction Estimates:")
    print(f" FP16: {energy['fp16_reduction_percent']:.1f}%")
    print(f" INT8: {energy['int8_reduction_percent']:.1f}%")

    criteria = results['acceptance_criteria']
    print("\n✅ Acceptance Criteria:")
    print(f" 4x Compression: {'PASS' if criteria['compression_4x'] else 'FAIL'}")
    print(f" 40% Energy Reduction: {'PASS' if criteria['energy_reduction_40'] else 'FAIL'}")

    # Save results. The path is defined once so the file written and the
    # message printed cannot drift apart.
    output_dir = "phase4_outputs"
    output_path = f"{output_dir}/real_llm_results.json"
    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Results saved to {output_path}")
    print("="*70)