MiniCPM4-0.5B is slower than Qwen2.5-0.5B on an A100 40GB
====================================================================================================
BENCHMARK RESULTS - Qwen/Qwen2.5-0.5B-Instruct
====================================================================================================
Batch  Input  Output  Throughput   TTFT       Time    Memory     GPU Mem
    1    256     128  17.62 tok/s  4152.1 ms   7.26s   957.7 MB   962.5 MB
    1    256     256  17.77 tok/s  8151.1 ms  14.41s   958.5 MB   962.5 MB
    1    512     128  17.60 tok/s  4180.4 ms   7.27s   964.7 MB   974.4 MB
    1    512     256  17.76 tok/s  8158.8 ms  14.42s   961.6 MB   974.4 MB
    1   1024     128  17.50 tok/s  4160.0 ms   7.31s   986.5 MB   999.8 MB
    1   1024     256  17.64 tok/s  8234.0 ms  14.52s   976.0 MB   999.8 MB
====================================================================================================
BENCHMARK RESULTS - openbmb/MiniCPM4-0.5B
====================================================================================================
Batch  Input  Output  Throughput   TTFT       Time    Memory     GPU Mem
    1    256     128  13.66 tok/s  4713.0 ms   9.37s  1226.4 MB  1230.8 MB
    1    256     256  13.98 tok/s  9131.9 ms  18.32s  1226.2 MB  1230.8 MB
    1    512     128  13.74 tok/s  4670.8 ms   9.31s  1234.1 MB  1243.2 MB
    1    512     256  13.89 tok/s  9164.9 ms  18.43s  1236.1 MB  1243.2 MB
    1   1024     128  13.74 tok/s  4651.2 ms   9.31s  1250.3 MB  1263.8 MB
    1   1024     256  13.84 tok/s  9272.9 ms  18.50s  1244.3 MB  1263.8 MB
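Quick sanity check on the tables: 256 / 14.41 s ≈ 17.8 tok/s and 256 / 18.32 s ≈ 14.0 tok/s, so the Throughput column is just output tokens divided by the reported total Time.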
I ran each configuration 3 times and MiniCPM4 does not come out more efficient than Qwen2.5. Is there a bug here?
Here is my benchmark script:
import torch
import torch.cuda
from transformers import AutoModelForCausalLM, AutoTokenizer
try:
    from transformers.models.lfm2.modeling_lfm2 import Lfm2ForCausalLM
except ImportError:
    print("No lfm2")
import time
import psutil
import gc
from typing import Dict, List, Tuple, Optional
import numpy as np
import json
from dataclasses import dataclass, asdict
import threading
import queue
from contextlib import contextmanager
import argparse
@dataclass
class BenchmarkConfig:
    model_name: str
    batch_sizes: List[int]
    input_lengths: List[int]
    output_lengths: List[int]
    num_runs: int = 5
    warmup_runs: int = 2
    device: str = "auto"
    torch_dtype: str = "auto"
    use_cache: bool = True
    do_sample: bool = False
    temperature: float = 1.0


@dataclass
class BenchmarkResult:
    model_name: str
    batch_size: int
    input_length: int
    output_length: int
    throughput_tokens_per_sec: float
    ttft_ms: float
    total_time_sec: float
    peak_memory_mb: float
    gpu_memory_mb: float
    tokens_generated: int
class MemoryTracker:
    def __init__(self, device="cuda"):
        self.device = device
        self.peak_memory = 0
        self.monitoring = False
        self.memory_queue = queue.Queue()

    def _monitor_memory(self):
        # Poll memory usage in a background thread and record the peak
        while self.monitoring:
            if self.device == "cuda" and torch.cuda.is_available():
                current_memory = torch.cuda.memory_allocated() / (1024**2)  # MB
            else:
                current_memory = psutil.Process().memory_info().rss / (1024**2)  # MB
            self.peak_memory = max(self.peak_memory, current_memory)
            time.sleep(0.01)  # Check every 10ms

    @contextmanager
    def track(self):
        self.peak_memory = 0
        self.monitoring = True
        monitor_thread = threading.Thread(target=self._monitor_memory)
        monitor_thread.start()
        try:
            yield self
        finally:
            self.monitoring = False
            monitor_thread.join()
class LLMBenchmarker:
    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.device = self._setup_device()
        self.model = None
        self.tokenizer = None
        self.results = []

    def _setup_device(self):
        if self.config.device == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        return self.config.device

    def _get_torch_dtype(self):
        if self.config.torch_dtype == "auto":
            return torch.bfloat16 if self.device == "cuda" else torch.float32
        return getattr(torch, self.config.torch_dtype)

    def load_model(self):
        """Load model and tokenizer with optimizations."""
        print(f"Loading model: {self.config.model_name}")
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_name,
            padding_side="left"
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load model with optimizations
        torch_dtype = self._get_torch_dtype()
        if "lfm" not in self.config.model_name.lower():
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                use_cache=self.config.use_cache,
                attn_implementation='flash_attention_2'
            )
        else:
            self.model = Lfm2ForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                use_cache=self.config.use_cache,
                attn_implementation='flash_attention_2'
            )
        if self.device != "cuda":
            self.model = self.model.to(self.device)
        self.model.eval()
        # Compile model for PyTorch 2.0+ if available
        try:
            if hasattr(torch, 'compile'):
                print("Compiling model with torch.compile...")
                self.model = torch.compile(self.model)
        except Exception as e:
            print(f"Could not compile model: {e}")
    def _generate_sample_inputs(self, batch_size: int, input_length: int) -> Dict[str, torch.Tensor]:
        """Generate realistic sample inputs."""
        sample_prompts = [
            "The future of artificial intelligence is",
            "In a world where technology advances rapidly",
            "Climate change poses significant challenges that require",
            "The integration of machine learning in healthcare",
            "Quantum computing represents a paradigm shift in"
        ]
        # Cycle through prompts to create batch
        prompts = [sample_prompts[i % len(sample_prompts)] for i in range(batch_size)]
        # Tokenize and pad to desired input length
        encoded = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=input_length
        )
        # Ensure exact input length by padding or truncating
        current_length = encoded['input_ids'].shape[1]
        if current_length < input_length:
            # Pad with pad tokens
            pad_length = input_length - current_length
            pad_token_id = self.tokenizer.pad_token_id
            padding = torch.full((batch_size, pad_length), pad_token_id, dtype=torch.long)
            encoded['input_ids'] = torch.cat([encoded['input_ids'], padding], dim=1)
            encoded['attention_mask'] = torch.cat([
                encoded['attention_mask'],
                torch.zeros(batch_size, pad_length, dtype=torch.long)
            ], dim=1)
        elif current_length > input_length:
            # Truncate
            encoded['input_ids'] = encoded['input_ids'][:, :input_length]
            encoded['attention_mask'] = encoded['attention_mask'][:, :input_length]
        return {k: v.to(self.device) for k, v in encoded.items()}
    def _benchmark_single_configuration(self, batch_size: int, input_length: int, output_length: int) -> BenchmarkResult:
        print(f"Benchmarking: batch_size={batch_size}, input_len={input_length}, output_len={output_length}")
        inputs = self._generate_sample_inputs(batch_size, input_length)
        # Warmup runs
        for _ in range(self.config.warmup_runs):
            with torch.no_grad():
                self.model.generate(**inputs, max_new_tokens=min(output_length, 10))
        if self.device == "cuda":
            torch.cuda.synchronize()
        gc.collect()
        times = []
        ttft_times = []
        memory_tracker = MemoryTracker(self.device)
        for run in range(self.config.num_runs):
            if self.device == "cuda":
                torch.cuda.empty_cache()
            with memory_tracker.track():
                with torch.no_grad():
                    # Single generation call with callback to capture TTFT
                    start_time = time.perf_counter()
                    first_token_time = None

                    def first_token_callback(input_ids, **kwargs):
                        # NOTE: this callback is never passed to generate(), so it never runs
                        nonlocal first_token_time
                        if first_token_time is None:
                            if self.device == "cuda":
                                torch.cuda.synchronize()
                            first_token_time = time.perf_counter() - start_time

                    # Full generation pass (output unused; its runtime is counted in
                    # ttft_time below because start_time was taken before this call)
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=output_length,
                        do_sample=self.config.do_sample,
                        temperature=self.config.temperature,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=None,
                        # Use a simple approach: generate tokens one by one for first token
                    )
                    # Alternative simpler approach - measure first token separately but correctly
                    # Generate first token
                    first_outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=1,
                        do_sample=self.config.do_sample,
                        temperature=self.config.temperature,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=None,
                        use_cache=True
                    )
                    if self.device == "cuda":
                        torch.cuda.synchronize()
                    ttft_time = time.perf_counter() - start_time
                    # Now continue generation from where we left off
                    if output_length > 1:
                        remaining_start = time.perf_counter()
                        # Update inputs to include the first generated token
                        updated_inputs = {
                            'input_ids': first_outputs,
                            'attention_mask': torch.ones_like(first_outputs)
                        }
                        remaining_outputs = self.model.generate(
                            **updated_inputs,
                            max_new_tokens=output_length - 1,
                            do_sample=self.config.do_sample,
                            temperature=self.config.temperature,
                            pad_token_id=self.tokenizer.pad_token_id,
                            eos_token_id=None,
                            use_cache=True
                        )
                        if self.device == "cuda":
                            torch.cuda.synchronize()
                        remaining_time = time.perf_counter() - remaining_start
                        total_time = ttft_time + remaining_time
                    else:
                        total_time = ttft_time
            ttft_times.append(ttft_time * 1000)  # Convert to ms
            times.append(total_time)
        # Rest of the calculation remains the same
        avg_time = np.mean(times)
        avg_ttft = np.mean(ttft_times)
        tokens_generated = batch_size * output_length
        throughput = tokens_generated / avg_time
        # Get GPU memory if available
        gpu_memory = 0
        if self.device == "cuda" and torch.cuda.is_available():
            gpu_memory = torch.cuda.max_memory_allocated() / (1024**2)
        return BenchmarkResult(
            model_name=self.config.model_name,
            batch_size=batch_size,
            input_length=input_length,
            output_length=output_length,
            throughput_tokens_per_sec=throughput,
            ttft_ms=avg_ttft,
            total_time_sec=avg_time,
            peak_memory_mb=memory_tracker.peak_memory,
            gpu_memory_mb=gpu_memory,
            tokens_generated=tokens_generated
        )
    def run_benchmark(self) -> List[BenchmarkResult]:
        """Run complete benchmark suite."""
        if self.model is None:
            self.load_model()
        print(f"Starting benchmark on device: {self.device}")
        print(f"Model dtype: {next(self.model.parameters()).dtype}")
        total_configs = (len(self.config.batch_sizes) *
                         len(self.config.input_lengths) *
                         len(self.config.output_lengths))
        current_config = 0
        for batch_size in self.config.batch_sizes:
            for input_length in self.config.input_lengths:
                for output_length in self.config.output_lengths:
                    current_config += 1
                    print(f"\nProgress: {current_config}/{total_configs}")
                    try:
                        result = self._benchmark_single_configuration(
                            batch_size, input_length, output_length
                        )
                        self.results.append(result)
                        # Print immediate results
                        print(f" Throughput: {result.throughput_tokens_per_sec:.2f} tokens/sec")
                        print(f" TTFT: {result.ttft_ms:.2f} ms")
                        print(f" Peak Memory: {result.peak_memory_mb:.2f} MB")
                    except Exception as e:
                        print(f" Error in configuration: {e}")
                        continue
        return self.results

    def print_results(self):
        """Print formatted benchmark results."""
        if not self.results:
            print("No results to display.")
            return
        print("\n" + "=" * 100)
        print(f"BENCHMARK RESULTS - {self.config.model_name}")
        print("=" * 100)
        # Table header
        header = f"{'Batch':>5} {'Input':>8} {'Output':>8} {'Throughput':>12} {'TTFT':>8} {'Time':>8} {'Memory':>10} {'GPU Mem':>10}"
        print(header)
        print("-" * len(header))
        # Results
        for result in self.results:
            print(f"{result.batch_size:>5} "
                  f"{result.input_length:>8} "
                  f"{result.output_length:>8} "
                  f"{result.throughput_tokens_per_sec:>8.2f} tok/s "
                  f"{result.ttft_ms:>6.1f} ms "
                  f"{result.total_time_sec:>6.2f}s "
                  f"{result.peak_memory_mb:>8.1f} MB "
                  f"{result.gpu_memory_mb:>8.1f} MB")
        # Summary statistics
        print("\nSUMMARY:")
        throughputs = [r.throughput_tokens_per_sec for r in self.results]
        ttfts = [r.ttft_ms for r in self.results]
        print(f"Average Throughput: {np.mean(throughputs):.2f} tokens/sec")
        print(f"Max Throughput: {np.max(throughputs):.2f} tokens/sec")
        print(f"Average TTFT: {np.mean(ttfts):.2f} ms")
        print(f"Min TTFT: {np.min(ttfts):.2f} ms")

    def save_results(self, filename: str):
        """Save results to JSON file."""
        results_dict = [asdict(result) for result in self.results]
        with open(filename, 'w') as f:
            json.dump(results_dict, f, indent=2)
        print(f"Results saved to {filename}")
def main():
    import sys
    # Example configurations for different model sizes
    configs = [
        # Small model for quick testing
        BenchmarkConfig(
            model_name=sys.argv[1],
            batch_sizes=[1],
            input_lengths=[256, 512, 1024],
            output_lengths=[128, 256],
            num_runs=3
        ),
        # Medium model - uncomment if you have sufficient resources
        # BenchmarkConfig(
        #     model_name="HuggingFaceTB/SmolLM-135M",
        #     batch_sizes=[1, 2, 4],
        #     input_lengths=[512, 1024],
        #     output_lengths=[128, 256],
        #     num_runs=3
        # ),
        # Large model - uncomment if you have high-end GPU
        # BenchmarkConfig(
        #     model_name="huggingface/CodeBERTa-small-v1",
        #     batch_sizes=[1],
        #     input_lengths=[256, 512],
        #     output_lengths=[100, 256],
        #     num_runs=2
        # )
    ]
    for i, config in enumerate(configs):
        print(f"\n{'='*60}")
        print(f"Running benchmark {i+1}/{len(configs)}: {config.model_name}")
        print('=' * 60)
        benchmarker = LLMBenchmarker(config)
        try:
            results = benchmarker.run_benchmark()
            benchmarker.print_results()
            # Save results
            safe_model_name = config.model_name.replace('/', '_')
            # if config.input_lengths[0] > 2048:
            #     filename = f"trash_{safe_model_name}.json"
            # else:
            filename = f"benchmark_results_{safe_model_name}.json"
            benchmarker.save_results(filename)
        except Exception as e:
            print(f"Error running benchmark for {config.model_name}: {e}")
            continue
        # Clean up
        del benchmarker.model
        del benchmarker.tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
if __name__ == "__main__":
    main()
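One thing I am not fully sure about is my two-stage TTFT measurement, so as a cross-check I am considering timing the first token with a streamer instead. Below is a rough, untested sketch (the helper name measure_ttft_with_streamer is mine; it assumes transformers' TextIteratorStreamer and a model/tokenizer/inputs prepared exactly as in the script above):

import threading
import time

from transformers import TextIteratorStreamer


def measure_ttft_with_streamer(model, tokenizer, inputs, max_new_tokens=128):
    # Stream decoded text as it is produced; skip_prompt so the first yielded
    # chunk corresponds to the first generated token(s), not the echoed prompt.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )
    start = time.perf_counter()
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    # Time until the first chunk appears ~= TTFT (plus a little decode overhead).
    next(iter(streamer))
    ttft = time.perf_counter() - start
    # Drain the remaining chunks so generate() runs to completion.
    for _ in streamer:
        pass
    thread.join()
    total = time.perf_counter() - start
    return ttft, total

If that is a reasonable way to do it, I would call it once per configuration with the same inputs from _generate_sample_inputs() and compare against the numbers above (TextIteratorStreamer only handles batch size 1, which matches my configs).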
I would like to discuss this issue. Thanks!