Slower than Qwen2.5 on A100 40GB

#10
by ambivalent02 - opened

====================================================================================================
BENCHMARK RESULTS - Qwen/Qwen2.5-0.5B-Instruct

Batch Input Output Throughput TTFT Time Memory GPU Mem

1      256      128    17.62 tok/s 4152.1 ms   7.26s    957.7 MB    962.5 MB
1      256      256    17.77 tok/s 8151.1 ms  14.41s    958.5 MB    962.5 MB
1      512      128    17.60 tok/s 4180.4 ms   7.27s    964.7 MB    974.4 MB
1      512      256    17.76 tok/s 8158.8 ms  14.42s    961.6 MB    974.4 MB
1     1024      128    17.50 tok/s 4160.0 ms   7.31s    986.5 MB    999.8 MB
1     1024      256    17.64 tok/s 8234.0 ms  14.52s    976.0 MB    999.8 MB

====================================================================================================
BENCHMARK RESULTS - openbmb/MiniCPM4-0.5B

Batch Input Output Throughput TTFT Time Memory GPU Mem

1      256      128    13.66 tok/s 4713.0 ms   9.37s   1226.4 MB   1230.8 MB
1      256      256    13.98 tok/s 9131.9 ms  18.32s   1226.2 MB   1230.8 MB
1      512      128    13.74 tok/s 4670.8 ms   9.31s   1234.1 MB   1243.2 MB
1      512      256    13.89 tok/s 9164.9 ms  18.43s   1236.1 MB   1243.2 MB
1     1024      128    13.74 tok/s 4651.2 ms   9.31s   1250.3 MB   1263.8 MB
1     1024      256    13.84 tok/s 9272.9 ms  18.50s   1244.3 MB   1263.8 MB

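As a quick sanity check on how the table values relate, the script computes throughput as batch_size * output_length divided by the averaged total time, and the reported numbers are self-consistent with that formula:

# Sanity check: reproduce the reported throughput from the first row of each table,
# using the same formula as the script (tokens_generated / avg_time).
rows = [
    ("Qwen2.5-0.5B-Instruct", 1, 128, 7.26),  # batch, output tokens, total seconds
    ("MiniCPM4-0.5B",         1, 128, 9.37),
]
for name, batch, out_tokens, total_s in rows:
    print(f"{name}: {batch * out_tokens / total_s:.2f} tok/s")
# -> ~17.6 tok/s and ~13.7 tok/s, matching the tables above.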
I ran each configuration 3 times and it is still not more efficient than Qwen2.5. Is there any bug here?
Here is my benchmark script:

import torch
import torch.cuda
from transformers import AutoModelForCausalLM, AutoTokenizer
try:
    from transformers.models.lfm2.modeling_lfm2 import Lfm2ForCausalLM
except ImportError:
    # LFM2 may not be available in this transformers version
    Lfm2ForCausalLM = None
    print("No lfm2")
import time
import psutil
import gc
from typing import Dict, List, Tuple, Optional
import numpy as np
import json
from dataclasses import dataclass, asdict
import threading
import queue
from contextlib import contextmanager
import argparse

@dataclass
class BenchmarkConfig:
    model_name: str
    batch_sizes: List[int]
    input_lengths: List[int]
    output_lengths: List[int]
    num_runs: int = 5
    warmup_runs: int = 2
    device: str = "auto"
    torch_dtype: str = "auto"
    use_cache: bool = True
    do_sample: bool = False
    temperature: float = 1.0

@dataclass
class BenchmarkResult:
    model_name: str
    batch_size: int
    input_length: int
    output_length: int
    throughput_tokens_per_sec: float
    ttft_ms: float
    total_time_sec: float
    peak_memory_mb: float
    gpu_memory_mb: float
    tokens_generated: int

class MemoryTracker:
    def __init__(self, device="cuda"):
        self.device = device
        self.peak_memory = 0
        self.monitoring = False
        self.memory_queue = queue.Queue()

    def _monitor_memory(self):
        while self.monitoring:
            if self.device == "cuda" and torch.cuda.is_available():
                current_memory = torch.cuda.memory_allocated() / (1024**2)  # MB
            else:
                current_memory = psutil.Process().memory_info().rss / (1024**2)  # MB

            self.peak_memory = max(self.peak_memory, current_memory)
            time.sleep(0.01)  # Check every 10ms

    @contextmanager
    def track(self):
        self.peak_memory = 0
        self.monitoring = True
        monitor_thread = threading.Thread(target=self._monitor_memory)
        monitor_thread.start()

        try:
            yield self
        finally:
            self.monitoring = False
            monitor_thread.join()

class LLMBenchmarker:
    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.device = self._setup_device()
        self.model = None
        self.tokenizer = None
        self.results = []

    def _setup_device(self):
        if self.config.device == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        return self.config.device

    def _get_torch_dtype(self):
        if self.config.torch_dtype == "auto":
            return torch.bfloat16 if self.device == "cuda" else torch.float32
        return getattr(torch, self.config.torch_dtype)

    def load_model(self):
        """Load model and tokenizer with optimizations."""
        print(f"Loading model: {self.config.model_name}")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_name,
            padding_side="left"
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with optimizations
        torch_dtype = self._get_torch_dtype()
        if "lfm" not in self.config.model_name.lower():
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                use_cache=self.config.use_cache,
                attn_implementation='flash_attention_2'
            )
        else:
            self.model = Lfm2ForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                use_cache=self.config.use_cache,
                attn_implementation='flash_attention_2'
            )

        if self.device != "cuda":
            self.model = self.model.to(self.device)

        self.model.eval()

        # Compile model for PyTorch 2.0+ if available
        # (torch.compile is lazy: compilation happens on the first generate() calls,
        # which the warmup runs are meant to absorb)
        try:
            if hasattr(torch, 'compile'):
                print("Compiling model with torch.compile...")
                self.model = torch.compile(self.model)
        except Exception as e:
            print(f"Could not compile model: {e}")

    def _generate_sample_inputs(self, batch_size: int, input_length: int) -> Dict[str, torch.Tensor]:
        """Generate realistic sample inputs."""
        sample_prompts = [
            "The future of artificial intelligence is",
            "In a world where technology advances rapidly",
            "Climate change poses significant challenges that require",
            "The integration of machine learning in healthcare",
            "Quantum computing represents a paradigm shift in"
        ]

        # Cycle through prompts to create batch
        prompts = [sample_prompts[i % len(sample_prompts)] for i in range(batch_size)]

        # Tokenize and pad to desired input length
        encoded = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=input_length
        )

        # Ensure exact input length by padding or truncating
        # (the padding below is appended on the right with attention_mask=0, so the
        # real prompt content stays the same short sentence for every input_length)
        current_length = encoded['input_ids'].shape[1]
        if current_length < input_length:
            # Pad with pad tokens
            pad_length = input_length - current_length
            pad_token_id = self.tokenizer.pad_token_id
            padding = torch.full((batch_size, pad_length), pad_token_id, dtype=torch.long)
            encoded['input_ids'] = torch.cat([encoded['input_ids'], padding], dim=1)
            encoded['attention_mask'] = torch.cat([
                encoded['attention_mask'],
                torch.zeros(batch_size, pad_length, dtype=torch.long)
            ], dim=1)
        elif current_length > input_length:
            # Truncate
            encoded['input_ids'] = encoded['input_ids'][:, :input_length]
            encoded['attention_mask'] = encoded['attention_mask'][:, :input_length]

        return {k: v.to(self.device) for k, v in encoded.items()}

    def _benchmark_single_configuration(self, batch_size: int, input_length: int, output_length: int) -> BenchmarkResult:
        print(f"Benchmarking: batch_size={batch_size}, input_len={input_length}, output_len={output_length}")

        inputs = self._generate_sample_inputs(batch_size, input_length)

        # Warmup runs
        for _ in range(self.config.warmup_runs):
            with torch.no_grad():
                self.model.generate(**inputs, max_new_tokens=min(output_length, 10))

        if self.device == "cuda":
            torch.cuda.synchronize()
        gc.collect()

        times = []
        ttft_times = []
        memory_tracker = MemoryTracker(self.device)

        for run in range(self.config.num_runs):
            if self.device == "cuda":
                torch.cuda.empty_cache()

            with memory_tracker.track():
                with torch.no_grad():
                    # Single generation call with callback to capture TTFT
                    start_time = time.perf_counter()
                    first_token_time = None

                    # (defined but never passed to generate(), so it is never called)
                    def first_token_callback(input_ids, **kwargs):
                        nonlocal first_token_time
                        if first_token_time is None:
                            if self.device == "cuda":
                                torch.cuda.synchronize()
                            first_token_time = time.perf_counter() - start_time

                    # Generate with streaming to capture first token timing
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=output_length,
                        do_sample=self.config.do_sample,
                        temperature=self.config.temperature,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=None,
                        # Use a simple approach: generate tokens one by one for first token
                    )

                    # Alternative simpler approach - measure first token separately but correctly
                    # Generate first token
                    first_outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=1,
                        do_sample=self.config.do_sample,
                        temperature=self.config.temperature,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=None,
                        use_cache=True
                    )

                    if self.device == "cuda":
                        torch.cuda.synchronize()
                    # Note: start_time was taken before the full-length generate() above,
                    # so this "TTFT" also covers that entire generation, not just the first token.
                    ttft_time = time.perf_counter() - start_time

                    # Now continue generation from where we left off
                    if output_length > 1:
                        remaining_start = time.perf_counter()
                        # Update inputs to include the first generated token
                        updated_inputs = {
                            'input_ids': first_outputs,
                            'attention_mask': torch.ones_like(first_outputs)
                        }

                        remaining_outputs = self.model.generate(
                            **updated_inputs,
                            max_new_tokens=output_length - 1,
                            do_sample=self.config.do_sample,
                            temperature=self.config.temperature,
                            pad_token_id=self.tokenizer.pad_token_id,
                            eos_token_id=None,
                            use_cache=True
                        )

                        if self.device == "cuda":
                            torch.cuda.synchronize()

                        remaining_time = time.perf_counter() - remaining_start
                        total_time = ttft_time + remaining_time
                    else:
                        total_time = ttft_time

                    ttft_times.append(ttft_time * 1000)  # Convert to ms
                    times.append(total_time)

        # Rest of the calculation remains the same
        avg_time = np.mean(times)
        avg_ttft = np.mean(ttft_times)
        tokens_generated = batch_size * output_length
        throughput = tokens_generated / avg_time

        # Get GPU memory if available
        gpu_memory = 0
        if self.device == "cuda" and torch.cuda.is_available():
            gpu_memory = torch.cuda.max_memory_allocated() / (1024**2)

        return BenchmarkResult(
            model_name=self.config.model_name,
            batch_size=batch_size,
            input_length=input_length,
            output_length=output_length,
            throughput_tokens_per_sec=throughput,
            ttft_ms=avg_ttft,
            total_time_sec=avg_time,
            peak_memory_mb=memory_tracker.peak_memory,
            gpu_memory_mb=gpu_memory,
            tokens_generated=tokens_generated
        )

    def run_benchmark(self) -> List[BenchmarkResult]:
        """Run complete benchmark suite."""
        if self.model is None:
            self.load_model()

        print(f"Starting benchmark on device: {self.device}")
        print(f"Model dtype: {next(self.model.parameters()).dtype}")

        total_configs = (len(self.config.batch_sizes) *
                         len(self.config.input_lengths) *
                         len(self.config.output_lengths))
        current_config = 0

        for batch_size in self.config.batch_sizes:
            for input_length in self.config.input_lengths:
                for output_length in self.config.output_lengths:
                    current_config += 1
                    print(f"\nProgress: {current_config}/{total_configs}")

                    try:
                        result = self._benchmark_single_configuration(
                            batch_size, input_length, output_length
                        )
                        self.results.append(result)

                        # Print immediate results
                        print(f"  Throughput: {result.throughput_tokens_per_sec:.2f} tokens/sec")
                        print(f"  TTFT: {result.ttft_ms:.2f} ms")
                        print(f"  Peak Memory: {result.peak_memory_mb:.2f} MB")

                    except Exception as e:
                        print(f"  Error in configuration: {e}")
                        continue

        return self.results

    def print_results(self):
        """Print formatted benchmark results."""
        if not self.results:
            print("No results to display.")
            return

        print("\n" + "="*100)
        print(f"BENCHMARK RESULTS - {self.config.model_name}")
        print("="*100)

        # Table header
        header = f"{'Batch':>5} {'Input':>8} {'Output':>8} {'Throughput':>12} {'TTFT':>8} {'Time':>8} {'Memory':>10} {'GPU Mem':>10}"
        print(header)
        print("-" * len(header))

        # Results
        for result in self.results:
            print(f"{result.batch_size:>5} "
                  f"{result.input_length:>8} "
                  f"{result.output_length:>8} "
                  f"{result.throughput_tokens_per_sec:>8.2f} tok/s "
                  f"{result.ttft_ms:>6.1f} ms "
                  f"{result.total_time_sec:>6.2f}s "
                  f"{result.peak_memory_mb:>8.1f} MB "
                  f"{result.gpu_memory_mb:>8.1f} MB")

        # Summary statistics
        print("\nSUMMARY:")
        throughputs = [r.throughput_tokens_per_sec for r in self.results]
        ttfts = [r.ttft_ms for r in self.results]

        print(f"Average Throughput: {np.mean(throughputs):.2f} tokens/sec")
        print(f"Max Throughput: {np.max(throughputs):.2f} tokens/sec")
        print(f"Average TTFT: {np.mean(ttfts):.2f} ms")
        print(f"Min TTFT: {np.min(ttfts):.2f} ms")

    def save_results(self, filename: str):
        """Save results to JSON file."""
        results_dict = [asdict(result) for result in self.results]
        with open(filename, 'w') as f:
            json.dump(results_dict, f, indent=2)
        print(f"Results saved to {filename}")

def main():
    import sys
    # Example configurations for different model sizes
    configs = [
        # Small model for quick testing
        BenchmarkConfig(
            model_name=sys.argv[1],
            batch_sizes=[1],
            input_lengths=[256, 512, 1024],
            output_lengths=[128, 256],
            num_runs=3
        ),

        # Medium model - uncomment if you have sufficient resources
        # BenchmarkConfig(
        #     model_name="HuggingFaceTB/SmolLM-135M",
        #     batch_sizes=[1, 2, 4],
        #     input_lengths=[512, 1024],
        #     output_lengths=[128, 256],
        #     num_runs=3
        # ),

        # Large model - uncomment if you have high-end GPU
        # BenchmarkConfig(
        #     model_name="huggingface/CodeBERTa-small-v1",
        #     batch_sizes=[1],
        #     input_lengths=[256, 512],
        #     output_lengths=[100, 256],
        #     num_runs=2
        # )
    ]

    for i, config in enumerate(configs):
        print(f"\n{'='*60}")
        print(f"Running benchmark {i+1}/{len(configs)}: {config.model_name}")
        print('='*60)

        benchmarker = LLMBenchmarker(config)

        try:
            results = benchmarker.run_benchmark()
            benchmarker.print_results()

            # Save results
            safe_model_name = config.model_name.replace('/', '_')
            # if config.input_lengths[0] > 2048:
            #     filename = f"trash_{safe_model_name}.json"
            # else:
            filename = f"benchmark_results_{safe_model_name}.json"
            benchmarker.save_results(filename)

        except Exception as e:
            print(f"Error running benchmark for {config.model_name}: {e}")
            continue

        # Clean up
        del benchmarker.model
        del benchmarker.tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

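For reference, the script takes the model name as its only command-line argument, so the two runs above would have been launched roughly like this (the filename benchmark.py is just a placeholder):

python benchmark.py Qwen/Qwen2.5-0.5B-Instruct
python benchmark.py openbmb/MiniCPM4-0.5B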
I want to discuss this issue, thanks.
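If it helps the discussion, here is a minimal sketch of timing a single generate() call end to end and deriving throughput from it. It assumes a CUDA GPU; the model name and prompt are taken from the tables and script above, and greedy decoding plus bfloat16 mirror the script's defaults, but this is only a simplified sketch, not the script's exact method.

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")
model.eval()

inputs = tokenizer("The future of artificial intelligence is", return_tensors="pt").to(model.device)

with torch.no_grad():
    # Warmup so lazy initialization is not included in the measurement
    model.generate(**inputs, max_new_tokens=8, do_sample=False)
    torch.cuda.synchronize()

    start = time.perf_counter()
    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

# Count only newly generated tokens (generate() returns prompt + continuation)
new_tokens = out.shape[1] - inputs["input_ids"].shape[1]
print(f"{new_tokens} new tokens in {elapsed:.2f}s -> {new_tokens / elapsed:.2f} tok/s")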
