Spaces:

scratchtoscale
/

training-time-calculator

Running

File size: 9,264 Bytes

32e471c

import gradio as gr
import csv
import os
import numpy as np

def load_gpu_data():
    """Read ``gpus.csv`` (located next to this file) into a name -> TFLOPs dict.

    Each row's ``gpu_model`` value has underscores replaced by spaces and is
    used as the key; the ``sparce_tflops`` column is parsed as a float and
    used as the value.

    Returns:
        The mapping built from the CSV. If anything goes wrong while reading
        or parsing, the error is printed and ``{"Custom": 0}`` is returned.
    """
    module_dir = os.path.dirname(__file__)
    source = os.path.join(module_dir, 'gpus.csv')

    try:
        with open(source, 'r') as handle:
            table = {
                record['gpu_model'].replace('_', ' '): float(record['sparce_tflops'])
                for record in csv.DictReader(handle)
            }
    except Exception as e:
        # Best-effort: any failure discards partial results and falls back.
        print(f"Error loading GPU data: {e}")
        return {"Custom": 0}

    return table

def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """
    Estimate how many hours a training run takes.

    The estimate divides the compute the run needs by the compute the
    cluster delivers:

    - required FLOPs  = 6 * parameters * tokens
    - delivered FLOPs/s = tflops_per_gpu * num_gpus * 10^12 * (MFU / 100)
    - hours = required / delivered / 3600

    Args:
        model_size_billions: Parameter count, in billions.
        tflops_per_gpu: Per-GPU throughput in TFLOPs (10^12 FLOPs/s).
        num_gpus: How many GPUs take part in training.
        tokens_millions: Training tokens, in millions.
        mfu_percentage: Model FLOPs Utilization, as a percentage.

    Returns:
        Training time in hours.
    """
    # Scale the inputs up to raw parameter and token counts.
    params = model_size_billions * 1e9
    tokens = tokens_millions * 1e6

    # Compute required for the whole run.
    required_flops = 6 * params * tokens

    # Sustained cluster throughput after applying utilization.
    utilization = mfu_percentage / 100
    delivered_flops_per_second = tflops_per_gpu * num_gpus * 1e12 * utilization

    seconds = required_flops / delivered_flops_per_second
    return seconds / 3600

def format_output(hours):
    """Render a duration in hours as hours, days, or months.

    Under 24 hours the value is shown in hours; under 30 days it is shown in
    days (with hours in parentheses); otherwise it is shown in 30-day months
    (with days and hours in parentheses).
    """
    if hours < 24:
        return f"{hours:.2f} hours"

    days = hours / 24
    if days < 30:
        return f"{days:.2f} days ({hours:.1f} hours)"

    months = days / 30
    return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"

def slider_to_model_size(value):
    """Map a 0-100 slider position onto a model size in billions.

    The mapping is logarithmic: position 0 corresponds to 0.1B and position
    100 corresponds to 1000B.
    """
    low_exp, high_exp = np.log10(0.1), np.log10(1000)  # -1 and 3
    exponent = low_exp + (high_exp - low_exp) * value / 100
    return 10 ** exponent

def model_size_to_slider(size_billions):
    """Map a model size in billions onto the 0-100 logarithmic slider scale.

    0.1B corresponds to position 0 and 1000B to position 100.
    """
    low_exp, high_exp = np.log10(0.1), np.log10(1000)
    offset = np.log10(size_billions) - low_exp
    return 100 * offset / (high_exp - low_exp)

def format_model_size(size_billions):
    """Format a model size given in billions as a short M/B/T label.

    Sizes are shown in whole millions ("500M"), billions with one decimal
    ("7.0B"), or trillions with one decimal ("1.5T").

    The unit is chosen from the *rounded* value, so a size that rounds up to
    the next unit is shown in that unit: 0.9996 becomes "1.0B" rather than
    "1000M", and 999.96 becomes "1.0T" rather than "1000.0B".

    Args:
        size_billions: Model size in billions of parameters.

    Returns:
        The formatted label.
    """
    # round(x, 0) keeps floats as floats, so NaN/inf fall through to the
    # last branch instead of raising.
    if round(size_billions * 1000, 0) < 1000:
        return f"{size_billions * 1000:.0f}M"
    if round(size_billions, 1) < 1000:
        return f"{size_billions:.1f}B"
    return f"{size_billions / 1000:.1f}T"

def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Build the Markdown breakdown and training-time estimate for the UI.

    Args:
        model_size_value: Model size expressed in ``model_size_unit``.
        model_size_unit: "B" for billions; any other value is treated as
            trillions.
        use_gpu_model: When true and ``gpu_model`` is not "Custom", the
            per-GPU TFLOPs are looked up in the table returned by
            ``load_gpu_data()``.
        gpu_model: Name of the selected GPU.
        custom_tflops: TFLOPs per GPU for the custom path; also the fallback
            when ``gpu_model`` is not found in the table.
        num_gpus: Number of GPUs.
        tokens_value: Token count expressed in ``tokens_unit``.
        tokens_unit: "M" or "B"; any other value is treated as trillions.
        mfu_percentage: Model FLOPs Utilization, in percent.

    Returns:
        A Markdown string: the calculation breakdown, or a short
        "Invalid input" message when a numeric input is missing or the
        effective throughput would not be positive.
    """
    # An empty numeric field can arrive as None; report it instead of
    # failing with a TypeError in the arithmetic below.
    required = {
        "Model Size": model_size_value,
        "Number of GPUs": num_gpus,
        "Training Tokens": tokens_value,
        "Model FLOPs Utilization (MFU) %": mfu_percentage,
    }
    missing = [label for label, value in required.items() if value is None]
    if missing:
        return f"### Invalid input\nPlease provide a value for: {', '.join(missing)}."

    # Convert model size to billions
    if model_size_unit == "B":
        model_size_billions = model_size_value
    else:  # T
        model_size_billions = model_size_value * 1000

    # Convert tokens to millions
    if tokens_unit == "M":
        tokens_millions = tokens_value
    elif tokens_unit == "B":
        tokens_millions = tokens_value * 1000
    else:  # T
        tokens_millions = tokens_value * 1000000

    # Determine TFLOPs value
    if use_gpu_model and gpu_model != "Custom":
        gpu_data = load_gpu_data()
        tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    # The time estimate divides by TFLOPs * GPUs * MFU, so each factor must
    # be positive to avoid a ZeroDivisionError or a meaningless result.
    if tflops_per_gpu is None or tflops_per_gpu <= 0 or num_gpus <= 0 or mfu_percentage <= 0:
        return "### Invalid input\nTFLOPs per GPU, number of GPUs and MFU must all be greater than zero."

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Create detailed breakdown
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""

    return breakdown

# Load GPU data
gpu_data = load_gpu_data()
# load_gpu_data() falls back to {"Custom": 0} when the CSV cannot be read;
# skip that key so the dropdown does not list "Custom" twice.
gpu_choices = ["Custom"] + [name for name in gpu_data if name != "Custom"]

# Create the Gradio interface
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")

    with gr.Row():
        # Left column: all inputs.
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            # GPU Selection
            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )

            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )

            # Hidden initially; shown when the checkbox is cleared or
            # "Custom" is picked in the dropdown (see handlers below).
            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )

            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )

        # Right column: rendered results.
        with gr.Column():
            output = gr.Markdown(label="Results")

    # Toggle between GPU model and custom TFLOPs
    def toggle_gpu_input(use_gpu, gpu_model_value):
        """Return visibility updates for (gpu_model, custom_tflops).

        The current dropdown selection is passed in as an event input;
        reading ``gpu_model.value`` here would only give the value the
        component was constructed with, not what the user has selected.
        """
        show_custom = (not use_gpu) or gpu_model_value == "Custom"
        return gr.update(visible=use_gpu), gr.update(visible=show_custom)

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Show custom TFLOPs when "Custom" is selected
    def check_custom_selected(gpu_model_value):
        """Return a visibility update showing custom_tflops only for "Custom"."""
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Set up live updating: a change to any input recomputes the result
    # from the full set of inputs.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]

    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Initial calculation
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()