{ "base_model": "meta-llama/Llama-3.1-8B-Instruct", "dataset": "newmindai/euro_hpc-legal", "training_args": { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "trust_remote_code": true, "dataset": "newmindai/euro_hpc-legal", "dataset_config": null, "split": "train", "format": "qa", "seq_len": 4096, "num_samples": null, "combine_configs": true, "output_dir": "./Llama-3.1-8B-Instruct_w16a16_4nodes", "num_epochs": 1, "max_steps": 1000, "batch_size": 4, "gradient_accumulation_steps": 8, "lr": 2e-05, "weight_decay": 0.01, "adam_beta1": 0.9, "adam_beta2": 0.95, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "lr_scheduler_type": "cosine", "warmup_steps": 100, "warmup_ratio": 0.1, "dtype": "bf16", "fp8": false, "save_steps": 500, "save_total_limit": 5, "save_best_metric": "val_loss", "resume_from_checkpoint": null, "val_steps": 10, "val_samples": 500, "val_batch_size": null, "eval_split": "test", "eval_samples": 2000, "run_final_eval": true, "logging_steps": 1, "log_file": "./Llama-3.1-8B-Instruct_w16a16_4nodes/logs/training.log", "use_wandb": true, "wandb_entity": null, "use_tensorboard": false, "tensorboard_dir": null, "compile": false, "gradient_checkpointing": true, "seed": 100, "fsdp_cpu_offload": false, "enable_profiler": true, "profiler_output_dir": "./Llama-3.1-8B-Instruct_w16a16_4nodes/profiler", "profiler_wait_steps": 10, "profiler_warmup_steps": 0, "profiler_active_steps": 5, "profiler_repeat": 1, "profiler_record_shapes": false, "profiler_profile_memory": false, "profiler_with_stack": false, "profiler_with_flops": false, "profiler_with_modules": false, "profiler_print_summary": false, "enable_tf32": true, "disable_tf32": false, "cudnn_benchmark": true, "flash_attention": true, "memory_efficient_attention": true, "attn_implementation": "sdpa", "input_scaling_granularity": "axiswise", "input_dtype": "e4m3fn", "weight_scaling_granularity": "axiswise", "weight_dtype": "e4m3fn", "grad_output_scaling_granularity": "axiswise", "grad_output_dtype": "e4m3fn", "fp8_recipe": null }, "checkpoint_type": "huggingface_standard", "converted_from": "./Llama-3.1-8B-Instruct_w16a16_4nodes/final_model/sharded" }