{ "base_model": "meta-llama/Llama-3.1-8B-Instruct", "dataset": "newmindai/euro_hpc-legal", "training_args": { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "trust_remote_code": true, "dataset": "newmindai/euro_hpc-legal", "dataset_config": null, "split": "train", "format": "qa", "seq_len": 4096, "num_samples": null, "combine_configs": true, "output_dir": "./Llama-3.1-8B-Instruct_w16a16_4nodes", "num_epochs": 1, "max_steps": 1000, "batch_size": 4, "gradient_accumulation_steps": 8, "lr": 2e-05, "weight_decay": 0.01, "adam_beta1": 0.9, "adam_beta2": 0.95, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "lr_scheduler_type": "cosine", "warmup_steps": 100, "warmup_ratio": 0.1, "dtype": "bf16", "fp8": false, "save_steps": 500, "save_total_limit": 5, "save_best_metric": "val_loss", "resume_from_checkpoint": null, "val_steps": 10, "val_samples": 500, "val_batch_size": null, "eval_split": "test", "eval_samples": 2000, "run_final_eval": true, "logging_steps": 1, "log_file": "./Llama-3.1-8B-Instruct_w16a16_4nodes/logs/training.log", "use_wandb": true, "wandb_entity": null, "use_tensorboard": false, "tensorboard_dir": null, "compile": false, "gradient_checkpointing": true, "seed": 100, "fsdp_cpu_offload": false, "enable_profiler": true, "profiler_output_dir": "./Llama-3.1-8B-Instruct_w16a16_4nodes/profiler", "profiler_wait_steps": 10, "profiler_warmup_steps": 0, "profiler_active_steps": 5, "profiler_repeat": 1, "profiler_record_shapes": false, "profiler_profile_memory": false, "profiler_with_stack": false, "profiler_with_flops": false, "profiler_with_modules": false, "profiler_print_summary": false, "enable_tf32": true, "disable_tf32": false, "cudnn_benchmark": true, "flash_attention": true, "memory_efficient_attention": true, "attn_implementation": "sdpa", "input_scaling_granularity": "axiswise", "input_dtype": "e4m3fn", "weight_scaling_granularity": "axiswise", "weight_dtype": "e4m3fn", "grad_output_scaling_granularity": "axiswise", "grad_output_dtype": "e4m3fn", "fp8_recipe": null }, "checkpoint_type": "huggingface_standard", "converted_from": "./Llama-3.1-8B-Instruct_w16a16_4nodes/final_model/sharded" }