{ "model_config": { "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit", "use_cache": false, "rope_scaling": { "type": "dynamic", "factor": 2.0 } }, "training_config": { "num_train_epochs": 3, "per_device_train_batch_size": 2, "gradient_accumulation_steps": 4, "learning_rate": 2e-5, "lr_scheduler_type": "cosine", "warmup_ratio": 0.03, "weight_decay": 0.01, "optim": "adamw_torch", "max_grad_norm": 0.3, "max_seq_length": 2048, "logging_steps": 10, "save_steps": 200, "save_total_limit": 3, "evaluation_strategy": "steps", "eval_steps": 200, "load_best_model_at_end": true, "output_dir": "fine_tuned_model", "disable_tqdm": false, "report_to": ["tensorboard"], "logging_first_step": true }, "hardware_config": { "fp16": true, "bf16": false, "gradient_checkpointing": true, "device_map": "auto", "attn_implementation": "eager", "use_flash_attention": false }, "quantization_config": { "load_in_4bit": true, "bnb_4bit_compute_dtype": "float16", "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": true }, "lora_config": { "r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "bias": "none", "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ] }, "dataset_config": { "sort_by_field": "prompt_number", "max_tokens": 2048, "text_field": "conversations", "training_phase_only": true, "pre_tokenized": true, "input_ids_field": "input_ids", "skip_tokenization": true } }