See axolotl config
axolotl version: 0.9.2
base_model: google/gemma-3-12b-it
#load_in_4bit: true
auto_resume_from_checkpoints: false
# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true
tokenizer_config: le-llm/gemma-3-reasoning-tokenizer
# added_tokens_overrides: {6: "<|begin_of_thought|>", 7: "<|end_of_thought|>", 8: "<|begin_of_solution|>", 9: "<|end_of_solution|>"}
#chat_template: gemma3
eot_tokens:
- <end_of_turn>
shuffle_merged_datasets: true
datasets:
  # - path: le-llm/hermes3-uk
  #   type: chat_template
  #   field_messages: conversations
  #   message_property_mappings:
  #     role: from
  #     content: value
  - path: le-llm/open-thoughts-114K
    type: chat_template
    train_on_eos: all
    field_messages: conversations
    drop_system_message: true
    message_property_mappings:
      role: from
      content: value
dataset_processes: 64
#dataset_keep_in_memory: true
#dataloader_num_workers: 8
#dataloader_prefetch_factor: 16
dataset_prepared_path: last_run_prepared_reasoning
# val_set_size: 0.01
output_dir: ./outputs/lapa-v.0.1-reasoning-only-12b-eos
#adapter: qlora
#lora_model_dir:
sequence_len: 16384 # 2048 32768 #
sample_packing: true # true
pad_to_sequence_len: true
train_on_inputs: true
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 8
# The number of times to replicate the sharded model (DDP dimension).
# dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 8 # (default is 1, no CP)
# tiled_mlp: true
#context_parallel_size: 8
# dp_shard_size: 4
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
# spectrum
#- axolotl.integrations.spectrum.SpectrumPlugin
#spectrum_top_fraction: 0.5
#spectrum_model_name: google/gemma-3-12b-it
wandb_project: gemma-3-12b-reasoning
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_torch_fused # muon #adamw_bnb_8bit
lr_scheduler: warmup_stable_decay
learning_rate: 5e-5
lr_scheduler_kwargs: {"num_decay_steps": 150}
bf16: auto
# fp16:
tf32: false # TODO: double check precision impact
# deepspeed: deepspeed_configs/zero2.json # deepspeed_configs/zero3_bf16.json
# TODO: When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
#fsdp:
# - full_shard
# - auto_wrap
#fsdp_config:
# fsdp_offload_params: true
# fsdp_state_dict_type: FULL_STATE_DICT
# fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
#fp8: true
#fp8_enable_fsdp_float8_all_gather: true
#torch_compile: true
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  # fsdp_activation_checkpointing: true
gradient_checkpointing: true # required for activation offloading
activation_offloading: legacy
#gradient_checkpointing: true
#gradient_checkpointing_kwargs:
# use_reentrant: false
#activation_offloading: true
logging_steps: 1
flash_attention: true # not recommended for gemma3 due to soft logit capping, but it should be fixed in the latest flash attention
# xformers_attention: true
#eager_attention:
# torch_compile: True
warmup_steps: 150 #0.4
evals_per_epoch: 1
save_steps: 100
save_total_limit: 6
#saves_per_epoch: 1
weight_decay: 0.0
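The schedule configured above pairs a 150-step warmup with a long stable phase and a final 150-step decay (`num_decay_steps`). A rough sketch of the resulting learning-rate multiplier, assuming linear warmup, linear decay to zero, and a hypothetical 1,000-step run (the exact decay shape depends on the scheduler implementation in use):

```python
def wsd_lr_multiplier(step: int, total_steps: int,
                      warmup_steps: int = 150, decay_steps: int = 150) -> float:
    """Warmup-stable-decay multiplier applied to the peak learning_rate (5e-5 here)."""
    if step < warmup_steps:                       # linear warmup: 0 -> 1
        return step / max(1, warmup_steps)
    if step < total_steps - decay_steps:          # stable phase: hold at 1.0
        return 1.0
    # final decay to 0 over `decay_steps` steps (linear shape assumed for illustration)
    return max(0.0, (total_steps - step) / max(1, decay_steps))

peak_lr = 5e-5
for s in (0, 75, 150, 500, 925, 1000):            # hypothetical 1,000-step run
    print(s, peak_lr * wsd_lr_multiplier(s, total_steps=1000))
```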
outputs/lapa-v.0.1-reasoning-only-12b-eos
This model is a fine-tuned version of google/gemma-3-12b-it on the le-llm/open-thoughts-114K dataset.
Model description
More information needed
Intended uses & limitations
More information needed
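A minimal chat-style inference sketch, assuming the fine-tuned weights are published to the Hub; the repo id below is a placeholder rather than a confirmed checkpoint name, and depending on how the Gemma-3 checkpoint was saved, `Gemma3ForConditionalGeneration` may be required instead of `AutoModelForCausalLM`:

```python
# Minimal inference sketch. The repo id below is a placeholder, not a confirmed checkpoint name.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "le-llm/lapa-v.0.1-reasoning-only-12b-eos"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Prove that the square root of 2 is irrational."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=False))
```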
Training and evaluation data
More information needed
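The training data is read as ShareGPT-style records: a `conversations` list whose turns carry `from`/`value` keys, which the config remaps to `role`/`content` and from which system messages are dropped. A small sketch of that remapping; the example record is illustrative rather than taken from the dataset, and the reasoning delimiters mirror the added tokens referenced in the config:

```python
# Illustrative record in the `conversations` / `from` / `value` layout the config expects.
record = {
    "conversations": [
        {"from": "system", "value": "You are a careful reasoner."},
        {"from": "human", "value": "What is 17 * 23?"},
        # Reasoning delimiters shown only for illustration of the tokenizer's added tokens.
        {"from": "gpt", "value": "<|begin_of_thought|> 17 * 23 = 391 <|end_of_thought|>"
                                 "<|begin_of_solution|> 391 <|end_of_solution|>"},
    ]
}

def to_role_content(rec: dict, drop_system: bool = True) -> list[dict]:
    """Apply the role <- from / content <- value mapping, optionally dropping system turns."""
    turns = [{"role": t["from"], "content": t["value"]} for t in rec["conversations"]]
    return [t for t in turns if not (drop_system and t["role"] == "system")]

print(to_role_content(record))
```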
Training procedure
Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 4
- eval_batch_size: 4
- seed: 42
- distributed_type: multi-GPU
- num_devices: 48
- total_train_batch_size: 192
- total_eval_batch_size: 192
- optimizer: adamw_torch_fused with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: warmup_stable_decay
- lr_scheduler_warmup_steps: 150
- num_epochs: 1.0
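The effective batch size reported above is simply the product of the per-device batch size, the gradient-accumulation steps, and the device count; a quick check of the numbers:

```python
micro_batch_size = 4          # per-device train batch size
gradient_accumulation = 1
num_devices = 48

total_train_batch_size = micro_batch_size * gradient_accumulation * num_devices
assert total_train_batch_size == 192
```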
Training results
Framework versions
- Transformers 4.51.3
- Pytorch 2.6.0+cu124
- Datasets 3.5.1
- Tokenizers 0.21.2