|
|
+ deepspeed --master_port 29900 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/1000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000-Q2-1000 --log_type wandb --log_run_name imdb-Qwen1.5-4B-s3-Q1-5000-Q2-1000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
|
|
nvcc warning : incompatible redefinition for option
|
|
[rank3]:[W526 18:43:33.915613592 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank7]:[W526 18:43:33.463103285 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank1]:[W526 18:43:33.589776628 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank6]:[W526 18:43:34.800229132 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank5]:[W526 18:43:34.834361237 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank2]:[W526 18:43:34.834457161 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank4]:[W526 18:43:34.045972210 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
[rank0]:[W526 18:43:34.049912677 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
|
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000/config.json
|
|
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 128245,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 6912,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 20,
  "num_hidden_layers": 40,
  "num_key_value_heads": 20,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.1",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151646
}
|
|
|
|
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000/pytorch_model.bin

Will use torch_dtype=torch.bfloat16 as defined in model

Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.

Detected DeepSpeed ZeRO-3: activating zero.init() for this model
|
|
Generate config GenerationConfig {
  "bos_token_id": 128245,
  "eos_token_id": 151643,
  "pad_token_id": 151643
}
|
|
|
|
|
All model checkpoint weights were used when initializing Qwen2ForCausalLM.

All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000.

If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.

Generation config file not found, using a generation config created from the model config.
|
|
loading file vocab.json

loading file merges.txt

loading file tokenizer.json

loading file added_tokens.json

loading file special_tokens_map.json

loading file tokenizer_config.json

loading file chat_template.jinja

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
|
|
Detected CUDA files, patching ldflags |
|
|
Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
|
|
/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].

  warnings.warn(
|
|
Building extension module fused_adam... |
|
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
|
Loading extension module fused_adam...
|
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
|
wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
|
|
wandb: Tracking run with wandb version 0.19.11 |
|
|
wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000-Q2-1000/wandb/run-20250526_184356-sciuebo8 |
|
|
wandb: Run `wandb offline` to turn off syncing. |
|
|
wandb: Syncing run imdb-Qwen1.5-4B-s3-Q1-5000-Q2-1000 |
|
|
wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb
|
|
wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/sciuebo8
|
|
Training 1/1 epoch: 0%|          | 0/125 [00:00<?, ?it/s]
Training 1/1 epoch (loss 2.8086): 1%|          | 1/125 [00:05<12:22, 5.99s/it]
Training 1/1 epoch (loss 2.9033): 2%|█         | 2/125 [00:08<07:35, 3.70s/it]
Training 1/1 epoch (loss 2.8320): 2%|█         | 3/125 [00:08<04:35, 2.26s/it]
Training 1/1 epoch (loss 2.7796): 3%|█         | 4/125 [00:09<03:10, 1.57s/it]
Training 1/1 epoch (loss 2.7771): 4%|█         | 5/125 [00:09<02:24, 1.20s/it]
Training 1/1 epoch (loss 2.9416): 5%|█         | 6/125 [00:10<01:57, 1.01it/s]
Training 1/1 epoch (loss 2.9804): 6%|█         | 7/125 [00:10<01:40, 1.18it/s]
Training 1/1 epoch (loss 3.0169): 6%|█         | 8/125 [00:11<01:32, 1.26it/s]
Training 1/1 epoch (loss 2.7500): 7%|█         | 9/125 [00:12<01:28, 1.31it/s]
Training 1/1 epoch (loss 2.8292): 8%|█         | 10/125 [00:12<01:18, 1.46it/s]
Training 1/1 epoch (loss 2.8130): 9%|█         | 11/125 [00:13<01:12, 1.57it/s]
Training 1/1 epoch (loss 2.8049): 10%|█         | 12/125 [00:13<01:09, 1.62it/s]
Training 1/1 epoch (loss 2.9779): 10%|█         | 13/125 [00:14<01:07, 1.67it/s]
Training 1/1 epoch (loss 2.6157): 11%|█         | 14/125 [00:14<01:03, 1.73it/s]
Training 1/1 epoch (loss 3.0676): 12%|██        | 15/125 [00:15<01:02, 1.75it/s]
Training 1/1 epoch (loss 2.7406): 13%|██        | 16/125 [00:16<01:03, 1.72it/s]
Training 1/1 epoch (loss 2.7318): 14%|██        | 17/125 [00:16<01:00, 1.78it/s]
Training 1/1 epoch (loss 3.0532): 14%|██        | 18/125 [00:17<00:59, 1.81it/s]
Training 1/1 epoch (loss 2.9556): 15%|██        | 19/125 [00:17<00:59, 1.80it/s]
Training 1/1 epoch (loss 2.9911): 16%|██        | 20/125 [00:18<00:59, 1.76it/s]
Training 1/1 epoch (loss 2.8940): 17%|██        | 21/125 [00:18<00:57, 1.82it/s]
Training 1/1 epoch (loss 2.8398): 18%|██        | 22/125 [00:19<00:56, 1.82it/s]
Training 1/1 epoch (loss 2.7137): 18%|██        | 23/125 [00:19<00:55, 1.83it/s]
Training 1/1 epoch (loss 2.8086): 19%|██        | 24/125 [00:20<00:57, 1.76it/s]
Training 1/1 epoch (loss 2.8010): 20%|██        | 25/125 [00:21<00:55, 1.79it/s]
Training 1/1 epoch (loss 2.7463): 21%|██        | 26/125 [00:21<00:54, 1.81it/s]
Training 1/1 epoch (loss 2.7333): 22%|███       | 27/125 [00:22<00:54, 1.79it/s]
Training 1/1 epoch (loss 2.7569): 22%|███       | 28/125 [00:22<00:53, 1.81it/s]
Training 1/1 epoch (loss 2.9083): 23%|███       | 29/125 [00:23<00:52, 1.84it/s]
Training 1/1 epoch (loss 2.6230): 24%|███       | 30/125 [00:23<00:51, 1.85it/s]
Training 1/1 epoch (loss 2.9759): 25%|███       | 31/125 [00:24<00:50, 1.84it/s]
Training 1/1 epoch (loss 2.8031): 26%|███       | 32/125 [00:24<00:52, 1.78it/s]
Training 1/1 epoch (loss 3.0853): 26%|███       | 33/125 [00:25<00:50, 1.81it/s]
Training 1/1 epoch (loss 2.8594): 27%|███       | 34/125 [00:25<00:48, 1.87it/s]
Training 1/1 epoch (loss 2.9952): 28%|███       | 35/125 [00:26<00:49, 1.81it/s]
Training 1/1 epoch (loss 2.9251): 29%|███       | 36/125 [00:27<00:48, 1.83it/s]
Training 1/1 epoch (loss 2.7345): 30%|███       | 37/125 [00:27<00:48, 1.83it/s]
Training 1/1 epoch (loss 2.6794): 30%|███       | 38/125 [00:28<00:47, 1.85it/s]
Training 1/1 epoch (loss 2.7171): 31%|███       | 39/125 [00:28<00:46, 1.84it/s]
Training 1/1 epoch (loss 2.8128): 32%|████      | 40/125 [00:29<00:46, 1.83it/s]
Training 1/1 epoch (loss 2.6852): 33%|████      | 41/125 [00:29<00:46, 1.81it/s]
Training 1/1 epoch (loss 2.8776): 34%|████      | 42/125 [00:30<00:47, 1.73it/s]
Training 1/1 epoch (loss 2.8635): 34%|████      | 43/125 [00:30<00:46, 1.75it/s]
Training 1/1 epoch (loss 3.0968): 35%|████      | 44/125 [00:31<00:45, 1.79it/s]
Training 1/1 epoch (loss 2.8669): 36%|████      | 45/125 [00:32<00:43, 1.82it/s]
Training 1/1 epoch (loss 2.9190): 37%|████      | 46/125 [00:32<00:43, 1.83it/s]
Training 1/1 epoch (loss 2.6432): 38%|████      | 47/125 [00:33<00:41, 1.86it/s]
Training 1/1 epoch (loss 2.7058): 38%|████      | 48/125 [00:33<00:42, 1.81it/s]
Training 1/1 epoch (loss 2.9041): 39%|████      | 49/125 [00:34<00:44, 1.72it/s]
Training 1/1 epoch (loss 2.7543): 40%|████      | 50/125 [00:34<00:43, 1.74it/s]
Training 1/1 epoch (loss 2.7231): 41%|████      | 51/125 [00:35<00:41, 1.80it/s]
Training 1/1 epoch (loss 2.9147): 42%|█████     | 52/125 [00:35<00:40, 1.80it/s]
Training 1/1 epoch (loss 2.8400): 42%|█████     | 53/125 [00:36<00:40, 1.79it/s]
Training 1/1 epoch (loss 2.7783): 43%|█████     | 54/125 [00:37<00:39, 1.81it/s]
Training 1/1 epoch (loss 2.7910): 44%|█████     | 55/125 [00:37<00:38, 1.82it/s]
Training 1/1 epoch (loss 2.8935): 45%|█████     | 56/125 [00:38<00:40, 1.70it/s]
Training 1/1 epoch (loss 2.8649): 46%|█████     | 57/125 [00:38<00:38, 1.76it/s]
Training 1/1 epoch (loss 2.9541): 46%|█████     | 58/125 [00:39<00:38, 1.76it/s]
Training 1/1 epoch (loss 2.7959): 47%|█████     | 59/125 [00:39<00:36, 1.79it/s]
Training 1/1 epoch (loss 2.8459): 48%|█████     | 60/125 [00:40<00:37, 1.75it/s]
Training 1/1 epoch (loss 2.8276): 49%|█████     | 61/125 [00:41<00:35, 1.79it/s]
Training 1/1 epoch (loss 2.7743): 50%|█████     | 62/125 [00:41<00:34, 1.82it/s]
Training 1/1 epoch (loss 2.7465): 50%|█████     | 63/125 [00:42<00:33, 1.83it/s]
Training 1/1 epoch (loss 2.8475): 51%|█████     | 64/125 [00:42<00:34, 1.75it/s]
Training 1/1 epoch (loss 2.7159): 52%|██████    | 65/125 [00:43<00:34, 1.76it/s]
Training 1/1 epoch (loss 2.9198): 53%|██████    | 66/125 [00:43<00:32, 1.79it/s]
Training 1/1 epoch (loss 2.7581): 54%|██████    | 67/125 [00:44<00:34, 1.69it/s]
Training 1/1 epoch (loss 2.9804): 54%|██████    | 68/125 [00:45<00:32, 1.74it/s]
Training 1/1 epoch (loss 2.9205): 55%|██████    | 69/125 [00:45<00:31, 1.76it/s]
Training 1/1 epoch (loss 2.7249): 56%|██████    | 70/125 [00:46<00:31, 1.75it/s]
Training 1/1 epoch (loss 2.7856): 57%|██████    | 71/125 [00:46<00:30, 1.79it/s]
Training 1/1 epoch (loss 2.8306): 58%|██████    | 72/125 [00:47<00:30, 1.75it/s]
Training 1/1 epoch (loss 3.0287): 58%|██████    | 73/125 [00:47<00:29, 1.76it/s]
Training 1/1 epoch (loss 2.9266): 59%|██████    | 74/125 [00:48<00:29, 1.74it/s]
Training 1/1 epoch (loss 2.7800): 60%|██████    | 75/125 [00:48<00:28, 1.78it/s]
Training 1/1 epoch (loss 2.8985): 61%|██████    | 76/125 [00:49<00:27, 1.80it/s]
Training 1/1 epoch (loss 2.9295): 62%|███████   | 77/125 [00:50<00:26, 1.79it/s]
Training 1/1 epoch (loss 2.6662): 62%|███████   | 78/125 [00:50<00:26, 1.78it/s]
Training 1/1 epoch (loss 2.8598): 63%|███████   | 79/125 [00:51<00:25, 1.80it/s]
Training 1/1 epoch (loss 2.8472): 64%|███████   | 80/125 [00:51<00:25, 1.74it/s]
Training 1/1 epoch (loss 2.8888): 65%|███████   | 81/125 [00:52<00:25, 1.75it/s]
Training 1/1 epoch (loss 2.5759): 66%|███████   | 82/125 [00:52<00:23, 1.81it/s]
Training 1/1 epoch (loss 3.0098): 66%|███████   | 83/125 [00:53<00:23, 1.82it/s]
Training 1/1 epoch (loss 2.8825): 67%|███████   | 84/125 [00:53<00:22, 1.85it/s]
Training 1/1 epoch (loss 2.8914): 68%|███████   | 85/125 [00:54<00:21, 1.83it/s]
Training 1/1 epoch (loss 2.8464): 69%|███████   | 86/125 [00:55<00:21, 1.84it/s]
Training 1/1 epoch (loss 2.8129): 70%|███████   | 87/125 [00:55<00:20, 1.86it/s]
Training 1/1 epoch (loss 2.9121): 70%|███████   | 88/125 [00:56<00:20, 1.77it/s]
Training 1/1 epoch (loss 2.7262): 71%|███████   | 89/125 [00:56<00:20, 1.78it/s]
Training 1/1 epoch (loss 2.7748): 72%|████████  | 90/125 [00:57<00:19, 1.80it/s]
Training 1/1 epoch (loss 2.7814): 73%|████████  | 91/125 [00:57<00:18, 1.82it/s]
Training 1/1 epoch (loss 2.9312): 74%|████████  | 92/125 [00:58<00:18, 1.79it/s]
Training 1/1 epoch (loss 2.9050): 74%|████████  | 93/125 [00:58<00:17, 1.84it/s]
Training 1/1 epoch (loss 2.7956): 75%|████████  | 94/125 [00:59<00:16, 1.83it/s]
Training 1/1 epoch (loss 3.1096): 76%|████████  | 95/125 [00:59<00:16, 1.85it/s]
Training 1/1 epoch (loss 2.6736): 77%|████████  | 96/125 [01:00<00:16, 1.72it/s]
Training 1/1 epoch (loss 2.8092): 78%|████████  | 97/125 [01:01<00:15, 1.78it/s]
Training 1/1 epoch (loss 2.9019): 78%|████████  | 98/125 [01:01<00:15, 1.74it/s]
Training 1/1 epoch (loss 2.8018): 79%|████████  | 99/125 [01:02<00:14, 1.79it/s]
Training 1/1 epoch (loss 2.6683): 80%|████████  | 100/125 [01:02<00:13, 1.82it/s]
Training 1/1 epoch (loss 2.9615): 81%|████████  | 101/125 [01:03<00:13, 1.81it/s]
Training 1/1 epoch (loss 3.0599): 82%|█████████ | 102/125 [01:03<00:12, 1.84it/s]
Training 1/1 epoch (loss 2.8352): 82%|█████████ | 103/125 [01:04<00:12, 1.80it/s]
Training 1/1 epoch (loss 2.9529): 83%|█████████ | 104/125 [01:05<00:11, 1.78it/s]
Training 1/1 epoch (loss 2.7029): 84%|█████████ | 105/125 [01:05<00:11, 1.82it/s]
Training 1/1 epoch (loss 3.0410): 85%|█████████ | 106/125 [01:06<00:10, 1.84it/s]
Training 1/1 epoch (loss 2.7861): 86%|█████████ | 107/125 [01:06<00:09, 1.80it/s]
Training 1/1 epoch (loss 3.1313): 86%|█████████ | 108/125 [01:07<00:09, 1.85it/s]
Training 1/1 epoch (loss 2.7826): 87%|█████████ | 109/125 [01:07<00:08, 1.78it/s]
Training 1/1 epoch (loss 2.8513): 88%|█████████ | 110/125 [01:08<00:08, 1.72it/s]
Training 1/1 epoch (loss 2.9457): 89%|█████████ | 111/125 [01:08<00:07, 1.75it/s]
Training 1/1 epoch (loss 2.7460): 90%|█████████ | 112/125 [01:09<00:07, 1.75it/s]
Training 1/1 epoch (loss 2.8750): 90%|█████████ | 113/125 [01:10<00:06, 1.79it/s]
Training 1/1 epoch (loss 2.8514): 91%|█████████ | 114/125 [01:10<00:06, 1.77it/s]
Training 1/1 epoch (loss 2.8475): 92%|██████████| 115/125 [01:11<00:05, 1.83it/s]
Training 1/1 epoch (loss 2.8758): 93%|██████████| 116/125 [01:11<00:04, 1.85it/s]
Training 1/1 epoch (loss 2.8105): 94%|██████████| 117/125 [01:12<00:04, 1.82it/s]
Training 1/1 epoch (loss 2.8483): 94%|██████████| 118/125 [01:12<00:03, 1.81it/s]
Training 1/1 epoch (loss 2.8281): 95%|██████████| 119/125 [01:13<00:03, 1.81it/s]
Training 1/1 epoch (loss 3.1678): 96%|██████████| 120/125 [01:13<00:02, 1.80it/s]
Training 1/1 epoch (loss 2.8221): 97%|██████████| 121/125 [01:14<00:02, 1.73it/s]
Training 1/1 epoch (loss 2.8274): 98%|██████████| 122/125 [01:15<00:01, 1.76it/s]
Training 1/1 epoch (loss 2.7508): 98%|██████████| 123/125 [01:15<00:01, 1.76it/s]
Training 1/1 epoch (loss 2.7651): 99%|██████████| 124/125 [01:16<00:00, 1.78it/s]
Training 1/1 epoch (loss 2.8530): 100%|██████████| 125/125 [01:16<00:00, 1.79it/s]
Training 1/1 epoch (loss 2.8530): 100%|██████████| 125/125 [01:16<00:00, 1.63it/s]
|
|
chat template saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000-Q2-1000/chat_template.jinja |
|
|
tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000-Q2-1000/tokenizer_config.json |
|
|
Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-5000-Q2-1000/special_tokens_map.json |
|
|
wandb: ERROR Problem finishing run |
|
|
Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550e41a9450>> |
|
|
Traceback (most recent call last): |
|
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
|
|
return func(*args, **kwargs) |
|
|
^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
|
|
self.wandb.finish() |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
|
|
return func(self, *args, **kwargs) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper |
|
|
return func(self, *args, **kwargs) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper |
|
|
return func(self, *args, **kwargs) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish |
|
|
return self._finish(exit_code) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
|
|
return func(self, *args, **kwargs) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish |
|
|
self._atexit_cleanup(exit_code=exit_code) |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup |
|
|
self._on_finish() |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish |
|
|
wait_with_progress( |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
|
|
return wait_all_with_progress( |
|
|
^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
|
|
return asyncio_compat.run(progress_loop_with_timeout) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
|
|
future = executor.submit(runner.run, fn) |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
|
|
raise RuntimeError( |
|
|
RuntimeError: cannot schedule new futures after interpreter shutdown |
|
|
|