+ set -euxo pipefail + project_name=codev-r1-qwen35-9b-full + : python + exp_name=dapo-qwen35-9b-python-full + adv_estimator=grpo + kl_coef=0.0 + kl_loss_coef=0.0 + clip_ratio_low=0.2 + clip_ratio_high=0.28 + enable_overlong_buffer=True + overlong_buffer_len=1024 + overlong_penalty_factor=1.0 + enable_filter_groups=False + use_token_level_loss=True + train_prompt_bsz=8 + train_prompt_mini_bsz=8 + n_resp_per_prompt=4 + max_prompt_length=2048 + max_response_length=1024 + val_top_k=-1 + use_dynamic_bsz=False + actor_ppo_max_token_len=3072 + infer_ppo_max_token_len=3072 + ppo_max_token_len_per_gpu=3072 + gen_tp=2 + sp_size=1 + actor_param_offload=False + actor_optim_offload=True + ref_param_offload=True + NNODES=1 + USER_GPUS_PER_NODE=8 + export VLLM_USE_V1=1 + VLLM_USE_V1=1 + MODEL_PATH=/data/ckpt/checkpoint-570 + DATA_PATH=/data/data/python + SAVE_DIR=/data/save/python + mkdir -p /data/save/python + [[ python == \v\e\r\i\l\o\g ]] + [[ python == \p\y\t\h\o\n ]] + reward_fn_path=verl/utils/reward_score/codev_py.py + reward_fn_name=compute_score_wrapper + PYBIN=python + python -m verl.trainer.main_ppo algorithm.adv_estimator=grpo data.train_files=/data/data/python/train.parquet data.val_files=/data/data/python/val.parquet data.train_batch_size=8 data.val_batch_size=128 data.max_prompt_length=2048 data.max_response_length=1024 algorithm.filter_groups.enable=False algorithm.filter_groups.max_num_gen_batches=999 algorithm.filter_groups.metric=acc algorithm.filter_groups.accelerate=True data.gen_batch_size=8 actor_rollout_ref.model.path=/data/ckpt/checkpoint-570 +actor_rollout_ref.model.override_config.attention_dropout=0. +actor_rollout_ref.model.override_config.embd_pdrop=0. +actor_rollout_ref.model.override_config.resid_pdrop=0. actor_rollout_ref.model.enable_gradient_checkpointing=True +actor_rollout_ref.model.use_liger=True actor_rollout_ref.actor.optim.lr=1e-6 actor_rollout_ref.actor.optim.weight_decay=0.0 actor_rollout_ref.actor.use_dynamic_bsz=False actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 actor_rollout_ref.model.use_remove_padding=False actor_rollout_ref.actor.clip_ratio_low=0.2 actor_rollout_ref.actor.clip_ratio_high=0.28 actor_rollout_ref.actor.ppo_mini_batch_size=8 actor_rollout_ref.actor.use_kl_loss=False actor_rollout_ref.actor.kl_loss_coef=0.0 actor_rollout_ref.actor.kl_loss_type=low_var_kl actor_rollout_ref.actor.entropy_coeff=0.001 actor_rollout_ref.actor.grad_clip=0.5 actor_rollout_ref.actor.use_token_level_loss=True actor_rollout_ref.actor.fsdp_config.param_offload=False actor_rollout_ref.actor.fsdp_config.optimizer_offload=True actor_rollout_ref.actor.fsdp_config.wrap_policy.min_num_params=100000000 actor_rollout_ref.ref.fsdp_config.wrap_policy.min_num_params=100000000 actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 actor_rollout_ref.rollout.tensor_model_parallel_size=2 actor_rollout_ref.rollout.name=hf actor_rollout_ref.rollout.n=4 actor_rollout_ref.rollout.val_kwargs.n=4 actor_rollout_ref.rollout.temperature=1.0 actor_rollout_ref.rollout.val_kwargs.temperature=1.0 actor_rollout_ref.rollout.val_kwargs.do_sample=True actor_rollout_ref.rollout.gpu_memory_utilization=0.85 actor_rollout_ref.rollout.enforce_eager=False actor_rollout_ref.rollout.free_cache_engine=False reward_model.reward_manager=prime actor_rollout_ref.ref.fsdp_config.param_offload=True custom_reward_function.overlong_buffer.enable=True custom_reward_function.overlong_buffer.len=1024 custom_reward_function.overlong_buffer.penalty_factor=1.0 custom_reward_function.train.path=verl/utils/reward_score/codev_py.py custom_reward_function.train.name=compute_score_wrapper algorithm.kl_ctrl.kl_coef=0.0 trainer.critic_warmup=0 'trainer.logger=[console,wandb]' trainer.project_name=codev-r1-qwen35-9b-full trainer.experiment_name=dapo-qwen35-9b-python-full trainer.n_gpus_per_node=8 trainer.nnodes=1 +trainer.val_before_train=False trainer.default_local_dir=/data/save/python trainer.resume_mode=auto trainer.default_hdfs_dir=null trainer.save_freq=30 trainer.test_freq=999999 +trainer.total_training_steps=200 trainer.total_epochs=20 actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 2026-05-05 06:21:18,985 INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265  /usr/local/lib/python3.12/dist-packages/ray/_private/worker.py:2052: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0 warnings.warn( (TaskRunner pid=54845) {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model', (TaskRunner pid=54845) 'hf_model', (TaskRunner pid=54845) 'optimizer', (TaskRunner pid=54845) 'extra']}, (TaskRunner pid=54845) 'clip_ratio': 0.2, (TaskRunner pid=54845) 'clip_ratio_high': 0.28, (TaskRunner pid=54845) 'clip_ratio_low': 0.2, (TaskRunner pid=54845) 'entropy_coeff': 0.001, (TaskRunner pid=54845) 'fsdp_config': {'fsdp_size': -1, (TaskRunner pid=54845) 'optimizer_offload': True, (TaskRunner pid=54845) 'param_offload': False, (TaskRunner pid=54845) 'wrap_policy': {'min_num_params': 100000000}}, (TaskRunner pid=54845) 'grad_clip': 0.5, (TaskRunner pid=54845) 'kl_loss_coef': 0.0, (TaskRunner pid=54845) 'kl_loss_type': 'low_var_kl', (TaskRunner pid=54845) 'optim': {'lr': 1e-06, (TaskRunner pid=54845) 'lr_warmup_steps': -1, (TaskRunner pid=54845) 'lr_warmup_steps_ratio': 0.0, (TaskRunner pid=54845) 'min_lr_ratio': None, (TaskRunner pid=54845) 'total_training_steps': -1, (TaskRunner pid=54845) 'warmup_style': 'constant', (TaskRunner pid=54845) 'weight_decay': 0.0}, (TaskRunner pid=54845) 'ppo_epochs': 1, (TaskRunner pid=54845) 'ppo_max_token_len_per_gpu': 16384, (TaskRunner pid=54845) 'ppo_micro_batch_size': None, (TaskRunner pid=54845) 'ppo_micro_batch_size_per_gpu': 1, (TaskRunner pid=54845) 'ppo_mini_batch_size': 8, (TaskRunner pid=54845) 'shuffle': False, (TaskRunner pid=54845) 'strategy': 'fsdp', (TaskRunner pid=54845) 'ulysses_sequence_parallel_size': 1, (TaskRunner pid=54845) 'use_dynamic_bsz': False, (TaskRunner pid=54845) 'use_kl_loss': False, (TaskRunner pid=54845) 'use_token_level_loss': True, (TaskRunner pid=54845) 'use_torch_compile': True}, (TaskRunner pid=54845) 'hybrid_engine': True, (TaskRunner pid=54845) 'model': {'enable_gradient_checkpointing': True, (TaskRunner pid=54845) 'external_lib': None, (TaskRunner pid=54845) 'override_config': {'attention_dropout': 0.0, (TaskRunner pid=54845) 'embd_pdrop': 0.0, (TaskRunner pid=54845) 'resid_pdrop': 0.0}, (TaskRunner pid=54845) 'path': '/data/ckpt/checkpoint-570', (TaskRunner pid=54845) 'use_liger': True, (TaskRunner pid=54845) 'use_remove_padding': False}, (TaskRunner pid=54845) 'ref': {'fsdp_config': {'param_offload': True, (TaskRunner pid=54845) 'wrap_policy': {'min_num_params': 100000000}}, (TaskRunner pid=54845) 'log_prob_max_token_len_per_gpu': 16384, (TaskRunner pid=54845) 'log_prob_micro_batch_size': None, (TaskRunner pid=54845) 'log_prob_micro_batch_size_per_gpu': 1, (TaskRunner pid=54845) 'log_prob_use_dynamic_bsz': False, (TaskRunner pid=54845) 'ulysses_sequence_parallel_size': 1}, (TaskRunner pid=54845) 'rollout': {'disable_log_stats': True, (TaskRunner pid=54845) 'do_sample': True, (TaskRunner pid=54845) 'dtype': 'bfloat16', (TaskRunner pid=54845) 'enable_chunked_prefill': True, (TaskRunner pid=54845) 'enforce_eager': False, (TaskRunner pid=54845) 'free_cache_engine': False, (TaskRunner pid=54845) 'gpu_memory_utilization': 0.85, (TaskRunner pid=54845) 'ignore_eos': False, (TaskRunner pid=54845) 'load_format': 'dummy_dtensor', (TaskRunner pid=54845) 'log_prob_max_token_len_per_gpu': 16384, (TaskRunner pid=54845) 'log_prob_micro_batch_size': None, (TaskRunner pid=54845) 'log_prob_micro_batch_size_per_gpu': 1, (TaskRunner pid=54845) 'log_prob_use_dynamic_bsz': False, (TaskRunner pid=54845) 'max_model_len': None, (TaskRunner pid=54845) 'max_num_batched_tokens': 8192, (TaskRunner pid=54845) 'max_num_seqs': 1024, (TaskRunner pid=54845) 'n': 4, (TaskRunner pid=54845) 'name': 'hf', (TaskRunner pid=54845) 'prompt_length': 2048, (TaskRunner pid=54845) 'response_length': 1024, (TaskRunner pid=54845) 'temperature': 1.0, (TaskRunner pid=54845) 'tensor_model_parallel_size': 2, (TaskRunner pid=54845) 'top_k': -1, (TaskRunner pid=54845) 'top_p': 1, (TaskRunner pid=54845) 'use_fire_sampling': False, (TaskRunner pid=54845) 'val_kwargs': {'do_sample': True, (TaskRunner pid=54845) 'n': 4, (TaskRunner pid=54845) 'temperature': 1.0, (TaskRunner pid=54845) 'top_k': -1, (TaskRunner pid=54845) 'top_p': 1.0}}}, (TaskRunner pid=54845) 'algorithm': {'adv_estimator': 'grpo', (TaskRunner pid=54845) 'filter_groups': {'accelerate': True, (TaskRunner pid=54845) 'enable': False, (TaskRunner pid=54845) 'max_num_gen_batches': 999, (TaskRunner pid=54845) 'metric': 'acc'}, (TaskRunner pid=54845) 'gamma': 1.0, (TaskRunner pid=54845) 'kl_ctrl': {'kl_coef': 0.0, 'type': 'fixed'},(TaskRunner pid=54845) W0505 06:21:34.741000 54845 torch/utils/cpp_extension.py:118] No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' (TaskRunner pid=54845) 'kl_penalty': 'kl', (TaskRunner pid=54845) 'lam': 1.0}, (TaskRunner pid=54845) 'critic': {'checkpoint': {'contents': ['model', (TaskRunner pid=54845) 'hf_model', (TaskRunner pid=54845) 'optimizer', (TaskRunner pid=54845) 'extra']}, (TaskRunner pid=54845) 'cliprange_value': 0.5, (TaskRunner pid=54845) 'forward_max_token_len_per_gpu': 32768, (TaskRunner pid=54845) 'forward_micro_batch_size': None, (TaskRunner pid=54845) 'forward_micro_batch_size_per_gpu': None, (TaskRunner pid=54845) 'grad_clip': 1.0, (TaskRunner pid=54845) 'model': {'enable_gradient_checkpointing': True, (TaskRunner pid=54845) 'external_lib': None, (TaskRunner pid=54845) 'fsdp_config': {'fsdp_size': -1, (TaskRunner pid=54845) 'optimizer_offload': False, (TaskRunner pid=54845) 'param_offload': False, (TaskRunner pid=54845) 'wrap_policy': {'min_num_params': 0}}, (TaskRunner pid=54845) 'override_config': {}, (TaskRunner pid=54845) 'path': '~/models/deepseek-llm-7b-chat', (TaskRunner pid=54845) 'tokenizer_path': '/data/ckpt/checkpoint-570', (TaskRunner pid=54845) 'use_remove_padding': False}, (TaskRunner pid=54845) 'optim': {'lr': 1e-05, (TaskRunner pid=54845) 'lr_warmup_steps_ratio': 0.0, (TaskRunner pid=54845) 'min_lr_ratio': None, (TaskRunner pid=54845) 'total_training_steps': -1, (TaskRunner pid=54845) 'warmup_style': 'constant', (TaskRunner pid=54845) 'weight_decay': 0.01}, (TaskRunner pid=54845) 'ppo_epochs': 1, (TaskRunner pid=54845) 'ppo_max_token_len_per_gpu': 32768, (TaskRunner pid=54845) 'ppo_micro_batch_size': None, (TaskRunner pid=54845) 'ppo_micro_batch_size_per_gpu': None, (TaskRunner pid=54845) 'ppo_mini_batch_size': 8, (TaskRunner pid=54845) 'shuffle': False, (TaskRunner pid=54845) 'strategy': 'fsdp', (TaskRunner pid=54845) 'ulysses_sequence_parallel_size': 1, (TaskRunner pid=54845) 'use_dynamic_bsz': False}, (TaskRunner pid=54845) 'custom_reward_function': {'overlong_buffer': {'enable': True, (TaskRunner pid=54845) 'len': 1024, (TaskRunner pid=54845) 'log': False, (TaskRunner pid=54845) 'penalty_factor': 1.0}, (TaskRunner pid=54845) 'test': {'name': 'compute_score_wrapper', (TaskRunner pid=54845) 'path': 'verl/utils/reward_score/codev_py.py'}, (TaskRunner pid=54845) 'train': {'name': 'compute_score_wrapper', (TaskRunner pid=54845) 'path': 'verl/utils/reward_score/codev_py.py'}}, (TaskRunner pid=54845) 'data': {'filter_overlong_prompts': True, (TaskRunner pid=54845) 'gen_batch_size': 8, (TaskRunner pid=54845) 'image_key': 'images', (TaskRunner pid=54845) 'max_prompt_length': 2048, (TaskRunner pid=54845) 'max_response_length': 1024, (TaskRunner pid=54845) 'prompt_key': 'prompt', (TaskRunner pid=54845) 'return_raw_chat': False, (TaskRunner pid=54845) 'return_raw_input_ids': False, (TaskRunner pid=54845) 'reward_fn_key': 'data_source', (TaskRunner pid=54845) 'shuffle': True, (TaskRunner pid=54845) 'tokenizer': None, (TaskRunner pid=54845) 'train_batch_size': 8, (TaskRunner pid=54845) 'train_files': '/data/data/python/train.parquet', (TaskRunner pid=54845) 'truncation': 'error', (TaskRunner pid=54845) 'val_batch_size': 128, (TaskRunner pid=54845) 'val_files': '/data/data/python/val.parquet'}, (TaskRunner pid=54845) 'reward_model': {'enable': False, (TaskRunner pid=54845) 'forward_max_token_len_per_gpu': 32768, (TaskRunner pid=54845) 'max_length': None, (TaskRunner pid=54845) 'micro_batch_size': None, (TaskRunner pid=54845) 'micro_batch_size_per_gpu': None, (TaskRunner pid=54845) 'model': {'external_lib': None, (TaskRunner pid=54845) 'fsdp_config': {'fsdp_size': -1, (TaskRunner pid=54845) 'param_offload': False, (TaskRunner pid=54845) 'wrap_policy': {'min_num_params': 0}}, (TaskRunner pid=54845) 'input_tokenizer': '/data/ckpt/checkpoint-570', (TaskRunner pid=54845) 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', (TaskRunner pid=54845) 'use_remove_padding': False}, (TaskRunner pid=54845) 'reward_manager': 'prime', (TaskRunner pid=54845) 'strategy': 'fsdp', (TaskRunner pid=54845) 'ulysses_sequence_parallel_size': 1, (TaskRunner pid=54845) 'use_dynamic_bsz': False}, (TaskRunner pid=54845) 'trainer': {'balance_batch': True, (TaskRunner pid=54845) 'critic_warmup': 0, (TaskRunner pid=54845) 'default_hdfs_dir': None, (TaskRunner pid=54845) 'default_local_dir': '/data/save/python', (TaskRunner pid=54845) 'del_local_ckpt_after_load': False, (TaskRunner pid=54845) 'experiment_name': 'dapo-qwen35-9b-python-full', (TaskRunner pid=54845) 'logger': ['console', 'wandb'], (TaskRunner pid=54845) 'n_gpus_per_node': 8, (TaskRunner pid=54845) 'nnodes': 1, (TaskRunner pid=54845) 'project_name': 'codev-r1-qwen35-9b-full', (TaskRunner pid=54845) 'remove_previous_ckpt_in_save': False, (TaskRunner pid=54845) 'resume_from_path': False, (TaskRunner pid=54845) 'resume_mode': 'auto', (TaskRunner pid=54845) 'save_freq': 30, (TaskRunner pid=54845) 'test_freq': 999999, (TaskRunner pid=54845) 'total_epochs': 20, (TaskRunner pid=54845) 'total_training_steps': 200, (TaskRunner pid=54845) 'val_before_train': False, (TaskRunner pid=54845) 'val_generations_to_log_to_wandb': 0}} (TaskRunner pid=54845) using customized reward function 'compute_score_wrapper' from 'verl/utils/reward_score/codev_py.py' (TaskRunner pid=54845) using customized reward function 'compute_score_wrapper' from 'verl/utils/reward_score/codev_py.py' (TaskRunner pid=54845) (TaskRunner pid=54845) id of compute score functions: 140204436607904 140204436609024 (TaskRunner pid=54845) WARNING: val_batch_size is deprecated. Validation datasets are sent to inference engines as a whole batch, which will schedule the memory themselves.(TaskRunner pid=54845) DeprecationWarning: `ray.state.available_resources_per_node` is a private attribute and access will be removed in a future Ray version. (WorkerDict pid=56485) Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen3_5ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)` (WorkerDict pid=56485) Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen3_5TextModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)` (WorkerDict pid=56485) The fast path is not available because one of the required library is not installed. Falling back to torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and https://github.com/Dao-AILab/causal-conv1d (WorkerDict pid=56485) Loading weights: 0%| | 0/427 [00:00 (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.bm_selector = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) RESET_L = inputs['RESET_L'] & 1 (TaskRunner pid=54845) bm_a = inputs['bm_a'] & 1 (TaskRunner pid=54845) bm_b = inputs['bm_b'] & 1 (TaskRunner pid=54845) (TaskRunner pid=54845) if RESET_L == 0: (TaskRunner pid=54845) self.bm_selector = 0 (TaskRunner pid=54845) return {'bm_selector': 0, 'bm_distintos': 0} (TaskRunner pid=54845) (TaskRunner pid=54845) if bm_a == bm_b: (TaskRunner pid=54845) bm_sel_next = self.bm_selector (TaskRunner pid=54845) bm_dist = 0 (TaskRunner pid=54845) else: (TaskRunner pid=54845) bm_dist = 1 (TaskRunner pid=54845) bm_sel_next = bm_a (TaskRunner pid=54845) (TaskRunner pid=54845) self.bm_selector = bm_sel_next (TaskRunner pid=54845) return {'bm_selector': bm_sel_next, 'bm_distintos': bm_dist} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:31 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:3108.000 - global_seqlen/max:4386.000 - global_seqlen/minmax_diff:1278.000 - global_seqlen/balanced_min:3713.000 - global_seqlen/balanced_max:4056.000 - global_seqlen/mean:3766.625 - actor/entropy:0.000 - actor/pg_loss:0.186 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:8.232 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.000 - perf/cpu_memory_used_gb:230.871 - actor/lr:0.000 - critic/score/mean:-0.270 - critic/score/max:-0.142 - critic/score/min:-0.966 - critic/rewards/mean:-0.270 - critic/rewards/max:-0.142 - critic/rewards/min:-0.966 - critic/advantages/mean:-0.238 - critic/advantages/max:1.494 - critic/advantages/min:-1.498 - critic/returns/mean:-0.238 - critic/returns/max:1.494 - critic/returns/min:-1.498 - response_length/mean:276.781 - response_length/max:989.000 - response_length/min:145.000 - response_length/clip_ratio:0.000 - prompt_length/mean:664.875 - prompt_length/max:779.000 - prompt_length/min:557.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:49.210 - timing_s/reward:1.007 - timing_s/old_log_prob:2.816 - timing_s/adv:0.002 - timing_s/update_actor:18.461 - timing_s/step:71.503 - timing_per_token_ms/gen:5.556 - timing_per_token_ms/update_actor:0.613 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:30133.000 - perf/time_per_step:71.503 - perf/throughput:52.678 - reflection/any_word_frequency:15.000 - reflection/with_length_mean:422.800 - reflection/without_length_mean:249.741 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.413 - reflection/without_reward_mean:-0.244 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:276.781 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.270 - reflection_check/word_check_frequency:5.000 - reflection_check/with_check_length_mean:422.800 - reflection_check/without_check_length_mean:249.741 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.413 - reflection_check/without_check_reward_mean:-0.244 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:276.781 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.270 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:276.781 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.270 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:276.781 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.270 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:276.781 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.270 - reflection_correct/word_correct_frequency:2.000 - reflection_correct/with_correct_length_mean:989.000 - reflection_correct/without_correct_length_mean:253.806 - reflection_correct/with_correct_correct_ratio:0.000 - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:-0.966 - reflection_correct/without_correct_reward_mean:-0.248 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:276.781 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.270 - reflection_adjust/word_adjust_frequency:8.000 - reflection_adjust/with_adjust_length_mean:989.000 - reflection_adjust/without_adjust_length_mean:253.806 - reflection_adjust/with_adjust_correct_ratio:0.000 - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:-0.966 - reflection_adjust/without_adjust_reward_mean:-0.248 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:276.781 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.270 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:276.781 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.270 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:276.781 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.270 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 16%|█▌ | 31/200 [01:11<3:22:03, 71.73s/it] (WorkerDict pid=56490) Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility. [repeated 7x across cluster] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 16%|█▌ | 32/200 [02:17<3:11:31, 68.40s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) GOST S-box lookup: divide 64-bit sbox into 16 x 4-bit chunks. Select chunk by 4-bit index. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) sbox = inputs['sbox'] & ((1 << 64) - 1) (TaskRunner pid=54845) idx = inputs['in'] & 0xF (TaskRunner pid=54845) out = (sbox >> (idx * 4)) & 0xF (TaskRunner pid=54845) return {'out': out} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:32 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2907.000 - global_seqlen/max:5345.000 - global_seqlen/minmax_diff:2438.000 - global_seqlen/balanced_min:3931.000 - global_seqlen/balanced_max:4032.000 - global_seqlen/mean:3989.625 - actor/entropy:0.000 - actor/pg_loss:-0.301 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.755 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.062 - perf/cpu_memory_used_gb:231.059 - actor/lr:0.000 - critic/score/mean:-0.344 - critic/score/max:-0.131 - critic/score/min:-0.873 - critic/rewards/mean:-0.344 - critic/rewards/max:-0.131 - critic/rewards/min:-0.873 - critic/advantages/mean:-0.132 - critic/advantages/max:1.432 - critic/advantages/min:-1.454 - critic/returns/mean:-0.132 - critic/returns/max:1.432 - critic/returns/min:-1.454 - response_length/mean:352.281 - response_length/max:894.000 - response_length/min:134.000 - response_length/clip_ratio:0.000 - prompt_length/mean:645.125 - prompt_length/max:872.000 - prompt_length/min:517.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:45.537 - timing_s/reward:0.986 - timing_s/old_log_prob:2.336 - timing_s/adv:0.001 - timing_s/update_actor:17.166 - timing_s/step:66.030 - timing_per_token_ms/gen:4.039 - timing_per_token_ms/update_actor:0.538 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:31917.000 - perf/time_per_step:66.030 - perf/throughput:60.421 - reflection/any_word_frequency:9.000 - reflection/with_length_mean:374.200 - reflection/without_length_mean:348.222 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.365 - reflection/without_reward_mean:-0.340 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:352.281 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.344 - reflection_check/word_check_frequency:8.000 - reflection_check/with_check_length_mean:353.500 - reflection_check/without_check_length_mean:352.107 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.345 - reflection_check/without_check_reward_mean:-0.344 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:352.281 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.344 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:352.281 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.344 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:352.281 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.344 - reflection_wait/word_wait_frequency:1.000 - reflection_wait/with_wait_length_mean:457.000 - reflection_wait/without_wait_length_mean:348.903 - reflection_wait/with_wait_correct_ratio:0.000 - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:-0.446 - reflection_wait/without_wait_reward_mean:-0.341 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:352.281 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.344 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:352.281 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.344 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:352.281 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.344 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:352.281 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.344 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:352.281 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.344 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:352.281 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.344 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 1. Extract 16 bytes from the 128-bit input: byte0–byte15. (TaskRunner pid=54845) 2. Apply inverse shift rows: (TaskRunner pid=54845) - Row 0 (bytes 0,4,8,12): no shift. (TaskRunner pid=54845) - Row 1 (bytes 1,5,9,13): rotate right by 1 → positions 13,0,1,5,9,13. (TaskRunner pid=54845) - Row 2 (bytes 2,6,10,14): rotate right by 2 → positions 14,15,2,6,10,14. (TaskRunner pid=54845) - Row 3 (bytes 3,7,11,15): rotate right by 3 → positions 15,0,1,3,7,11,15. (TaskRunner pid=54845) 3. Assemble the 16 result bytes into a 128-bit integer (big-endian). (TaskRunner pid=54845) 4. No state needed; this is a pure combinational transformation. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 16%|█▋ | 33/200 [03:09<2:48:48, 60.65s/it] (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) x = inputs['in'] & 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF (TaskRunner pid=54845) b = [(x >> (120 - 8*i)) & 0xFF for i in range(16)] (TaskRunner pid=54845) # Inverse shift rows (TaskRunner pid=54845) r0 = [b[0], b[4], b[8], b[12]] (TaskRunner pid=54845) r1 = [b[13], b[1], b[5], b[9]] (TaskRunner pid=54845) r2 = [b[14], b[15], b[2], b[6]] (TaskRunner pid=54845) r3 = [b[15], b[3], b[7], b[11]] (TaskRunner pid=54845) result = [r0[0], r1[0], r2[0], r3[0], (TaskRunner pid=54845) r0[1], r1[1], r2[1], r3[1], (TaskRunner pid=54845) r0[2], r1[2], r2[2], r3[2], (TaskRunner pid=54845) r0[3], r1[3], r2[3], r3[3]] (TaskRunner pid=54845) out = 0 (TaskRunner pid=54845) for i, val in enumerate(result): (TaskRunner pid=54845) out |= (val & 0xFF) << (120 - 8*i) (TaskRunner pid=54845) return {'o_shifted': out & 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:33 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:3112.000 - global_seqlen/max:5594.000 - global_seqlen/minmax_diff:2482.000 - global_seqlen/balanced_min:4233.000 - global_seqlen/balanced_max:4284.000 - global_seqlen/mean:4261.250 - actor/entropy:0.000 - actor/pg_loss:0.325 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.625 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.103 - actor/lr:0.000 - critic/score/mean:-0.328 - critic/score/max:-0.175 - critic/score/min:-0.566 - critic/rewards/mean:-0.328 - critic/rewards/max:-0.175 - critic/rewards/min:-0.566 - critic/advantages/mean:-0.099 - critic/advantages/max:1.366 - critic/advantages/min:-1.466 - critic/returns/mean:-0.099 - critic/returns/max:1.366 - critic/returns/min:-1.466 - response_length/mean:335.812 - response_length/max:580.000 - response_length/min:179.000 - response_length/clip_ratio:0.000 - prompt_length/mean:729.500 - prompt_length/max:1178.000 - prompt_length/min:557.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:30.928 - timing_s/reward:1.012 - timing_s/old_log_prob:2.322 - timing_s/adv:0.001 - timing_s/update_actor:17.140 - timing_s/step:51.406 - timing_per_token_ms/gen:2.878 - timing_per_token_ms/update_actor:0.503 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:34090.000 - perf/time_per_step:51.406 - perf/throughput:82.893 - reflection/any_word_frequency:9.000 - reflection/with_length_mean:434.750 - reflection/without_length_mean:321.679 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.425 - reflection/without_reward_mean:-0.314 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:335.812 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.328 - reflection_check/word_check_frequency:9.000 - reflection_check/with_check_length_mean:434.750 - reflection_check/without_check_length_mean:321.679 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.425 - reflection_check/without_check_reward_mean:-0.314 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:335.812 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.328 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:335.812 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.328 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:335.812 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.328 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:335.812 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.328 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:335.812 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.328 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:335.812 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.328 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:335.812 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.328 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:335.812 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.328 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:335.812 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.328 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:335.812 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.328 - language_mix/frequency:1.000 - language_mix/ratio:0.031 - language_mix/with_length_mean:373.000 - language_mix/without_length_mean:334.613 - language_mix/with_correct_ratio:0.000 - language_mix/without_correct_ratio:0.000 - language_mix/with_reward_mean:-0.364 - language_mix/without_reward_mean:-0.327 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) RX_CHECK: error-checking receiver output mux. (TaskRunner pid=54845) - If rst=0: DATA_VALID=0, P_DATA_OUT=0.(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 17%|█▋ | 34/200 [04:23<3:02:46, 66.06s/it] (TaskRunner pid=54845) - Else if RX_CHECK_EN=1: check parity_error and stop_error. If either=1, DATA_VALID=0, P_DATA_OUT=0. Otherwise DATA_VALID=1, P_DATA_OUT=P_DATA_REG. (TaskRunner pid=54845) - Else: hold previous valid/data (latched outputs). (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.DATA_VALID = 0 (TaskRunner pid=54845) self.P_DATA_OUT = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) if not (inputs['rst'] & 1): (TaskRunner pid=54845) self.DATA_VALID = 0 (TaskRunner pid=54845) self.P_DATA_OUT = 0 (TaskRunner pid=54845) return {'DATA_VALID': 0, 'P_DATA_OUT': 0} (TaskRunner pid=54845) (TaskRunner pid=54845) if inputs['RX_CHECK_EN']: (TaskRunner pid=54845) if inputs['parity_error'] or inputs['stop_error']: (TaskRunner pid=54845) self.DATA_VALID = 0 (TaskRunner pid=54845) self.P_DATA_OUT = 0 (TaskRunner pid=54845) else: (TaskRunner pid=54845) self.DATA_VALID = 1 (TaskRunner pid=54845) self.P_DATA_OUT = inputs['P_DATA_REG'] & ((1 << 8) - 1) (TaskRunner pid=54845) return {'DATA_VALID': self.DATA_VALID, 'P_DATA_OUT': self.P_DATA_OUT} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:34 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2792.000 - global_seqlen/max:6134.000 - global_seqlen/minmax_diff:3342.000 - global_seqlen/balanced_min:4171.000 - global_seqlen/balanced_max:4254.000 - global_seqlen/mean:4210.000 - actor/entropy:0.000 - actor/pg_loss:-0.187 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:5.725 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.171 - actor/lr:0.000 - critic/score/mean:-0.392 - critic/score/max:-0.137 - critic/score/min:-1.000 - critic/rewards/mean:-0.392 - critic/rewards/max:-0.137 - critic/rewards/min:-1.000 - critic/advantages/mean:-0.196 - critic/advantages/max:1.245 - critic/advantages/min:-1.497 - critic/returns/mean:-0.196 - critic/returns/max:1.245 - critic/returns/min:-1.497 - response_length/mean:401.750 - response_length/max:1024.000 - response_length/min:140.000 - response_length/clip_ratio:0.094 - prompt_length/mean:650.750 - prompt_length/max:946.000 - prompt_length/min:547.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:53.495 - timing_s/reward:0.986 - timing_s/old_log_prob:2.341 - timing_s/adv:0.001 - timing_s/update_actor:17.508 - timing_s/step:74.334 - timing_per_token_ms/gen:4.161 - timing_per_token_ms/update_actor:0.520 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:33680.000 - perf/time_per_step:74.334 - perf/throughput:56.636 - reflection/any_word_frequency:18.000 - reflection/with_length_mean:426.750 - reflection/without_length_mean:393.417 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.417 - reflection/without_reward_mean:-0.384 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:401.750 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.392 - reflection_check/word_check_frequency:16.000 - reflection_check/with_check_length_mean:396.500 - reflection_check/without_check_length_mean:402.962 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.387 - reflection_check/without_check_reward_mean:-0.394 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:401.750 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.392 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:401.750 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.392 - reflection_reflect/word_reflect_frequency:2.000 - reflection_reflect/with_reflect_length_mean:517.500 - reflection_reflect/without_reflect_length_mean:394.033 - reflection_reflect/with_reflect_correct_ratio:0.000 - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:-0.505 - reflection_reflect/without_reflect_reward_mean:-0.385 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:401.750 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.392 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:401.750 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.392 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:401.750 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.392 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:401.750 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.392 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:401.750 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.392 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:401.750 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.392 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:401.750 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.392 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 16-bit D-latch array. Each bit latches on clock high. Simple register: data_o = data_i when clk=1.(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 18%|█▊ | 35/200 [05:08<2:40:50, 58.49s/it] (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.data_o = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) if inputs['clk_i'] == 1: (TaskRunner pid=54845) self.data_o = inputs['data_i'] & 0xFFFF (TaskRunner pid=54845) return {'data_o': self.data_o} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:35 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2507.000 - global_seqlen/max:4091.000 - global_seqlen/minmax_diff:1584.000 - global_seqlen/balanced_min:3456.000 - global_seqlen/balanced_max:3477.000 - global_seqlen/mean:3463.000 - actor/entropy:0.000 - actor/pg_loss:-0.290 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:7.206 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.181 - actor/lr:0.000 - critic/score/mean:-0.218 - critic/score/max:-0.095 - critic/score/min:-0.434 - critic/rewards/mean:-0.218 - critic/rewards/max:-0.095 - critic/rewards/min:-0.434 - critic/advantages/mean:-0.124 - critic/advantages/max:1.378 - critic/advantages/min:-1.334 - critic/returns/mean:-0.124 - critic/returns/max:1.378 - critic/returns/min:-1.334 - response_length/mean:223.625 - response_length/max:444.000 - response_length/min:97.000 - response_length/clip_ratio:0.000 - prompt_length/mean:642.125 - prompt_length/max:749.000 - prompt_length/min:522.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:24.652 - timing_s/reward:1.005 - timing_s/old_log_prob:2.335 - timing_s/adv:0.001 - timing_s/update_actor:17.041 - timing_s/step:45.037 - timing_per_token_ms/gen:3.445 - timing_per_token_ms/update_actor:0.615 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27704.000 - perf/time_per_step:45.037 - perf/throughput:76.893 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:223.625 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.218 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:223.625 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.218 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:223.625 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.218 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:223.625 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.218 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:223.625 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.218 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:223.625 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.218 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:223.625 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.218 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:223.625 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.218 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:223.625 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.218 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:223.625 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.218 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:223.625 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.218 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:223.625 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.218 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Seven-segment decoder. Active-low segments (0=lit). Map 4-bit input to 7-bit pattern. (TaskRunner pid=54845) Standard segment mapping: out[0]=a, out[1]=b, out[2]=c, out[3]=d, out[4]=e, out[5]=f, out[6]=g. (TaskRunner pid=54845) 0=0x3F (0b00111111), 1=0x06, 2=0x5B, 3=0x4F, 4=0x66, 5=0x6D, 6=0x7D, 7=0x07, (TaskRunner pid=54845) 8=0x7F, 9=0x6F, A=0x77, b=0x7C, C=0x39, d=0x5E, E=0x79, F=0x71. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.table = [0x3F, 0x06, 0x5B, 0x4F, 0x66, 0x6D, 0x7D, 0x07, (TaskRunner pid=54845) 0x7F, 0x6F, 0x77, 0x7C, 0x39, 0x5E, 0x79, 0x71] (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) val = inputs['in'] & 0xF (TaskRunner pid=54845) out = self.table[val] if val <= 15 else 0x7F (TaskRunner pid=54845) return {'out': out & 0x7F} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:36 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2979.000 - global_seqlen/max:4522.000 - global_seqlen/minmax_diff:1543.000 - global_seqlen/balanced_min:3730.000 - global_seqlen/balanced_max:3761.000 - global_seqlen/mean:3748.500 - actor/entropy:0.000 - actor/pg_loss:0.300 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:8.247 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.194 - actor/lr:0.000 - critic/score/mean:-0.299 - critic/score/max:-0.103 - critic/score/min:-0.564 - critic/rewards/mean:-0.299 - critic/rewards/max:-0.103 - critic/rewards/min:-0.564 - critic/advantages/mean:-0.179 - critic/advantages/max:1.478 - critic/advantages/min:-1.488 - critic/returns/mean:-0.179 - critic/returns/max:1.478 - critic/returns/min:-1.488 - response_length/mean:306.250 - response_length/max:578.000 - response_length/min:105.000 - response_length/clip_ratio:0.000 - prompt_length/mean:630.875 - prompt_length/max:763.000 - prompt_length/min:550.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:31.418 - timing_s/reward:0.971 - timing_s/old_log_prob:2.323 - timing_s/adv:0.001 - timing_s/update_actor:17.079 - timing_s/step:51.795 - timing_per_token_ms/gen:3.206 - timing_per_token_ms/update_actor:0.570 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:29988.000 - perf/time_per_step:51.795 - perf/throughput:72.372 - reflection/any_word_frequency:15.000 - reflection/with_length_mean:310.500 - reflection/without_length_mean:304.833 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.303 - reflection/without_reward_mean:-0.298 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:306.250 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.299 - reflection_check/word_check_frequency:4.000 - reflection_check/with_check_length_mean:324.667 - reflection_check/without_check_length_mean:304.345 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.317 - reflection_check/without_check_reward_mean:-0.297 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:306.250 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.299 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:306.250 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.299 - reflection_reflect/word_reflect_frequency:1.000 - reflection_reflect/with_reflect_length_mean:284.000 - reflection_reflect/without_reflect_length_mean:306.968 - reflection_reflect/with_reflect_correct_ratio:0.000 - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:-0.277 - reflection_reflect/without_reflect_reward_mean:-0.300 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:306.250 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.299 - reflection_correct/word_correct_frequency:10.000 - reflection_correct/with_correct_length_mean:306.500 - reflection_correct/without_correct_length_mean:306.214 - reflection_correct/with_correct_correct_ratio:0.000 - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:-0.299 - reflection_correct/without_correct_reward_mean:-0.299 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:306.250 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.299 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:306.250 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.299 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:306.250 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.299 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:306.250 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.299 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:306.250 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.299 - language_mix/frequency:1.000 - language_mix/ratio:0.031 - language_mix/with_length_mean:229.000 - language_mix/without_length_mean:308.742 - language_mix/with_correct_ratio:0.000 - language_mix/without_correct_ratio:0.000 - language_mix/with_reward_mean:-0.224 - language_mix/without_reward_mean:-0.302 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 18%|█▊ | 36/200 [06:00<2:33:40, 56.22s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) intuition: (TaskRunner pid=54845) 1. This is an ALU with 6 operations selected by a 3-bit control signal. (TaskRunner pid=54845) 2. We need to implement bitwise AND, OR, addition, subtraction, multiplication, and signed comparison. (TaskRunner pid=54845) 3. The zero_flag should be set when the result is zero. (TaskRunner pid=54845) 4. All operations should be masked to WIDTH bits (64 bits based on the reference). (TaskRunner pid=54845) 5. For comparison, we need to treat inputs as signed integers. (TaskRunner pid=54845) (TaskRunner pid=54845) Approach: (TaskRunner pid=54845) 1. Define WIDTH = 64 as a constant. (TaskRunner pid=54845) 2. In eval(), extract scrA, scrB, and ALU_Control from inputs with proper masking. (TaskRunner pid=54845) 3. Use if-elif chain to decode ALU_Control and perform the appropriate operation. (TaskRunner pid=54845) 4. For signed comparison, use Python's built-in < operator on signed integers. (TaskRunner pid=54845) 5. Mask all results to WIDTH bits using & ((1 << WIDTH) - 1). (TaskRunner pid=54845) 6. Set zero_flag to 1 if ALU_RESULT is 0, else 0. (TaskRunner pid=54845) (TaskRunner pid=54845) Key implementation details: (TaskRunner pid=54845) - scrA and scrB are masked to 64 bits: & 0xFFFFFFFFFFFFFFFF (TaskRunner pid=54845) - ALU_Control is masked to 3 bits: & 0x7 (TaskRunner pid=54845) - For signed comparison, we need to interpret the 64-bit values as signed. Python's integers are arbitrary precision, so we can convert: if value >= 2**63, subtract 2**64 to get the signed value. (TaskRunner pid=54845) - All arithmetic results are masked to 64 bits. (TaskRunner pid=54845) - The zero_flag is computed after the result is determined. (TaskRunner pid=54845) (TaskRunner pid=54845) Edge cases: (TaskRunner pid=54845) - When ALU_Control is not one of the defined values (000, 001, 010, 100, 101, 110), the result should be 0. (TaskRunner pid=54845) - The comparison operation returns 1 if scrA < scrB (as signed), else 0. (TaskRunner pid=54845) - Overflow in addition/subtraction is handled by masking. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.WIDTH = 64 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) scrA = inputs['scrA'] & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) scrB = inputs['scrB'] & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) alu_ctrl = inputs['ALU_Control'] & 0x7 (TaskRunner pid=54845) (TaskRunner pid=54845) if alu_ctrl == 0: # AND (TaskRunner pid=54845) result = scrA & scrB (TaskRunner pid=54845) elif alu_ctrl == 1: # OR (TaskRunner pid=54845) result = scrA | scrB (TaskRunner pid=54845) elif alu_ctrl == 2: # ADD (TaskRunner pid=54845) result = (scrA + scrB) & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) elif alu_ctrl == 4: # SUB (TaskRunner pid=54845) result = (scrA - scrB) & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) elif alu_ctrl == 5: # MUL (TaskRunner pid=54845) result = (scrA * scrB) & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) elif alu_ctrl == 6: # SLT (signed) (TaskRunner pid=54845) a_signed = scrA if scrA < (1 << (self.WIDTH - 1)) else scrA - (1 << self.WIDTH) (TaskRunner pid=54845) b_signed = scrB if scrB < (1 << (self.WIDTH - 1)) else scrB - (1 << self.WIDTH) (TaskRunner pid=54845) result = 1 if a_signed < b_signed else 0 (TaskRunner pid=54845) else: (TaskRunner pid=54845) result = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) result = result & ((1 << self.WIDTH) - 1) (TaskRunner pid=54845) zero_flag = 1 if result == 0 else 0 (TaskRunner pid=54845) (TaskRunner pid=54845) return {'ALU_RESULT': result, 'zero_flag': zero_flag} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) (TaskRunner pid=54845) step:37 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2412.000 - global_seqlen/max:5250.000 - global_seqlen/minmax_diff:2838.000 - global_seqlen/balanced_min:3894.000 - global_seqlen/balanced_max:3954.000 - global_seqlen/mean:3926.000 - actor/entropy:0.000 - actor/pg_loss:-0.129 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.944 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.216 - actor/lr:0.000 - critic/score/mean:-0.315 - critic/score/max:-0.097 - critic/score/min:-0.822 - critic/rewards/mean:-0.315 - critic/rewards/max:-0.097 - critic/rewards/min:-0.822 - critic/advantages/mean:-0.210 - critic/advantages/max:1.371 - critic/advantages/min:-1.474 - critic/returns/mean:-0.210 - critic/returns/max:1.371 - critic/returns/min:-1.474 - response_length/mean:322.500 - response_length/max:842.000 - response_length/min:99.000 - response_length/clip_ratio:0.000 - prompt_length/mean:659.000 - prompt_length/max:858.000 - prompt_length/min:487.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:43.557 - timing_s/reward:1.023 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:17.686 - timing_s/step:64.654 - timing_per_token_ms/gen:4.221 - timing_per_token_ms/update_actor:0.563 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:31408.000 - perf/time_per_step:64.654 - perf/throughput:60.724 - reflection/any_word_frequency:2.000 - reflection/with_length_mean:437.500 - reflection/without_length_mean:314.833 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.427 - reflection/without_reward_mean:-0.307 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:322.500 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.315 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:158.000 - reflection_check/without_check_length_mean:327.806 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.154 - reflection_check/without_check_reward_mean:-0.320 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:322.500 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.315 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:322.500 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.315 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:322.500 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.315 - reflection_wait/word_wait_frequency:1.000 - reflection_wait/with_wait_length_mean:717.000 - reflection_wait/without_wait_length_mean:309.774 - reflection_wait/with_wait_correct_ratio:0.000 - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:-0.700 - reflection_wait/without_wait_reward_mean:-0.303 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:322.500 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.315 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:322.500 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.315 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:322.500 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.315 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:322.500 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.315 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:322.500 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.315 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:322.500 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.315 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 18%|█▊ | 37/200 [07:05<2:40:14, 58.98s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 19%|█▉ | 38/200 [07:59<2:35:20, 57.53s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8-bit ripple-carry adder. Sum = (a+b) & 0xFF. cout = (a+b) >> 8. Overflow = sign(a) != sign(b) and (sign(sum) != sign(a)). (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) a = inputs['a'] & 0xFF (TaskRunner pid=54845) b = inputs['b'] & 0xFF (TaskRunner pid=54845) total = a + b (TaskRunner pid=54845) s = total & 0xFF (TaskRunner pid=54845) cout = (total >> 8) & 1 (TaskRunner pid=54845) flow = ((a ^ s) & (b ^ s)) & 1 (TaskRunner pid=54845) return {'sum': s, 'cout': cout, 'flow': flow} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:38 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2791.000 - global_seqlen/max:5089.000 - global_seqlen/minmax_diff:2298.000 - global_seqlen/balanced_min:3924.000 - global_seqlen/balanced_max:3983.000 - global_seqlen/mean:3951.875 - actor/entropy:0.000 - actor/pg_loss:-0.056 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.154 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.212 - actor/lr:0.000 - critic/score/mean:-0.301 - critic/score/max:-0.120 - critic/score/min:-0.623 - critic/rewards/mean:-0.301 - critic/rewards/max:-0.120 - critic/rewards/min:-0.623 - critic/advantages/mean:-0.091 - critic/advantages/max:1.498 - critic/advantages/min:-1.499 - critic/returns/mean:-0.091 - critic/returns/max:1.498 - critic/returns/min:-1.499 - response_length/mean:308.469 - response_length/max:638.000 - response_length/min:123.000 - response_length/clip_ratio:0.000 - prompt_length/mean:679.500 - prompt_length/max:820.000 - prompt_length/min:570.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:34.109 - timing_s/reward:0.991 - timing_s/old_log_prob:2.375 - timing_s/adv:0.001 - timing_s/update_actor:16.922 - timing_s/step:54.402 - timing_per_token_ms/gen:3.455 - timing_per_token_ms/update_actor:0.535 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:31615.000 - perf/time_per_step:54.402 - perf/throughput:72.642 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:308.469 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.301 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:308.469 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.301 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:308.469 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.301 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:308.469 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.301 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:308.469 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.301 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:308.469 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.301 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:308.469 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.301 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:308.469 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.301 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:308.469 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.301 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:308.469 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.301 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:308.469 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.301 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:308.469 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.301 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 64 registers x 32 bits. Read two regs combinatorially. Write on clock edge based on write_reg_ctl: 2'=2 -> reg1addr, 2'=3 -> reg2addr, 2'=1 -> rd=6'b011111=31. Reset all to 0. Also output regs 1-10 continuously. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.regs = [0] * 64 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) if inputs['rst']: (TaskRunner pid=54845) self.regs = [0] * 64 (TaskRunner pid=54845) return {'reg1out':0,'reg2out':0,'reg_show1':0,'reg_show2':0,'reg_show3':0, (TaskRunner pid=54845) 'reg_show4':0,'reg_show5':0,'reg_show6':0,'reg_show7':0,'reg_show8':0,(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 20%|█▉ | 39/200 [08:58<2:35:31, 57.96s/it] (TaskRunner pid=54845) 'reg_show9':0,'reg_show10':0} (TaskRunner pid=54845) (TaskRunner pid=54845) r1 = self.regs[inputs['reg1addr'] & 0x3F] (TaskRunner pid=54845) r2 = self.regs[inputs['reg2addr'] & 0x3F] (TaskRunner pid=54845) (TaskRunner pid=54845) if inputs['write_reg_ctl'] == 2: (TaskRunner pid=54845) self.regs[inputs['reg1addr'] & 0x3F] = inputs['write_data'] & 0xFFFFFFFF (TaskRunner pid=54845) elif inputs['write_reg_ctl'] == 3: (TaskRunner pid=54845) self.regs[inputs['reg2addr'] & 0x3F] = inputs['write_data'] & 0xFFFFFFFF (TaskRunner pid=54845) elif inputs['write_reg_ctl'] == 1: (TaskRunner pid=54845) self.regs[31] = inputs['write_data'] & 0xFFFFFFFF (TaskRunner pid=54845) (TaskRunner pid=54845) return {'reg1out':r1,'reg2out':r2,'reg_show1':self.regs[1], (TaskRunner pid=54845) 'reg_show2':self.regs[2],'reg_show3':self.regs[3], (TaskRunner pid=54845) 'reg_show4':self.regs[4],'reg_show5':self.regs[5],'reg_show6':self.regs[6], (TaskRunner pid=54845) 'reg_show7':self.regs[7],'reg_show8':self.regs[8],'reg_show9':self.regs[9], (TaskRunner pid=54845) 'reg_show10':self.regs[10]} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:39 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2657.000 - global_seqlen/max:6515.000 - global_seqlen/minmax_diff:3858.000 - global_seqlen/balanced_min:4269.000 - global_seqlen/balanced_max:4300.000 - global_seqlen/mean:4283.375 - actor/entropy:0.000 - actor/pg_loss:-0.503 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.260 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.239 - actor/lr:0.000 - critic/score/mean:-0.299 - critic/score/max:-0.103 - critic/score/min:-0.719 - critic/rewards/mean:-0.299 - critic/rewards/max:-0.103 - critic/rewards/min:-0.719 - critic/advantages/mean:-0.037 - critic/advantages/max:1.398 - critic/advantages/min:-1.477 - critic/returns/mean:-0.037 - critic/returns/max:1.398 - critic/returns/min:-1.477 - response_length/mean:306.344 - response_length/max:736.000 - response_length/min:105.000 - response_length/clip_ratio:0.000 - prompt_length/mean:764.500 - prompt_length/max:1002.000 - prompt_length/min:540.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:38.575 - timing_s/reward:0.969 - timing_s/old_log_prob:2.376 - timing_s/adv:0.001 - timing_s/update_actor:16.941 - timing_s/step:58.865 - timing_per_token_ms/gen:3.935 - timing_per_token_ms/update_actor:0.494 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:34267.000 - perf/time_per_step:58.865 - perf/throughput:72.766 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:306.344 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.299 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:306.344 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.299 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:306.344 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.299 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:306.344 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.299 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:306.344 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.299 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:306.344 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.299 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:306.344 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.299 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:306.344 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.299 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:306.344 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.299 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:306.344 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.299 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:306.344 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.299 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:306.344 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.299 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 3-to-8 decoder: compound enable = G1 & ~G2AN & ~G2BN. When enabled, one active-low output goes 0 based on 3-bit address. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) a = inputs['A'] & 1 (TaskRunner pid=54845) b = inputs['B'] & 1(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 20%|██ | 40/200 [10:11<2:46:43, 62.52s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) c = inputs['C'] & 1 (TaskRunner pid=54845) g1 = inputs['G1'] & 1 (TaskRunner pid=54845) g2an = inputs['G2AN'] & 1 (TaskRunner pid=54845) g2bn = inputs['G2BN'] & 1 (TaskRunner pid=54845) if g1 and not g2an and not g2bn: (TaskRunner pid=54845) idx = (c << 2) | (b << 1) | a (TaskRunner pid=54845) outs = [0] * 8 (TaskRunner pid=54845) outs[idx] = 0 (TaskRunner pid=54845) return {f'Y{i}N': outs[i] for i in range(8)} (TaskRunner pid=54845) return {f'Y{i}N': 1 for i in range(8)} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:40 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2594.000 - global_seqlen/max:5843.000 - global_seqlen/minmax_diff:3249.000 - global_seqlen/balanced_min:3583.000 - global_seqlen/balanced_max:4046.000 - global_seqlen/mean:3699.000 - actor/entropy:0.000 - actor/pg_loss:-0.101 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:8.284 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.242 - actor/lr:0.000 - critic/score/mean:-0.288 - critic/score/max:-0.101 - critic/score/min:-1.000 - critic/rewards/mean:-0.288 - critic/rewards/max:-0.101 - critic/rewards/min:-1.000 - critic/advantages/mean:-0.192 - critic/advantages/max:1.418 - critic/advantages/min:-1.485 - critic/returns/mean:-0.192 - critic/returns/max:1.418 - critic/returns/min:-1.485 - response_length/mean:295.250 - response_length/max:1024.000 - response_length/min:103.000 - response_length/clip_ratio:0.031 - prompt_length/mean:629.500 - prompt_length/max:835.000 - prompt_length/min:513.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:51.687 - timing_s/reward:1.032 - timing_s/old_log_prob:2.422 - timing_s/adv:0.001 - timing_s/update_actor:17.560 - timing_s/step:72.707 - timing_per_token_ms/gen:5.471 - timing_per_token_ms/update_actor:0.593 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:29592.000 - perf/time_per_step:72.707 - perf/throughput:50.876 - reflection/any_word_frequency:2.000 - reflection/with_length_mean:960.500 - reflection/without_length_mean:250.900 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.938 - reflection/without_reward_mean:-0.245 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:295.250 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.288 - reflection_check/word_check_frequency:2.000 - reflection_check/with_check_length_mean:960.500 - reflection_check/without_check_length_mean:250.900 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.938 - reflection_check/without_check_reward_mean:-0.245 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:295.250 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.288 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:295.250 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.288 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:295.250 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.288 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:295.250 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.288 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:295.250 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.288 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:295.250 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.288 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:295.250 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.288 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:295.250 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.288 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:295.250 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.288 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:295.250 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.288 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Debouncer: count 3 consecutive high cycles on clr before asserting output. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.count = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs: dict) -> dict: (TaskRunner pid=54845) clr = inputs['clr'] & 1 (TaskRunner pid=54845) if clr: (TaskRunner pid=54845) if self.count < 3: (TaskRunner pid=54845) self.count += 1 (TaskRunner pid=54845) else: (TaskRunner pid=54845) self.count = 0 (TaskRunner pid=54845) return {'debounced_clr': int(self.count >= 3)} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:41 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2459.000 - global_seqlen/max:4223.000 - global_seqlen/minmax_diff:1764.000 - global_seqlen/balanced_min:3110.000 - global_seqlen/balanced_max:3192.000 - global_seqlen/mean:3151.250 - actor/entropy:0.000 - actor/pg_loss:-0.118 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:9.181 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.250 - actor/lr:0.000 - critic/score/mean:-0.162 - critic/score/max:-0.103 - critic/score/min:-0.269 - critic/rewards/mean:-0.162 - critic/rewards/max:-0.103 - critic/rewards/min:-0.269 - critic/advantages/mean:-0.093 - critic/advantages/max:1.333 - critic/advantages/min:-1.465 - critic/returns/mean:-0.093 - critic/returns/max:1.333 - critic/returns/min:-1.465 - response_length/mean:165.812 - response_length/max:275.000 - response_length/min:105.000 - response_length/clip_ratio:0.000 - prompt_length/mean:622.000 - prompt_length/max:818.000 - prompt_length/min:503.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:17.399 - timing_s/reward:0.990 - timing_s/old_log_prob:2.379 - timing_s/adv:0.001 - timing_s/update_actor:17.071 - timing_s/step:37.844 - timing_per_token_ms/gen:3.279 - timing_per_token_ms/update_actor:0.677 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25210.000 - perf/time_per_step:37.844 - perf/throughput:83.270 - reflection/any_word_frequency:5.000 - reflection/with_length_mean:241.800 - reflection/without_length_mean:151.741 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.236 - reflection/without_reward_mean:-0.148 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:165.812 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.162 - reflection_check/word_check_frequency:5.000 - reflection_check/with_check_length_mean:241.800 - reflection_check/without_check_length_mean:151.741 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.236 - reflection_check/without_check_reward_mean:-0.148 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:165.812 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.162 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:165.812 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.162 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:165.812 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.162 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:165.812 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.162 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:165.812 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.162 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:165.812 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.162 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:165.812 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.162 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:165.812 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.162 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:165.812 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.162 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:165.812 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.162 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) Training Progress: 20%|██ | 41/200 [10:49<2:25:41, 54.98s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 21%|██ | 42/200 [11:37<2:19:45, 53.08s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 16-bit adder with carry, sign, zero, parity, overflow flags. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) a = inputs['a'] & 0xFFFF (TaskRunner pid=54845) b = inputs['b'] & 0xFFFF (TaskRunner pid=54845) s = a + b (TaskRunner pid=54845) full = s >> 16 (TaskRunner pid=54845) sm = s & 0xFFFF (TaskRunner pid=54845) return {'sum': sm, 'sign': (sm >> 15) & 1, 'carry': full & 1, (TaskRunner pid=54845) 'zero': 1 if sm == 0 else 0, (TaskRunner pid=54845) 'parity': bin(sm).count('1') % 2, (TaskRunner pid=54845) 'overflow': ((a ^ sm) & (b ^ sm) & 0x8000) != 0} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:42 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2501.000 - global_seqlen/max:5167.000 - global_seqlen/minmax_diff:2666.000 - global_seqlen/balanced_min:3331.000 - global_seqlen/balanced_max:3406.000 - global_seqlen/mean:3368.250 - actor/entropy:0.000 - actor/pg_loss:-0.336 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:6.983 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.264 - actor/lr:0.000 - critic/score/mean:-0.194 - critic/score/max:-0.102 - critic/score/min:-0.506 - critic/rewards/mean:-0.194 - critic/rewards/max:-0.102 - critic/rewards/min:-0.506 - critic/advantages/mean:-0.082 - critic/advantages/max:1.481 - critic/advantages/min:-1.460 - critic/returns/mean:-0.082 - critic/returns/max:1.481 - critic/returns/min:-1.460 - response_length/mean:198.938 - response_length/max:518.000 - response_length/min:104.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.125 - prompt_length/max:811.000 - prompt_length/min:496.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:28.365 - timing_s/reward:0.978 - timing_s/old_log_prob:2.395 - timing_s/adv:0.001 - timing_s/update_actor:16.969 - timing_s/step:48.712 - timing_per_token_ms/gen:4.456 - timing_per_token_ms/update_actor:0.630 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26946.000 - perf/time_per_step:48.712 - perf/throughput:69.146 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:198.938 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.194 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:198.938 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.194 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:198.938 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.194 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:198.938 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.194 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:198.938 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.194 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:198.938 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.194 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:198.938 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.194 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:198.938 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.194 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:198.938 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.194 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:198.938 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.194 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:198.938 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.194 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:198.938 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.194 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 32-bit rotate left by shift_amount: out = ((num << shift) | (num >> (32-shift))) & 0xFFFFFFFF (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) shift = inputs['ShiftSize'] & 0x1F (TaskRunner pid=54845) num = inputs['Number'] & 0xFFFFFFFF (TaskRunner pid=54845) return {'Out': ((num << shift) | (num >> (32 - shift))) & 0xFFFFFFFF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 22%|██▏ | 43/200 [12:27<2:16:07, 52.02s/it] (TaskRunner pid=54845) step:43 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2640.000 - global_seqlen/max:5959.000 - global_seqlen/minmax_diff:3319.000 - global_seqlen/balanced_min:3349.000 - global_seqlen/balanced_max:3702.000 - global_seqlen/mean:3535.250 - actor/entropy:0.000 - actor/pg_loss:0.228 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:7.678 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.277 - actor/lr:0.000 - critic/score/mean:-0.220 - critic/score/max:-0.111 - critic/score/min:-0.492 - critic/rewards/mean:-0.220 - critic/rewards/max:-0.111 - critic/rewards/min:-0.492 - critic/advantages/mean:-0.150 - critic/advantages/max:1.461 - critic/advantages/min:-1.482 - critic/returns/mean:-0.150 - critic/returns/max:1.461 - critic/returns/min:-1.482 - response_length/mean:225.688 - response_length/max:504.000 - response_length/min:114.000 - response_length/clip_ratio:0.000 - prompt_length/mean:658.125 - prompt_length/max:1086.000 - prompt_length/min:531.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:28.293 - timing_s/reward:0.999 - timing_s/old_log_prob:2.401 - timing_s/adv:0.001 - timing_s/update_actor:17.875 - timing_s/step:49.573 - timing_per_token_ms/gen:3.918 - timing_per_token_ms/update_actor:0.632 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:28282.000 - perf/time_per_step:49.573 - perf/throughput:71.314 - reflection/any_word_frequency:1.000 - reflection/with_length_mean:165.000 - reflection/without_length_mean:227.645 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.161 - reflection/without_reward_mean:-0.222 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:225.688 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.220 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:225.688 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.220 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:225.688 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.220 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:225.688 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.220 - reflection_reflect/word_reflect_frequency:1.000 - reflection_reflect/with_reflect_length_mean:165.000 - reflection_reflect/without_reflect_length_mean:227.645 - reflection_reflect/with_reflect_correct_ratio:0.000 - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:-0.161 - reflection_reflect/without_reflect_reward_mean:-0.222 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:225.688 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.220 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:225.688 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.220 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:225.688 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.220 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:225.688 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.220 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:225.688 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.220 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:225.688 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.220 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:225.688 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.220 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8-entry FIFO with addr-based full/empty detection using a direction flip-flop. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.buf = [0] * 8 (TaskRunner pid=54845) self.wptr = 0 (TaskRunner pid=54845) self.rptr = 0 (TaskRunner pid=54845) self.full = 0 (TaskRunner pid=54845) self.empty = 1 (TaskRunner pid=54845) self.dir = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) wr = inputs['wr'] & 1 (TaskRunner pid=54845) rd = inputs['rd'] & 1 (TaskRunner pid=54845) if wr: (TaskRunner pid=54845) self.buf[self.wptr] = inputs['din'] & 0xFF (TaskRunner pid=54845) self.wptr = (self.wptr + 1) & 7 (TaskRunner pid=54845) if rd: (TaskRunner pid=54845) self.rptr = (self.rptr + 1) & 7 (TaskRunner pid=54845) self.full = 1 if self.wptr == self.rptr and self.dir == 1 else 0 (TaskRunner pid=54845) self.empty = 1 if self.wptr == self.rptr and self.dir == 0 else 0 (TaskRunner pid=54845) return {'wa': self.wptr, 'ra': self.rptr, 'full': self.full, 'empty': self.empty} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:44 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2967.000 - global_seqlen/max:5054.000 - global_seqlen/minmax_diff:2087.000 - global_seqlen/balanced_min:3656.000 - global_seqlen/balanced_max:3894.000 - global_seqlen/mean:3753.750 - actor/entropy:0.000 - actor/pg_loss:-0.106 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:7.256 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.293 - actor/lr:0.000 - critic/score/mean:-0.242 - critic/score/max:-0.113 - critic/score/min:-0.531 - critic/rewards/mean:-0.242 - critic/rewards/max:-0.113 - critic/rewards/min:-0.531 - critic/advantages/mean:-0.092 - critic/advantages/max:1.485 - critic/advantages/min:-1.497 - critic/returns/mean:-0.092 - critic/returns/max:1.485 - critic/returns/min:-1.497 - response_length/mean:248.312 - response_length/max:544.000 - response_length/min:116.000 - response_length/clip_ratio:0.000 - prompt_length/mean:690.125 - prompt_length/max:794.000 - prompt_length/min:582.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:30.296 - timing_s/reward:0.999 - timing_s/old_log_prob:2.436 - timing_s/adv:0.001 - timing_s/update_actor:17.611 - timing_s/step:51.347 - timing_per_token_ms/gen:3.813 - timing_per_token_ms/update_actor:0.586 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:30030.000 - perf/time_per_step:51.347 - perf/throughput:73.105 - reflection/any_word_frequency:2.000 - reflection/with_length_mean:259.500 - reflection/without_length_mean:247.567 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.253 - reflection/without_reward_mean:-0.242 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:248.312 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.242 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:248.312 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.242 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:248.312 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.242 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:248.312 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.242 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:248.312 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.242 - reflection_wait/word_wait_frequency:1.000 - reflection_wait/with_wait_length_mean:272.000 - reflection_wait/without_wait_length_mean:247.548 - reflection_wait/with_wait_correct_ratio:0.000 - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:-0.266 - reflection_wait/without_wait_reward_mean:-0.242 - reflection_correct/word_correct_frequency:1.000 - reflection_correct/with_correct_length_mean:247.000 - reflection_correct/without_correct_length_mean:248.355 - reflection_correct/with_correct_correct_ratio:0.000 - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:-0.241 - reflection_correct/without_correct_reward_mean:-0.243 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:248.312 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.242 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:248.312 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.242 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:248.312 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.242 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:248.312 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.242 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:248.312 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.242 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 22%|██▏ | 44/200 [13:18<2:14:44, 51.82s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 22%|██▎ | 45/200 [14:01<2:07:05, 49.20s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 12-bit binary to 4-digit BCD. Divide by 1000,100,10 repeatedly. Cap at 0xFFFF for overflow. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): pass (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) v = inputs['Binary'] & 0xFFF (TaskRunner pid=54845) if v > 9999: return {'BCD': 0xFFFF} (TaskRunner pid=54845) bcd = 0 (TaskRunner pid=54845) for _ in range(4): (TaskRunner pid=54845) bcd = (bcd * 10) + (v % 10) (TaskRunner pid=54845) v //= 10 (TaskRunner pid=54845) return {'BCD': bcd & 0xFFFF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:45 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2662.000 - global_seqlen/max:3837.000 - global_seqlen/minmax_diff:1175.000 - global_seqlen/balanced_min:2950.000 - global_seqlen/balanced_max:3169.000 - global_seqlen/mean:3027.125 - actor/entropy:0.000 - actor/pg_loss:-0.301 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:7.589 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.278 - actor/lr:0.000 - critic/score/mean:-0.154 - critic/score/max:-0.100 - critic/score/min:-0.395 - critic/rewards/mean:-0.154 - critic/rewards/max:-0.100 - critic/rewards/min:-0.395 - critic/advantages/mean:-0.098 - critic/advantages/max:1.483 - critic/advantages/min:-1.500 - critic/returns/mean:-0.098 - critic/returns/max:1.483 - critic/returns/min:-1.500 - response_length/mean:157.906 - response_length/max:404.000 - response_length/min:102.000 - response_length/clip_ratio:0.000 - prompt_length/mean:598.875 - prompt_length/max:675.000 - prompt_length/min:548.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:22.695 - timing_s/reward:0.973 - timing_s/old_log_prob:2.330 - timing_s/adv:0.001 - timing_s/update_actor:17.093 - timing_s/step:43.096 - timing_per_token_ms/gen:4.491 - timing_per_token_ms/update_actor:0.706 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24217.000 - perf/time_per_step:43.096 - perf/throughput:70.241 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:157.906 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.154 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:157.906 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.154 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:157.906 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.154 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:157.906 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.154 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:157.906 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.154 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:157.906 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.154 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:157.906 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.154 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:157.906 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.154 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:157.906 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.154 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:157.906 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.154 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:157.906 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.154 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:157.906 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.154 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Control unit decode 7-bit opcodes. Generate standard RISC-V control signals. When control_sel=1, force all outputs to 0. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) cs = inputs['control_sel'] & 1 (TaskRunner pid=54845) if cs: (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':0,'MemWrite':0,'RegWrite':0,'Branch':0,'ALUSrc':0,'ALUop':0} (TaskRunner pid=54845) op = inputs['opcode'] & 0x7F (TaskRunner pid=54845) if op == 0b0000011: # load (TaskRunner pid=54845) return {'MemRead':1,'MemtoReg':1,'MemWrite':0,'RegWrite':1,'Branch':0,'ALUSrc':1,'ALUop':0}(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 23%|██▎ | 46/200 [14:48<2:04:11, 48.39s/it] (TaskRunner pid=54845) elif op == 0b0100011: # store (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':X,'MemWrite':1,'RegWrite':0,'Branch':0,'ALUSrc':1,'ALUop':0} (TaskRunner pid=54845) elif op == 0b1100111: # R-format (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':0,'MemWrite':0,'RegWrite':1,'Branch':0,'ALUSrc':0,'ALUop':2} (TaskRunner pid=54845) elif op == 0b1101111: # I-format (arithmetic) (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':0,'MemWrite':0,'RegWrite':1,'Branch':0,'ALUSrc':1,'ALUop':0} (TaskRunner pid=54845) elif op == 0b1100011: # branch (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':X,'MemWrite':0,'RegWrite':0,'Branch':1,'ALUSrc':0,'ALUop':1} (TaskRunner pid=54845) return {'MemRead':0,'MemtoReg':0,'MemWrite':0,'RegWrite':0,'Branch':0,'ALUSrc':0,'ALUop':0} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:46 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2466.000 - global_seqlen/max:4456.000 - global_seqlen/minmax_diff:1990.000 - global_seqlen/balanced_min:3053.000 - global_seqlen/balanced_max:3217.000 - global_seqlen/mean:3108.750 - actor/entropy:0.000 - actor/pg_loss:-0.452 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:8.637 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.283 - actor/lr:0.000 - critic/score/mean:-0.184 - critic/score/max:-0.068 - critic/score/min:-0.471 - critic/rewards/mean:-0.184 - critic/rewards/max:-0.068 - critic/rewards/min:-0.471 - critic/advantages/mean:-0.140 - critic/advantages/max:1.420 - critic/advantages/min:-1.488 - critic/returns/mean:-0.140 - critic/returns/max:1.420 - critic/returns/min:-1.488 - response_length/mean:188.188 - response_length/max:482.000 - response_length/min:70.000 - response_length/clip_ratio:0.000 - prompt_length/mean:589.000 - prompt_length/max:730.000 - prompt_length/min:504.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:25.332 - timing_s/reward:0.991 - timing_s/old_log_prob:2.338 - timing_s/adv:0.001 - timing_s/update_actor:17.819 - timing_s/step:46.485 - timing_per_token_ms/gen:4.207 - timing_per_token_ms/update_actor:0.716 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24870.000 - perf/time_per_step:46.485 - perf/throughput:66.877 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:188.188 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.184 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:188.188 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.184 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:188.188 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.184 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:188.188 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.184 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:188.188 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.184 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:188.188 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.184 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:188.188 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.184 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:188.188 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.184 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:188.188 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.184 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:188.188 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.184 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:188.188 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.184 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:188.188 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.184 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 2-bit multiplier: p = x2 * y2. Compute partial products and add with carry. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) x = inputs['x2'] & 0x3; y = inputs['y2'] & 0x3 (TaskRunner pid=54845) return {'p': (x * y) & 0xF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:47 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2596.000 - global_seqlen/max:5169.000 - global_seqlen/minmax_diff:2573.000 - global_seqlen/balanced_min:3967.000 - global_seqlen/balanced_max:4001.000 - global_seqlen/mean:3985.125 - actor/entropy:0.000 - actor/pg_loss:-0.193 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:11.504 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.295 - actor/lr:0.000 - critic/score/mean:-0.235 - critic/score/max:-0.093 - critic/score/min:-0.458 - critic/rewards/mean:-0.235 - critic/rewards/max:-0.093 - critic/rewards/min:-0.458 - critic/advantages/mean:-0.111 - critic/advantages/max:1.498 - critic/advantages/min:-1.496 - critic/returns/mean:-0.111 - critic/returns/max:1.498 - critic/returns/min:-1.496 - response_length/mean:240.281 - response_length/max:469.000 - response_length/min:95.000 - response_length/clip_ratio:0.000 - prompt_length/mean:756.000 - prompt_length/max:1014.000 - prompt_length/min:551.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:26.620 - timing_s/reward:0.981 - timing_s/old_log_prob:2.376 - timing_s/adv:0.001 - timing_s/update_actor:17.562 - timing_s/step:47.544 - timing_per_token_ms/gen:3.462 - timing_per_token_ms/update_actor:0.551 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:31881.000 - perf/time_per_step:47.544 - perf/throughput:83.820 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:240.281 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.235 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:240.281 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.235 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:240.281 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.235 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:240.281 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.235 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:240.281 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.235 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:240.281 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.235 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:240.281 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.235 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:240.281 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.235 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:240.281 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.235 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:240.281 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.235 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:240.281 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.235 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:240.281 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.235 - language_mix/frequency:1.000 - language_mix/ratio:0.031 - language_mix/with_length_mean:173.000 - language_mix/without_length_mean:242.452 - language_mix/with_correct_ratio:0.000 - language_mix/without_correct_ratio:0.000 - language_mix/with_reward_mean:-0.169 - language_mix/without_reward_mean:-0.237 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 24%|██▎ | 47/200 [15:35<2:02:45, 48.14s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 24%|██▍ | 48/200 [16:10<1:51:50, 44.15s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 4-bit hex to 7-seg lookup table, active-low segments. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.lut = [0x3F, 0x06, 0x5B, 0x4F, 0x66, 0x6D, 0x7D, 0x07, (TaskRunner pid=54845) 0x7F, 0x6F, 0x77, 0x7C, 0x39, 0x5E, 0x79, 0x71] (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'c': self.lut[inputs['d'] & 0xF]} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:48 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2536.000 - global_seqlen/max:4217.000 - global_seqlen/minmax_diff:1681.000 - global_seqlen/balanced_min:3095.000 - global_seqlen/balanced_max:3139.000 - global_seqlen/mean:3115.875 - actor/entropy:0.000 - actor/pg_loss:0.228 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:7.989 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.296 - actor/lr:0.000 - critic/score/mean:-0.156 - critic/score/max:-0.084 - critic/score/min:-0.229 - critic/rewards/mean:-0.156 - critic/rewards/max:-0.084 - critic/rewards/min:-0.229 - critic/advantages/mean:-0.046 - critic/advantages/max:1.499 - critic/advantages/min:-1.494 - critic/returns/mean:-0.046 - critic/returns/max:1.499 - critic/returns/min:-1.494 - response_length/mean:160.094 - response_length/max:235.000 - response_length/min:86.000 - response_length/clip_ratio:0.000 - prompt_length/mean:618.875 - prompt_length/max:827.000 - prompt_length/min:513.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:14.292 - timing_s/reward:1.041 - timing_s/old_log_prob:2.341 - timing_s/adv:0.001 - timing_s/update_actor:17.141 - timing_s/step:34.819 - timing_per_token_ms/gen:2.790 - timing_per_token_ms/update_actor:0.688 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24927.000 - perf/time_per_step:34.819 - perf/throughput:89.487 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:160.094 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.156 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:160.094 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.156 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:160.094 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.156 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:160.094 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.156 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:160.094 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.156 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:160.094 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.156 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:160.094 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.156 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:160.094 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.156 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:160.094 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.156 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:160.094 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.156 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:160.094 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.156 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:160.094 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.156 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 2-bit shift ops on 16-bit value: rol, sll, ror, asr. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) v = inputs['in'] & 0xFFFF; op = inputs['op'] & 3 (TaskRunner pid=54845) if op == 0: return (v << 2 | v >> 14) & 0xFFFF # rol (TaskRunner pid=54845) elif op == 1: return (v << 2) & 0xFFFF # sll (TaskRunner pid=54845) elif op == 2: return (v >> 2 | v << 14) & 0xFFFF # ror (TaskRunner pid=54845) else: return (v >> 2) & 0xFFFF if v < 0x8000 else (0xFFFF0000 | v) & 0xFFFF # asr ``` (TaskRunner pid=54845) step:49 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2694.000 - global_seqlen/max:5282.000 - global_seqlen/minmax_diff:2588.000 - global_seqlen/balanced_min:3395.000 - global_seqlen/balanced_max:3559.000 - global_seqlen/mean:3453.500 - actor/entropy:0.000 - actor/pg_loss:0.623 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:11.100 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.307 - actor/lr:0.000 - critic/score/mean:-0.174 - critic/score/max:-0.073 - critic/score/min:-0.386 - critic/rewards/mean:-0.174 - critic/rewards/max:-0.073 - critic/rewards/min:-0.386 - critic/advantages/mean:-0.133 - critic/advantages/max:1.411 - critic/advantages/min:-1.485 - critic/returns/mean:-0.133 - critic/returns/max:1.411 - critic/returns/min:-1.485 - response_length/mean:178.250 - response_length/max:395.000 - response_length/min:75.000 - response_length/clip_ratio:0.000 - prompt_length/mean:685.125 - prompt_length/max:1005.000 - prompt_length/min:565.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:22.576 - timing_s/reward:1.010 - timing_s/old_log_prob:2.341 - timing_s/adv:0.001 - timing_s/update_actor:17.619 - timing_s/step:43.550 - timing_per_token_ms/gen:3.958 - timing_per_token_ms/update_actor:0.638 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27628.000 - perf/time_per_step:43.550 - perf/throughput:79.299 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:178.250 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.174 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:178.250 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.174 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:178.250 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.174 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:178.250 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.174 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:178.250 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.174 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:178.250 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.174 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:178.250 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.174 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:178.250 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.174 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:178.250 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.174 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:178.250 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.174 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:178.250 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.174 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:178.250 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.174 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 24%|██▍ | 49/200 [16:54<1:50:40, 43.97s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 25%|██▌ | 50/200 [17:41<1:52:14, 44.90s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 32-bit PC register, increment by 4 normally. Hardwire lower 2 bits to 0 on output. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.pc = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if inputs['rst']: (TaskRunner pid=54845) self.pc = 0 (TaskRunner pid=54845) elif inputs['pc_sel']: (TaskRunner pid=54845) self.pc = inputs['alu_out'] & 0xFFFFFFFF (TaskRunner pid=54845) else: (TaskRunner pid=54845) self.pc = (self.pc + 4) & 0xFFFFFFFF (TaskRunner pid=54845) return {'pc': self.pc & 0xFFFFFFF0, 'pc_4': (self.pc + 4) & 0xFFFFFFFF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:50 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2363.000 - global_seqlen/max:5926.000 - global_seqlen/minmax_diff:3563.000 - global_seqlen/balanced_min:3718.000 - global_seqlen/balanced_max:3817.000 - global_seqlen/mean:3748.000 - actor/entropy:0.000 - actor/pg_loss:-0.583 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:10.742 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.303 - actor/lr:0.000 - critic/score/mean:-0.173 - critic/score/max:-0.069 - critic/score/min:-0.451 - critic/rewards/mean:-0.173 - critic/rewards/max:-0.069 - critic/rewards/min:-0.451 - critic/advantages/mean:-0.099 - critic/advantages/max:1.500 - critic/advantages/min:-1.499 - critic/returns/mean:-0.099 - critic/returns/max:1.500 - critic/returns/min:-1.499 - response_length/mean:177.125 - response_length/max:462.000 - response_length/min:71.000 - response_length/clip_ratio:0.000 - prompt_length/mean:759.875 - prompt_length/max:1366.000 - prompt_length/min:511.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:26.265 - timing_s/reward:1.009 - timing_s/old_log_prob:2.392 - timing_s/adv:0.001 - timing_s/update_actor:17.354 - timing_s/step:47.025 - timing_per_token_ms/gen:4.634 - timing_per_token_ms/update_actor:0.579 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:29984.000 - perf/time_per_step:47.025 - perf/throughput:79.702 - reflection/any_word_frequency:1.000 - reflection/with_length_mean:197.000 - reflection/without_length_mean:176.484 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.192 - reflection/without_reward_mean:-0.172 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:177.125 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.173 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:197.000 - reflection_check/without_check_length_mean:176.484 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.192 - reflection_check/without_check_reward_mean:-0.172 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:177.125 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.173 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:177.125 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.173 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:177.125 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.173 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:177.125 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.173 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:177.125 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.173 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:177.125 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.173 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:177.125 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.173 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:177.125 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.173 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:177.125 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.173 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:177.125 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.173 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Synchronous active-low reset: latch nrst on clock edge, delay deassert by one cycle. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.q = 1 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'reset_n': inputs['nrst'] & self.q} (TaskRunner pid=54845) step:51 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2281.000 - global_seqlen/max:5029.000 - global_seqlen/minmax_diff:2748.000 - global_seqlen/balanced_min:3321.000 - global_seqlen/balanced_max:3422.000 - global_seqlen/mean:3380.125 - actor/entropy:0.000 - actor/pg_loss:0.048 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:13.761 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.305 - actor/lr:0.000 - critic/score/mean:-0.130 - critic/score/max:-0.066 - critic/score/min:-0.252 - critic/rewards/mean:-0.130 - critic/rewards/max:-0.066 - critic/rewards/min:-0.252 - critic/advantages/mean:-0.099 - critic/advantages/max:1.420 - critic/advantages/min:-1.499 - critic/returns/mean:-0.099 - critic/returns/max:1.420 - critic/returns/min:-1.499 - response_length/mean:133.031 - response_length/max:258.000 - response_length/min:68.000 - response_length/clip_ratio:0.000 - prompt_length/mean:712.000 - prompt_length/max:1162.000 - prompt_length/min:494.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:16.265 - timing_s/reward:0.985 - timing_s/old_log_prob:2.355 - timing_s/adv:0.001 - timing_s/update_actor:17.251 - timing_s/step:36.861 - timing_per_token_ms/gen:3.821 - timing_per_token_ms/update_actor:0.638 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27041.000 - perf/time_per_step:36.861 - perf/throughput:91.699 - reflection/any_word_frequency:5.000 - reflection/with_length_mean:133.400 - reflection/without_length_mean:132.963 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.130 - reflection/without_reward_mean:-0.130 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:133.031 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.130 - reflection_check/word_check_frequency:4.000 - reflection_check/with_check_length_mean:142.000 - reflection_check/without_check_length_mean:131.750 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.139 - reflection_check/without_check_reward_mean:-0.129 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:133.031 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.130 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:133.031 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.130 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:133.031 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.130 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:133.031 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.130 - reflection_correct/word_correct_frequency:1.000 - reflection_correct/with_correct_length_mean:99.000 - reflection_correct/without_correct_length_mean:134.129 - reflection_correct/with_correct_correct_ratio:0.000 - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:-0.097 - reflection_correct/without_correct_reward_mean:-0.131 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:133.031 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.130 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:133.031 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.130 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:133.031 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.130 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:133.031 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.130 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:133.031 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.130 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 26%|██▌ | 51/200 [18:18<1:45:30, 42.49s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 26%|██▌ | 52/200 [19:04<1:47:15, 43.48s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) State machine: s1→s2→s3→s4→s1. sel=1 in s1, en=1 in s4. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.state = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if inputs['rst']: (TaskRunner pid=54845) self.state = 0 (TaskRunner pid=54845) self.state = (self.state + 1) % 4 (TaskRunner pid=54845) return {'sel': int(self.state == 0), 'en': int(self.state == 3)} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:52 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2200.000 - global_seqlen/max:5659.000 - global_seqlen/minmax_diff:3459.000 - global_seqlen/balanced_min:3347.000 - global_seqlen/balanced_max:3438.000 - global_seqlen/mean:3395.500 - actor/entropy:0.000 - actor/pg_loss:-0.248 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:10.811 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.290 - actor/lr:0.000 - critic/score/mean:-0.136 - critic/score/max:-0.062 - critic/score/min:-0.438 - critic/rewards/mean:-0.136 - critic/rewards/max:-0.062 - critic/rewards/min:-0.438 - critic/advantages/mean:-0.122 - critic/advantages/max:1.497 - critic/advantages/min:-1.497 - critic/returns/mean:-0.122 - critic/returns/max:1.497 - critic/returns/min:-1.497 - response_length/mean:138.875 - response_length/max:449.000 - response_length/min:64.000 - response_length/clip_ratio:0.000 - prompt_length/mean:710.000 - prompt_length/max:1085.000 - prompt_length/min:477.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:24.554 - timing_s/reward:0.977 - timing_s/old_log_prob:2.376 - timing_s/adv:0.001 - timing_s/update_actor:17.881 - timing_s/step:45.791 - timing_per_token_ms/gen:5.525 - timing_per_token_ms/update_actor:0.658 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27164.000 - perf/time_per_step:45.791 - perf/throughput:74.152 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:138.875 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.136 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:138.875 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.136 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:138.875 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.136 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:138.875 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.136 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:138.875 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.136 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:138.875 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.136 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:138.875 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.136 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:138.875 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.136 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:138.875 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.136 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:138.875 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.136 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:138.875 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.136 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:138.875 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.136 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8-bit binary to BCD via shift-and-add-3. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) d = inputs['binary'] & 0xFF (TaskRunner pid=54845) for _ in range(8): (TaskRunner pid=54845) if d >= 10: d -= 10 (TaskRunner pid=54845) d = (d << 1) & 0xFF (TaskRunner pid=54845) return {'tens': d >> 4, 'ones': d & 0xF} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:53 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2674.000 - global_seqlen/max:4639.000 - global_seqlen/minmax_diff:1965.000 - global_seqlen/balanced_min:3305.000 - global_seqlen/balanced_max:3454.000 - global_seqlen/mean:3342.000 - actor/entropy:0.000 - actor/pg_loss:0.116 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:14.455 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.269 - actor/lr:0.000 - critic/score/mean:-0.143 - critic/score/max:-0.061 - critic/score/min:-0.371 - critic/rewards/mean:-0.143 - critic/rewards/max:-0.061 - critic/rewards/min:-0.371 - critic/advantages/mean:-0.163 - critic/advantages/max:1.384 - critic/advantages/min:-1.471 - critic/returns/mean:-0.163 - critic/returns/max:1.384 - critic/returns/min:-1.471 - response_length/mean:146.125 - response_length/max:380.000 - response_length/min:62.000 - response_length/clip_ratio:0.000 - prompt_length/mean:689.375 - prompt_length/max:912.000 - prompt_length/min:561.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:22.119 - timing_s/reward:0.993 - timing_s/old_log_prob:2.326 - timing_s/adv:0.001 - timing_s/update_actor:18.059 - timing_s/step:43.502 - timing_per_token_ms/gen:4.730 - timing_per_token_ms/update_actor:0.675 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26736.000 - perf/time_per_step:43.502 - perf/throughput:76.825 - reflection/any_word_frequency:1.000 - reflection/with_length_mean:119.000 - reflection/without_length_mean:147.000 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.116 - reflection/without_reward_mean:-0.144 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:146.125 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.143 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:119.000 - reflection_check/without_check_length_mean:147.000 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.116 - reflection_check/without_check_reward_mean:-0.144 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:146.125 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.143 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:146.125 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.143 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:146.125 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.143 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:146.125 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.143 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:146.125 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.143 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:146.125 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.143 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:146.125 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.143 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:146.125 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.143 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:146.125 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.143 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:146.125 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.143 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 26%|██▋ | 53/200 [19:47<1:46:33, 43.49s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 27%|██▋ | 54/200 [20:31<1:46:11, 43.64s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Counter to 100M, pulse at max. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.c = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if self.c == 100000000: (TaskRunner pid=54845) self.c = 0 (TaskRunner pid=54845) return {'out': int(self.c == 100000000)} ``` (TaskRunner pid=54845) step:54 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2492.000 - global_seqlen/max:4148.000 - global_seqlen/minmax_diff:1656.000 - global_seqlen/balanced_min:3126.000 - global_seqlen/balanced_max:3352.000 - global_seqlen/mean:3175.125 - actor/entropy:0.000 - actor/pg_loss:-0.114 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:14.444 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.088 - perf/cpu_memory_used_gb:231.287 - actor/lr:0.000 - critic/score/mean:-0.146 - critic/score/max:-0.083 - critic/score/min:-0.390 - critic/rewards/mean:-0.146 - critic/rewards/max:-0.083 - critic/rewards/min:-0.390 - critic/advantages/mean:-0.164 - critic/advantages/max:1.430 - critic/advantages/min:-1.483 - critic/returns/mean:-0.164 - critic/returns/max:1.430 - critic/returns/min:-1.483 - response_length/mean:149.781 - response_length/max:399.000 - response_length/min:85.000 - response_length/clip_ratio:0.000 - prompt_length/mean:644.000 - prompt_length/max:788.000 - prompt_length/min:525.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:23.213 - timing_s/reward:0.986 - timing_s/old_log_prob:2.371 - timing_s/adv:0.002 - timing_s/update_actor:17.389 - timing_s/step:43.965 - timing_per_token_ms/gen:4.843 - timing_per_token_ms/update_actor:0.685 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25401.000 - perf/time_per_step:43.965 - perf/throughput:72.219 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:149.781 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.146 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:149.781 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.146 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:149.781 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.146 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:149.781 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.146 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:149.781 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.146 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:149.781 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.146 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:149.781 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.146 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:149.781 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.146 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:149.781 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.146 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:149.781 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.146 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:149.781 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.146 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:149.781 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.146 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) FIFO write ptr + full detection. Binary Gray conversion: flip bits from MSB. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.wptr = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if not inputs['wrst_n']: self.wptr = 0 (TaskRunner pid=54845) elif inputs['wrt_en']: (TaskRunner pid=54845) self.wptr = (self.wptr + 1) & 0x3FF (TaskRunner pid=54845) return {'wfull': int(not (self.wptr & 0x200)), 'waddr': self.wptr & 0x3FF, 'wptr': self.wptr & 0x3FF} (TaskRunner pid=54845) step:55 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2521.000 - global_seqlen/max:3999.000 - global_seqlen/minmax_diff:1478.000 - global_seqlen/balanced_min:3312.000 - global_seqlen/balanced_max:3345.000 - global_seqlen/mean:3327.875 - actor/entropy:0.000 - actor/pg_loss:0.028 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:15.701 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.096 - perf/cpu_memory_used_gb:231.306 - actor/lr:0.000 - critic/score/mean:-0.118 - critic/score/max:-0.060 - critic/score/min:-0.257 - critic/rewards/mean:-0.118 - critic/rewards/max:-0.060 - critic/rewards/min:-0.257 - critic/advantages/mean:-0.105 - critic/advantages/max:1.487 - critic/advantages/min:-1.497 - critic/returns/mean:-0.105 - critic/returns/max:1.487 - critic/returns/min:-1.497 - response_length/mean:121.094 - response_length/max:263.000 - response_length/min:61.000 - response_length/clip_ratio:0.000 - prompt_length/mean:710.875 - prompt_length/max:922.000 - prompt_length/min:557.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:16.705 - timing_s/reward:0.984 - timing_s/old_log_prob:2.437 - timing_s/adv:0.001 - timing_s/update_actor:17.700 - timing_s/step:37.830 - timing_per_token_ms/gen:4.311 - timing_per_token_ms/update_actor:0.665 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26623.000 - perf/time_per_step:37.830 - perf/throughput:87.968 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:121.094 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.118 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:121.094 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.118 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:121.094 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.118 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:121.094 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.118 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:121.094 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.118 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:121.094 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.118 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:121.094 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.118 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:121.094 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.118 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:121.094 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.118 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:121.094 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.118 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:121.094 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.118 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:121.094 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.118 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 28%|██▊ | 55/200 [21:09<1:41:16, 41.90s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 28%|██▊ | 56/200 [21:52<1:41:43, 42.38s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8-bit ALU with multiple operations. (TaskRunner pid=54845) (TaskRunner pid=54845) (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) a = inputs['input_a'] & 0xFF (TaskRunner pid=54845) b = inputs['input_b'] & 0xFF (TaskRunner pid=54845) op = inputs['OP'] & 0x7 (TaskRunner pid=54845) if op == 0b110 or op == 0b111: (TaskRunner pid=54845) return {'out': b, 'zero': int(b == 0)} (TaskRunner pid=54845) if op == 0b100 or op == 0b101: (TaskRunner pid=54845) return {'out': (a + b) & 0xFF, 'zero': int((a + b) & 0xFF == 0)} (TaskRunner pid=54845) if op == 0b011: (TaskRunner pid=54845) return {'out': (~(a | b)) & 0xFF, 'zero': int((~(a | b)) & 0xFF == 0)} (TaskRunner pid=54845) if op == 0b010: (TaskRunner pid=54845) return {'out': 0 if (a >> 7) else 1, 'zero': int((0 if (a >> 7) else 1) == 0)} (TaskRunner pid=54845) sh = b & 0xF (TaskRunner pid=54845) return {'out': (a << sh) & 0xFF if (b >> 3) == 0 else (a >> ((~sh + 1) & 0xF)) & 0xFF, (TaskRunner pid=54845) 'zero': int(((a << sh) & 0xFF if (b >> 3) == 0 else (a >> ((~sh + 1) & 0xF)) & 0xFF) == 0)} ``` (TaskRunner pid=54845) step:56 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:3157.000 - global_seqlen/max:4726.000 - global_seqlen/minmax_diff:1569.000 - global_seqlen/balanced_min:3995.000 - global_seqlen/balanced_max:4041.000 - global_seqlen/mean:4014.250 - actor/entropy:0.000 - actor/pg_loss:-0.372 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:16.491 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.320 - actor/lr:0.000 - critic/score/mean:-0.179 - critic/score/max:-0.053 - critic/score/min:-0.378 - critic/rewards/mean:-0.179 - critic/rewards/max:-0.053 - critic/rewards/min:-0.378 - critic/advantages/mean:-0.204 - critic/advantages/max:1.492 - critic/advantages/min:-1.500 - critic/returns/mean:-0.204 - critic/returns/max:1.492 - critic/returns/min:-1.500 - response_length/mean:183.312 - response_length/max:387.000 - response_length/min:54.000 - response_length/clip_ratio:0.000 - prompt_length/mean:820.250 - prompt_length/max:973.000 - prompt_length/min:695.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:22.424 - timing_s/reward:0.997 - timing_s/old_log_prob:2.375 - timing_s/adv:0.001 - timing_s/update_actor:17.664 - timing_s/step:43.466 - timing_per_token_ms/gen:3.823 - timing_per_token_ms/update_actor:0.550 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:32114.000 - perf/time_per_step:43.466 - perf/throughput:92.354 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:183.312 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.179 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:183.312 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.179 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:183.312 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.179 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:183.312 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.179 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:183.312 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.179 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:183.312 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.179 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:183.312 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.179 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:183.312 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.179 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:183.312 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.179 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:183.312 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.179 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:183.312 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.179 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:183.312 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.179 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8-bit CLA with generate/propagate inputs. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 28%|██▊ | 57/200 [22:27<1:35:33, 40.10s/it] (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'S': (inputs['a']+inputs['b']+inputs['c0'])&0xFF, 'G': 1, 'P': 1} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:57 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2167.000 - global_seqlen/max:3570.000 - global_seqlen/minmax_diff:1403.000 - global_seqlen/balanced_min:2879.000 - global_seqlen/balanced_max:2906.000 - global_seqlen/mean:2891.375 - actor/entropy:0.000 - actor/pg_loss:0.153 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:19.466 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.316 - actor/lr:0.000 - critic/score/mean:-0.108 - critic/score/max:-0.056 - critic/score/min:-0.205 - critic/rewards/mean:-0.108 - critic/rewards/max:-0.056 - critic/rewards/min:-0.205 - critic/advantages/mean:-0.106 - critic/advantages/max:1.499 - critic/advantages/min:-1.477 - critic/returns/mean:-0.106 - critic/returns/max:1.499 - critic/returns/min:-1.477 - response_length/mean:110.344 - response_length/max:210.000 - response_length/min:57.000 - response_length/clip_ratio:0.000 - prompt_length/mean:612.500 - prompt_length/max:765.000 - prompt_length/min:479.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:13.800 - timing_s/reward:0.983 - timing_s/old_log_prob:2.377 - timing_s/adv:0.001 - timing_s/update_actor:17.577 - timing_s/step:34.741 - timing_per_token_ms/gen:3.908 - timing_per_token_ms/update_actor:0.760 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23131.000 - perf/time_per_step:34.741 - perf/throughput:83.227 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:110.344 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.108 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:110.344 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.108 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:110.344 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.108 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:110.344 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.108 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:110.344 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.108 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:110.344 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.108 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:110.344 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.108 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:110.344 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.108 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:110.344 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.108 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:110.344 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.108 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:110.344 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.108 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:110.344 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.108 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8x8 multiplication via Karatsuba. (TaskRunner pid=54845) Split A,B into high/low 4-bit halves. (TaskRunner pid=54845) C = A*B = a1*b1*16 + ((a1+a0)*(b1+b0) - a1*b1 - a0*b0)*8 + a0*b0 (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): return (inputs['A']*inputs['B'])&0xFFFF (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:58 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2597.000 - global_seqlen/max:3853.000 - global_seqlen/minmax_diff:1256.000 - global_seqlen/balanced_min:3070.000 - global_seqlen/balanced_max:3475.000 - global_seqlen/mean:3147.125 - actor/entropy:0.000 - actor/pg_loss:0.481 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:18.671 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.318 - actor/lr:0.000 - critic/score/mean:-0.148 - critic/score/max:-0.057 - critic/score/min:-1.000 - critic/rewards/mean:-0.148 - critic/rewards/max:-0.057 - critic/rewards/min:-1.000 - critic/advantages/mean:-0.387 - critic/advantages/max:1.426 - critic/advantages/min:-1.500 - critic/returns/mean:-0.387 - critic/returns/max:1.426 - critic/returns/min:-1.500 - response_length/mean:152.031 - response_length/max:1024.000 - response_length/min:58.000 - response_length/clip_ratio:0.031 - prompt_length/mean:634.750 - prompt_length/max:783.000 - prompt_length/min:474.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:52.858 - timing_s/reward:0.991 - timing_s/old_log_prob:2.323 - timing_s/adv:0.001 - timing_s/update_actor:17.795 - timing_s/step:73.972 - timing_per_token_ms/gen:10.865 - timing_per_token_ms/update_actor:0.707 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25177.000 - perf/time_per_step:73.972 - perf/throughput:42.545 - reflection/any_word_frequency:2.000 - reflection/with_length_mean:1024.000 - reflection/without_length_mean:123.903 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-1.000 - reflection/without_reward_mean:-0.121 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:152.031 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.148 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:1024.000 - reflection_check/without_check_length_mean:123.903 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-1.000 - reflection_check/without_check_reward_mean:-0.121 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:152.031 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.148 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:152.031 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.148 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:152.031 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.148 - reflection_wait/word_wait_frequency:1.000 - reflection_wait/with_wait_length_mean:1024.000 - reflection_wait/without_wait_length_mean:123.903 - reflection_wait/with_wait_correct_ratio:0.000 - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:-1.000 - reflection_wait/without_wait_reward_mean:-0.121 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:152.031 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.148 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:152.031 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.148 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:152.031 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.148 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:152.031 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.148 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:152.031 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.148 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:152.031 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.148 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 29%|██▉ | 58/200 [23:41<1:58:57, 50.26s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 30%|██▉ | 59/200 [24:34<1:59:50, 51.00s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) R-type: opcode=0x33. I-type: load(0x03), lui(0x37), op_imm(0x13/0x33), jal(0x6F), auipc(0x17). S-type: store(0x23). B-type: branch(0x63). U-type: jal(0x6F). System: opcode=0x73. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) ir = inputs['IR'] & 0xFFFFFFFF (TaskRunner pid=54845) return {'funct3':ir[17:14], 'funct7':ir[31:25], 'rs1':ir[19:15], (TaskRunner pid=54845) 'rs2':ir[24:20], 'rd':ir[11:7], 'i_imm':((ir[31]|(ir[30:21]<<1)|ir[20]<<12)<<20), (TaskRunner pid=54845) 's_imm':((ir[31]|(ir[30:25]<<1)|ir[11:7]<<12)<<20), (TaskRunner pid=54845) 'b_imm':((ir[31]|(ir[30]|(ir[19]<<1)|(ir[18]<<2)|(ir[17]<<3)| (TaskRunner pid=54845) (ir[16]<<4)|(ir[15]<<5)|(ir[14]<<6)|(ir[13]<<7))<<20), (TaskRunner pid=54845) 'u_imm':(ir[31]<<20)|(ir[30:12]<<12), (TaskRunner pid=54845) 'j_imm':((ir[31]<<20)|(ir[30]<<19)|(ir[19]<<18)|(ir[18]<<17)| (TaskRunner pid=54845) (ir[17]<<16)|(ir[16]<<15)|(ir[15]<<14)|(ir[14]<<13)| (TaskRunner pid=54845) (ir[13]<<12)), 'load':ir[7]==0x03, 'store':ir[7]==0x23, (TaskRunner pid=54845) 'branch':ir[7]==0x63, 'jalr':ir[7]==0x67, 'jal':ir[7]==0x6F, (TaskRunner pid=54845) 'lui':ir[7]==0x37, 'auipc':ir[7]==0x17, (TaskRunner pid=54845) 'op_imm':ir[7]==0x13 or ir[7]==0x33, 'op':ir[7]==0x33, (TaskRunner pid=54845) 'system':ir[7]==0x73} (TaskRunner pid=54845) step:59 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2495.000 - global_seqlen/max:6402.000 - global_seqlen/minmax_diff:3907.000 - global_seqlen/balanced_min:3594.000 - global_seqlen/balanced_max:3932.000 - global_seqlen/mean:3777.750 - actor/entropy:0.000 - actor/pg_loss:-0.404 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:15.475 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.343 - actor/lr:0.000 - critic/score/mean:-0.187 - critic/score/max:-0.062 - critic/score/min:-0.583 - critic/rewards/mean:-0.187 - critic/rewards/max:-0.062 - critic/rewards/min:-0.583 - critic/advantages/mean:-0.131 - critic/advantages/max:1.497 - critic/advantages/min:-1.499 - critic/returns/mean:-0.131 - critic/returns/max:1.497 - critic/returns/min:-1.499 - response_length/mean:191.562 - response_length/max:597.000 - response_length/min:64.000 - response_length/clip_ratio:0.000 - prompt_length/mean:752.875 - prompt_length/max:1053.000 - prompt_length/min:546.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:31.869 - timing_s/reward:1.016 - timing_s/old_log_prob:2.285 - timing_s/adv:0.001 - timing_s/update_actor:17.507 - timing_s/step:52.682 - timing_per_token_ms/gen:5.199 - timing_per_token_ms/update_actor:0.579 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:30222.000 - perf/time_per_step:52.682 - perf/throughput:71.708 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:191.562 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.187 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:191.562 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.187 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:191.562 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.187 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:191.562 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.187 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:191.562 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.187 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:191.562 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.187 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:191.562 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.187 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:191.562 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.187 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:191.562 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.187 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:191.562 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.187 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:191.562 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.187 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:191.562 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.187 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) DRAM-like store: write on WEn=1, else cache-match rd_elif adr==st_addr output stored else rd(WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 30%|███ | 60/200 [25:30<2:02:49, 52.64s/it] (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.mem = {} (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, i): (TaskRunner pid=54845) if i['WEn']: self.mem[i['adr']&0xFFFFFFFF]=i['wdin']&0xFFFFFFFF (TaskRunner pid=54845) return {'rd_': self.mem.get(i['adr'],i['rd'])&0xFFFFFFFF} (TaskRunner pid=54845) local_global_step_folder: /data/save/python/global_step_60 (WorkerDict pid=56161) [rank-0]: Saving model to /data/save/python/global_step_60/actor/model_world_size_8_rank_0.pt (WorkerDict pid=56161) [rank-0]: Saving checkpoint to /data/save/python/global_step_60/actor/model_world_size_8_rank_0.pt (WorkerDict pid=56161) [rank-0]: Saving extra_state to /data/save/python/global_step_60/actor/extra_state_world_size_8_rank_0.pt (TaskRunner pid=54845) step:60 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2559.000 - global_seqlen/max:4147.000 - global_seqlen/minmax_diff:1588.000 - global_seqlen/balanced_min:3203.000 - global_seqlen/balanced_max:3326.000 - global_seqlen/mean:3231.125 - actor/entropy:0.000 - actor/pg_loss:-0.515 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:22.488 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.296 - actor/lr:0.000 - critic/score/mean:-0.109 - critic/score/max:-0.057 - critic/score/min:-0.321 - critic/rewards/mean:-0.109 - critic/rewards/max:-0.057 - critic/rewards/min:-0.321 - critic/advantages/mean:-0.255 - critic/advantages/max:1.474 - critic/advantages/min:-1.498 - critic/returns/mean:-0.255 - critic/returns/max:1.474 - critic/returns/min:-1.498 - response_length/mean:111.781 - response_length/max:329.000 - response_length/min:58.000 - response_length/clip_ratio:0.000 - prompt_length/mean:696.000 - prompt_length/max:889.000 - prompt_length/min:552.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:19.744 - timing_s/reward:0.987 - timing_s/old_log_prob:2.386 - timing_s/adv:0.001 - timing_s/update_actor:17.920 - timing_s/save_checkpoint:15.421 - timing_s/step:56.463 - timing_per_token_ms/gen:5.520 - timing_per_token_ms/update_actor:0.693 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25849.000 - perf/time_per_step:56.463 - perf/throughput:57.226 - reflection/any_word_frequency:1.000 - reflection/with_length_mean:92.000 - reflection/without_length_mean:112.419 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.090 - reflection/without_reward_mean:-0.110 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:111.781 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.109 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:92.000 - reflection_check/without_check_length_mean:112.419 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.090 - reflection_check/without_check_reward_mean:-0.110 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:111.781 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.109 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:111.781 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.109 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:111.781 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.109 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:111.781 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.109 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:111.781 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.109 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:111.781 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.109 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:111.781 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.109 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:111.781 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.109 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:111.781 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.109 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:111.781 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.109 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (WorkerDict pid=56488) [rank-5]: Saving model to /data/save/python/global_step_60/actor/model_world_size_8_rank_5.pt [repeated 7x across cluster] (WorkerDict pid=56488) [rank-5]: Saving checkpoint to /data/save/python/global_step_60/actor/model_world_size_8_rank_5.pt [repeated 7x across cluster] (WorkerDict pid=56488) [rank-5]: Saving extra_state to /data/save/python/global_step_60/actor/extra_state_world_size_8_rank_5.pt [repeated 7x across cluster] (TaskRunner pid=54845) 5-bit mux: RegDst=0→Instr15_11, RegDst=1→Instr25_21, RegDst=2→31. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 30%|███ | 61/200 [26:03<1:48:19, 46.76s/it] (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): return 31 if inputs['RegDst']==2 else inputs['Instr25_21'] if inputs['RegDst']==1 else inputs['Instr15_11'] (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:61 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2516.000 - global_seqlen/max:5052.000 - global_seqlen/minmax_diff:2536.000 - global_seqlen/balanced_min:3281.000 - global_seqlen/balanced_max:3324.000 - global_seqlen/mean:3300.500 - actor/entropy:0.000 - actor/pg_loss:-0.363 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:21.504 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.475 - actor/lr:0.000 - critic/score/mean:-0.108 - critic/score/max:-0.058 - critic/score/min:-0.220 - critic/rewards/mean:-0.108 - critic/rewards/max:-0.058 - critic/rewards/min:-0.220 - critic/advantages/mean:-0.217 - critic/advantages/max:1.457 - critic/advantages/min:-1.500 - critic/returns/mean:-0.217 - critic/returns/max:1.457 - critic/returns/min:-1.500 - response_length/mean:110.375 - response_length/max:225.000 - response_length/min:59.000 - response_length/clip_ratio:0.000 - prompt_length/mean:714.750 - prompt_length/max:1171.000 - prompt_length/min:524.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:12.018 - timing_s/reward:0.986 - timing_s/old_log_prob:2.359 - timing_s/adv:0.001 - timing_s/update_actor:17.657 - timing_s/step:33.025 - timing_per_token_ms/gen:3.403 - timing_per_token_ms/update_actor:0.669 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26404.000 - perf/time_per_step:33.025 - perf/throughput:99.938 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:110.375 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.108 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:110.375 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.108 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:110.375 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.108 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:110.375 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.108 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:110.375 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.108 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:110.375 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.108 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:110.375 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.108 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:110.375 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.108 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:110.375 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.108 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:110.375 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.108 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:110.375 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.108 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:110.375 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.108 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Clock divide by 6: toggle every 3 cycles. Reset on pixel_en=0. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.q = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if not inputs['pixel_en']: self.q = 0 (TaskRunner pid=54845) else: self.q ^= (inputs['cnt'] & 1) (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:62 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2368.000 - global_seqlen/max:5445.000 - global_seqlen/minmax_diff:3077.000 - global_seqlen/balanced_min:2912.000 - global_seqlen/balanced_max:3401.000 - global_seqlen/mean:3163.750 - actor/entropy:0.000 - actor/pg_loss:-0.170 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:17.475 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.499 - actor/lr:0.000 - critic/score/mean:-0.119 - critic/score/max:-0.059 - critic/score/min:-0.351 - critic/rewards/mean:-0.119 - critic/rewards/max:-0.059 - critic/rewards/min:-0.351 - critic/advantages/mean:-0.201 - critic/advantages/max:1.371 - critic/advantages/min:-1.497 - critic/returns/mean:-0.201 - critic/returns/max:1.371 - critic/returns/min:-1.497 - response_length/mean:122.188 - response_length/max:359.000 - response_length/min:60.000 - response_length/clip_ratio:0.000 - prompt_length/mean:668.750 - prompt_length/max:1119.000 - prompt_length/min:526.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:19.874 - timing_s/reward:0.999 - timing_s/old_log_prob:2.329 - timing_s/adv:0.001 - timing_s/update_actor:17.663 - timing_s/step:40.870 - timing_per_token_ms/gen:5.083 - timing_per_token_ms/update_actor:0.698 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25310.000 - perf/time_per_step:40.870 - perf/throughput:77.411 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:122.188 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.119 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:122.188 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.119 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:122.188 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.119 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:122.188 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.119 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:122.188 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.119 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:122.188 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.119 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:122.188 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.119 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:122.188 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.119 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:122.188 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.119 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:122.188 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.119 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:122.188 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.119 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:122.188 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.119 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 31%|███ | 62/200 [26:44<1:43:29, 45.00s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 32%|███▏ | 63/200 [27:50<1:56:47, 51.15s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 4-bit binary to base-3 lookup table. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.table = [i for i in range(81)] (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'base_3_binary': self.table[inputs['binary_shift_val'] & 0xF]} (TaskRunner pid=54845) step:63 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2797.000 - global_seqlen/max:8661.000 - global_seqlen/minmax_diff:5864.000 - global_seqlen/balanced_min:3576.000 - global_seqlen/balanced_max:4481.000 - global_seqlen/mean:4007.375 - actor/entropy:0.000 - actor/pg_loss:-0.289 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:19.141 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.474 - actor/lr:0.000 - critic/score/mean:-0.208 - critic/score/max:-0.062 - critic/score/min:-0.807 - critic/rewards/mean:-0.208 - critic/rewards/max:-0.062 - critic/rewards/min:-0.807 - critic/advantages/mean:-0.209 - critic/advantages/max:1.441 - critic/advantages/min:-1.477 - critic/returns/mean:-0.209 - critic/returns/max:1.441 - critic/returns/min:-1.477 - response_length/mean:212.844 - response_length/max:826.000 - response_length/min:64.000 - response_length/clip_ratio:0.000 - prompt_length/mean:789.000 - prompt_length/max:1432.000 - prompt_length/min:517.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:44.158 - timing_s/reward:1.037 - timing_s/old_log_prob:2.402 - timing_s/adv:0.001 - timing_s/update_actor:17.871 - timing_s/step:65.474 - timing_per_token_ms/gen:6.483 - timing_per_token_ms/update_actor:0.557 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:32059.000 - perf/time_per_step:65.474 - perf/throughput:61.206 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:212.844 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.208 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:212.844 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.208 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:212.844 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.208 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:212.844 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.208 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:212.844 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.208 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:212.844 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.208 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:212.844 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.208 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:212.844 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.208 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:212.844 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.208 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:212.844 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.208 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:212.844 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.208 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:212.844 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.208 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Clock divider: counter increments each cycle, reset to 0, output bit selected by SW2. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.c = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if inputs['rst']: self.c = 0 (TaskRunner pid=54845) self.c += 1 (TaskRunner pid=54845) return {'clkdiv': self.c & 0xFFFFFFFF, 'Clk_CPU': (self.c >> (2 if not inputs['SW2'] else 24)) & 1} (TaskRunner pid=54845) step:64 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2451.000 - global_seqlen/max:3821.000 - global_seqlen/minmax_diff:1370.000 - global_seqlen/balanced_min:3076.000 - global_seqlen/balanced_max:3088.000 - global_seqlen/mean:3080.125 - actor/entropy:0.000 - actor/pg_loss:-0.280 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:23.548 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.526 - actor/lr:0.000 - critic/score/mean:-0.088 - critic/score/max:-0.052 - critic/score/min:-0.142 - critic/rewards/mean:-0.088 - critic/rewards/max:-0.052 - critic/rewards/min:-0.142 - critic/advantages/mean:-0.085 - critic/advantages/max:1.414 - critic/advantages/min:-1.451 - critic/returns/mean:-0.085 - critic/returns/max:1.414 - critic/returns/min:-1.451 - response_length/mean:89.781 - response_length/max:145.000 - response_length/min:53.000 - response_length/clip_ratio:0.000 - prompt_length/mean:680.250 - prompt_length/max:882.000 - prompt_length/min:536.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:9.974 - timing_s/reward:0.998 - timing_s/old_log_prob:2.373 - timing_s/adv:0.001 - timing_s/update_actor:17.704 - timing_s/step:31.054 - timing_per_token_ms/gen:3.472 - timing_per_token_ms/update_actor:0.718 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24641.000 - perf/time_per_step:31.054 - perf/throughput:99.185 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:89.781 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.088 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:89.781 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.088 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:89.781 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.088 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:89.781 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.088 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:89.781 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.088 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:89.781 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.088 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:89.781 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.088 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:89.781 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.088 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:89.781 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.088 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:89.781 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.088 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:89.781 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.088 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:89.781 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.088 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 32%|███▏ | 64/200 [28:21<1:42:17, 45.13s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 32%|███▎ | 65/200 [28:59<1:36:40, 42.97s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) VGA timing: active 640x480. h_sync active-low when h_count in [640+16, 640+16+48). v_sync active-low when v_count in [480+10, 480+10+33). video_on = inside active area. animate at h_count==horiz_end and v_count==vert_end. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'x_loc':inputs['h_count']&0x3FF,'y_loc':inputs['v_count']&0x3FF, (TaskRunner pid=54845) 'h_sync':int(not(640<=inputs['h_count']<704)), (TaskRunner pid=54845) 'v_sync':int(not(480<=inputs['v_count']<513)), (TaskRunner pid=54845) 'video_on':int(0<=inputs['h_count']<640 and 0<=inputs['v_count']<480), (TaskRunner pid=54845) 'animate':int(inputs['h_count']==1023 and inputs['v_count']==525)} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:65 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2470.000 - global_seqlen/max:4180.000 - global_seqlen/minmax_diff:1710.000 - global_seqlen/balanced_min:3297.000 - global_seqlen/balanced_max:3313.000 - global_seqlen/mean:3302.500 - actor/entropy:0.000 - actor/pg_loss:-0.311 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:21.754 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.467 - actor/lr:0.000 - critic/score/mean:-0.107 - critic/score/max:-0.063 - critic/score/min:-0.258 - critic/rewards/mean:-0.107 - critic/rewards/max:-0.063 - critic/rewards/min:-0.258 - critic/advantages/mean:-0.111 - critic/advantages/max:1.499 - critic/advantages/min:-1.410 - critic/returns/mean:-0.111 - critic/returns/max:1.499 - critic/returns/min:-1.410 - response_length/mean:110.000 - response_length/max:264.000 - response_length/min:65.000 - response_length/clip_ratio:0.000 - prompt_length/mean:715.625 - prompt_length/max:895.000 - prompt_length/min:543.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:16.339 - timing_s/reward:0.994 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:18.188 - timing_s/step:37.909 - timing_per_token_ms/gen:4.642 - timing_per_token_ms/update_actor:0.688 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26420.000 - perf/time_per_step:37.909 - perf/throughput:87.116 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:110.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.107 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:110.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.107 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:110.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.107 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:110.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.107 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:110.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.107 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:110.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.107 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:110.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.107 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:110.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.107 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:110.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.107 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:110.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.107 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:110.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.107 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:110.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.107 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) RS232 tx: start+8data+stop. Shift out LSB first. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.shift = 0 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) if not inputs['reset_neg']: self.shift = 0 (TaskRunner pid=54845) elif inputs['tx_datain_ready']: self.shift = inputs['tx_datain'] (TaskRunner pid=54845) self.shift = (self.shift >> 1) & 0xFF (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:66 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2562.000 - global_seqlen/max:7067.000 - global_seqlen/minmax_diff:4505.000 - global_seqlen/balanced_min:3101.000 - global_seqlen/balanced_max:3865.000 - global_seqlen/mean:3476.375 - actor/entropy:0.000 - actor/pg_loss:-0.106 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:83.899 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.471 - actor/lr:0.000 - critic/score/mean:-0.086 - critic/score/max:-0.003 - critic/score/min:-0.157 - critic/rewards/mean:-0.086 - critic/rewards/max:-0.003 - critic/rewards/min:-0.157 - critic/advantages/mean:-0.199 - critic/advantages/max:1.489 - critic/advantages/min:-1.473 - critic/returns/mean:-0.199 - critic/returns/max:1.489 - critic/returns/min:-1.473 - response_length/mean:88.219 - response_length/max:161.000 - response_length/min:3.000 - response_length/clip_ratio:0.000 - prompt_length/mean:780.875 - prompt_length/max:1712.000 - prompt_length/min:569.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:11.345 - timing_s/reward:1.019 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:18.367 - timing_s/step:33.119 - timing_per_token_ms/gen:4.019 - timing_per_token_ms/update_actor:0.660 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27811.000 - perf/time_per_step:33.119 - perf/throughput:104.966 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:88.219 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.086 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:88.219 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.086 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:88.219 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.086 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:88.219 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.086 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:88.219 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.086 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:88.219 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.086 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:88.219 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.086 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:88.219 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.086 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:88.219 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.086 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:88.219 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.086 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:88.219 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.086 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:88.219 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.086 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 33%|███▎ | 66/200 [29:32<1:29:22, 40.02s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 34%|███▎ | 67/200 [30:04<1:23:41, 37.75s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Finite-state machine with 5 states. Encode states as integers 0-4. (TaskRunner pid=54845) Compute outputs from current state, compute next state from current state + input. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'z': int(inputs['y'] in (3, 4)), 'Y0': int((inputs['y'], inputs['x']) in [(0, 1), (2, 1), (3, 0)])} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:67 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2310.000 - global_seqlen/max:3613.000 - global_seqlen/minmax_diff:1303.000 - global_seqlen/balanced_min:2918.000 - global_seqlen/balanced_max:2960.000 - global_seqlen/mean:2935.375 - actor/entropy:0.000 - actor/pg_loss:-0.789 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:16.824 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.506 - actor/lr:0.000 - critic/score/mean:-0.088 - critic/score/max:-0.061 - critic/score/min:-0.153 - critic/rewards/mean:-0.088 - critic/rewards/max:-0.061 - critic/rewards/min:-0.153 - critic/advantages/mean:-0.123 - critic/advantages/max:1.469 - critic/advantages/min:-1.490 - critic/returns/mean:-0.123 - critic/returns/max:1.469 - critic/returns/min:-1.490 - response_length/mean:89.719 - response_length/max:157.000 - response_length/min:62.000 - response_length/clip_ratio:0.000 - prompt_length/mean:644.125 - prompt_length/max:810.000 - prompt_length/min:511.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:10.927 - timing_s/reward:1.024 - timing_s/old_log_prob:2.372 - timing_s/adv:0.001 - timing_s/update_actor:18.121 - timing_s/step:32.448 - timing_per_token_ms/gen:3.806 - timing_per_token_ms/update_actor:0.772 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23483.000 - perf/time_per_step:32.448 - perf/throughput:90.463 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:89.719 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.088 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:89.719 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.088 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:89.719 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.088 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:89.719 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.088 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:89.719 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.088 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:89.719 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.088 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:89.719 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.088 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:89.719 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.088 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:89.719 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.088 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:89.719 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.088 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:89.719 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.088 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:89.719 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.088 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Multiplexer: select input based on select lines. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'y': inputs['sel'] & 3} (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:68 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2426.000 - global_seqlen/max:3734.000 - global_seqlen/minmax_diff:1308.000 - global_seqlen/balanced_min:3050.000 - global_seqlen/balanced_max:3072.000 - global_seqlen/mean:3060.875 - actor/entropy:0.000 - actor/pg_loss:0.709 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:21.648 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.467 - actor/lr:0.000 - critic/score/mean:-0.087 - critic/score/max:-0.057 - critic/score/min:-0.150 - critic/rewards/mean:-0.087 - critic/rewards/max:-0.057 - critic/rewards/min:-0.150 - critic/advantages/mean:-0.155 - critic/advantages/max:1.453 - critic/advantages/min:-1.495 - critic/returns/mean:-0.155 - critic/returns/max:1.453 - critic/returns/min:-1.495 - response_length/mean:89.219 - response_length/max:154.000 - response_length/min:58.000 - response_length/clip_ratio:0.000 - prompt_length/mean:676.000 - prompt_length/max:860.000 - prompt_length/min:538.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:10.933 - timing_s/reward:1.055 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:17.487 - timing_s/step:31.863 - timing_per_token_ms/gen:3.829 - timing_per_token_ms/update_actor:0.714 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24487.000 - perf/time_per_step:31.863 - perf/throughput:96.064 - reflection/any_word_frequency:1.000 - reflection/with_length_mean:151.000 - reflection/without_length_mean:87.226 - reflection/with_correct_ratio:0.000 - reflection/without_correct_ratio:0.000 - reflection/with_reward_mean:-0.147 - reflection/without_reward_mean:-0.085 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:89.219 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.087 - reflection_check/word_check_frequency:1.000 - reflection_check/with_check_length_mean:151.000 - reflection_check/without_check_length_mean:87.226 - reflection_check/with_check_correct_ratio:0.000 - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:-0.147 - reflection_check/without_check_reward_mean:-0.085 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:89.219 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.087 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:89.219 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.087 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:89.219 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.087 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:89.219 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.087 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:89.219 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.087 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:89.219 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.087 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:89.219 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.087 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:89.219 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.087 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:89.219 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.087 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:89.219 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.087 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 34%|███▍ | 68/200 [30:36<1:19:10, 35.99s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 34%|███▍ | 69/200 [31:14<1:19:47, 36.54s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) BCD to 7-segment lookup table. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.table = [0x3F, 0x06, 0x5B, 0x4F, 0x66, 0x6D, 0x7D, 0x07, 0x7F, 0x6E] (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return self.table[inputs & 0xF] if inputs <= 9 else 0 (TaskRunner pid=54845) step:69 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2497.000 - global_seqlen/max:3546.000 - global_seqlen/minmax_diff:1049.000 - global_seqlen/balanced_min:2933.000 - global_seqlen/balanced_max:3014.000 - global_seqlen/mean:2947.750 - actor/entropy:0.000 - actor/pg_loss:0.088 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:19.767 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.491 - actor/lr:0.000 - critic/score/mean:-0.112 - critic/score/max:-0.055 - critic/score/min:-0.247 - critic/rewards/mean:-0.112 - critic/rewards/max:-0.055 - critic/rewards/min:-0.247 - critic/advantages/mean:-0.110 - critic/advantages/max:1.469 - critic/advantages/min:-1.498 - critic/returns/mean:-0.110 - critic/returns/max:1.469 - critic/returns/min:-1.498 - response_length/mean:114.562 - response_length/max:253.000 - response_length/min:56.000 - response_length/clip_ratio:0.000 - prompt_length/mean:622.375 - prompt_length/max:730.000 - prompt_length/min:544.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:16.391 - timing_s/reward:1.007 - timing_s/old_log_prob:2.395 - timing_s/adv:0.001 - timing_s/update_actor:18.025 - timing_s/step:37.823 - timing_per_token_ms/gen:4.471 - timing_per_token_ms/update_actor:0.764 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23582.000 - perf/time_per_step:37.823 - perf/throughput:77.936 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:114.562 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.112 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:114.562 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.112 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:114.562 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.112 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:114.562 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.112 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:114.562 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.112 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:114.562 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.112 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:114.562 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.112 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:114.562 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.112 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:114.562 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.112 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:114.562 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.112 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:114.562 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.112 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:114.562 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.112 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 16-bit shifter with three modes. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return 0xFFFF & (inputs['a'] << inputs['b']) if inputs['alufn']==0 else \ (TaskRunner pid=54845) 0xFFFF & (inputs['a'] >> inputs['b']) if inputs['alufn']==1 else \ (TaskRunner pid=54845) 0xFFFF & (inputs['a'] >> inputs['b']) if inputs['alufn']==3 else 0 (TaskRunner pid=54845) ``` (TaskRunner pid=54845) step:70 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2132.000 - global_seqlen/max:3866.000 - global_seqlen/minmax_diff:1734.000 - global_seqlen/balanced_min:2966.000 - global_seqlen/balanced_max:2999.000 - global_seqlen/mean:2982.625 - actor/entropy:0.000 - actor/pg_loss:-0.079 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:20.317 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.500 - actor/lr:0.000 - critic/score/mean:-0.072 - critic/score/max:-0.047 - critic/score/min:-0.123 - critic/rewards/mean:-0.072 - critic/rewards/max:-0.047 - critic/rewards/min:-0.123 - critic/advantages/mean:-0.065 - critic/advantages/max:1.500 - critic/advantages/min:-1.498 - critic/returns/mean:-0.065 - critic/returns/max:1.500 - critic/returns/min:-1.498 - response_length/mean:73.781 - response_length/max:126.000 - response_length/min:48.000 - response_length/clip_ratio:0.000 - prompt_length/mean:671.875 - prompt_length/max:896.000 - prompt_length/min:475.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:9.575 - timing_s/reward:1.010 - timing_s/old_log_prob:2.375 - timing_s/adv:0.001 - timing_s/update_actor:17.086 - timing_s/step:30.050 - timing_per_token_ms/gen:4.055 - timing_per_token_ms/update_actor:0.716 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23861.000 - perf/time_per_step:30.050 - perf/throughput:99.256 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:73.781 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.072 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:73.781 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.072 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:73.781 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.072 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:73.781 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.072 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:73.781 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.072 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:73.781 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.072 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:73.781 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.072 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:73.781 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.072 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:73.781 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.072 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:73.781 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.072 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:73.781 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.072 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:73.781 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.072 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 35%|███▌ | 70/200 [31:44<1:14:58, 34.60s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 36%|███▌ | 71/200 [32:19<1:14:22, 34.59s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Clock toggle at 500ps intervals. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) self.clk = 1 (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) self.clk ^= 1 (TaskRunner pid=54845) step:71 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2229.000 - global_seqlen/max:3957.000 - global_seqlen/minmax_diff:1728.000 - global_seqlen/balanced_min:2956.000 - global_seqlen/balanced_max:3031.000 - global_seqlen/mean:2983.375 - actor/entropy:0.000 - actor/pg_loss:0.349 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:73.817 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.488 - actor/lr:0.000 - critic/score/mean:-0.079 - critic/score/max:-0.006 - critic/score/min:-0.214 - critic/rewards/mean:-0.079 - critic/rewards/max:-0.006 - critic/rewards/min:-0.214 - critic/advantages/mean:-0.211 - critic/advantages/max:1.412 - critic/advantages/min:-1.499 - critic/returns/mean:-0.211 - critic/returns/max:1.412 - critic/returns/min:-1.499 - response_length/mean:80.719 - response_length/max:219.000 - response_length/min:6.000 - response_length/clip_ratio:0.000 - prompt_length/mean:665.125 - prompt_length/max:861.000 - prompt_length/min:502.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:13.607 - timing_s/reward:0.998 - timing_s/old_log_prob:2.328 - timing_s/adv:0.001 - timing_s/update_actor:17.618 - timing_s/step:34.556 - timing_per_token_ms/gen:5.268 - timing_per_token_ms/update_actor:0.738 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23867.000 - perf/time_per_step:34.556 - perf/throughput:86.334 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:80.719 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.079 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:80.719 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.079 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:80.719 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.079 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:80.719 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.079 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:80.719 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.079 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:80.719 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.079 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:80.719 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.079 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:80.719 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.079 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:80.719 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.079 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:80.719 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.079 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:80.719 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.079 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:80.719 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.079 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 38-bit ALU with multiply/add. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return 0 if not inputs['RESETN'] else inputs['A'] * inputs['B'] & 0x3FFFFFFFF (TaskRunner pid=54845) step:72 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2417.000 - global_seqlen/max:3981.000 - global_seqlen/minmax_diff:1564.000 - global_seqlen/balanced_min:2911.000 - global_seqlen/balanced_max:2959.000 - global_seqlen/mean:2940.000 - actor/entropy:0.000 - actor/pg_loss:0.111 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:44.504 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.461 - actor/lr:0.000 - critic/score/mean:-0.062 - critic/score/max:-0.002 - critic/score/min:-0.145 - critic/rewards/mean:-0.062 - critic/rewards/max:-0.002 - critic/rewards/min:-0.145 - critic/advantages/mean:-0.222 - critic/advantages/max:1.479 - critic/advantages/min:-1.437 - critic/returns/mean:-0.222 - critic/returns/max:1.479 - critic/returns/min:-1.437 - response_length/mean:63.625 - response_length/max:148.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:671.375 - prompt_length/max:879.000 - prompt_length/min:545.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:10.123 - timing_s/reward:0.994 - timing_s/old_log_prob:2.378 - timing_s/adv:0.001 - timing_s/update_actor:18.005 - timing_s/step:31.504 - timing_per_token_ms/gen:4.972 - timing_per_token_ms/update_actor:0.766 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23520.000 - perf/time_per_step:31.504 - perf/throughput:93.321 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:63.625 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.062 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:63.625 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.062 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:63.625 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.062 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:63.625 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.062 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:63.625 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.062 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:63.625 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.062 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:63.625 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.062 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:63.625 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.062 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:63.625 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.062 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:63.625 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.062 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:63.625 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.062 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:63.625 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.062 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 36%|███▌ | 72/200 [32:50<1:11:50, 33.67s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 36%|███▋ | 73/200 [33:20<1:08:37, 32.42s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 4-point FFT with butterfly operations. (TaskRunner pid=54845) ```python (TaskRunner pid=54845) class TopModule: (TaskRunner pid=54845) def __init__(self): (TaskRunner pid=54845) pass (TaskRunner pid=54845) (TaskRunner pid=54845) def eval(self, inputs): (TaskRunner pid=54845) return {'OP0': 0, 'OP0i': 0, 'OP1': 0, 'OP1i': 0, 'OP2': 0, 'OP2i': 0, 'OP3': 0, 'OP3i': 0} (TaskRunner pid=54845) step:73 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2391.000 - global_seqlen/max:5973.000 - global_seqlen/minmax_diff:3582.000 - global_seqlen/balanced_min:3411.000 - global_seqlen/balanced_max:3441.000 - global_seqlen/mean:3429.875 - actor/entropy:0.000 - actor/pg_loss:0.090 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:62.926 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.514 - actor/lr:0.000 - critic/score/mean:-0.050 - critic/score/max:-0.002 - critic/score/min:-0.100 - critic/rewards/mean:-0.050 - critic/rewards/max:-0.002 - critic/rewards/min:-0.100 - critic/advantages/mean:-0.280 - critic/advantages/max:1.490 - critic/advantages/min:-1.287 - critic/returns/mean:-0.280 - critic/returns/max:1.490 - critic/returns/min:-1.287 - response_length/mean:51.469 - response_length/max:102.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:806.000 - prompt_length/max:1445.000 - prompt_length/min:556.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:9.160 - timing_s/reward:0.990 - timing_s/old_log_prob:2.304 - timing_s/adv:0.001 - timing_s/update_actor:17.012 - timing_s/step:29.471 - timing_per_token_ms/gen:5.562 - timing_per_token_ms/update_actor:0.620 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:27439.000 - perf/time_per_step:29.471 - perf/throughput:116.382 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:51.469 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.050 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:51.469 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.050 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:51.469 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.050 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:51.469 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.050 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:51.469 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.050 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:51.469 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.050 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:51.469 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.050 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:51.469 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.050 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:51.469 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.050 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:51.469 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.050 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:51.469 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.050 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:51.469 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.050 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 16x4 (TaskRunner pid=54845) step:74 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2118.000 - global_seqlen/max:3645.000 - global_seqlen/minmax_diff:1527.000 - global_seqlen/balanced_min:2534.000 - global_seqlen/balanced_max:2693.000 - global_seqlen/mean:2615.000 - actor/entropy:0.000 - actor/pg_loss:-0.131 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:141.353 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.506 - actor/lr:0.000 - critic/score/mean:-0.026 - critic/score/max:-0.002 - critic/score/min:-0.102 - critic/rewards/mean:-0.026 - critic/rewards/max:-0.002 - critic/rewards/min:-0.102 - critic/advantages/mean:-0.602 - critic/advantages/max:1.500 - critic/advantages/min:-1.500 - critic/returns/mean:-0.602 - critic/returns/max:1.500 - critic/returns/min:-1.500 - response_length/mean:26.125 - response_length/max:104.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:627.625 - prompt_length/max:876.000 - prompt_length/min:508.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:8.227 - timing_s/reward:1.011 - timing_s/old_log_prob:2.374 - timing_s/adv:0.001 - timing_s/update_actor:17.062 - timing_s/step:28.679 - timing_per_token_ms/gen:9.841 - timing_per_token_ms/update_actor:0.816 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20920.000 - perf/time_per_step:28.679 - perf/throughput:91.182 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:26.125 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.026 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:26.125 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.026 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:26.125 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.026 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:26.125 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.026 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:26.125 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.026 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:26.125 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.026 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:26.125 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.026 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:26.125 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.026 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:26.125 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.026 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:26.125 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.026 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:26.125 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.026 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:26.125 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.026 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 37%|███▋ | 74/200 [33:48<1:05:43, 31.30s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 38%|███▊ | 75/200 [34:16<1:02:35, 30.04s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 5 (TaskRunner pid=54845) step:75 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2108.000 - global_seqlen/max:2963.000 - global_seqlen/minmax_diff:855.000 - global_seqlen/balanced_min:2587.000 - global_seqlen/balanced_max:2614.000 - global_seqlen/mean:2601.000 - actor/entropy:0.000 - actor/pg_loss:-0.071 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:172.269 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.505 - actor/lr:0.000 - critic/score/mean:-0.006 - critic/score/max:-0.002 - critic/score/min:-0.055 - critic/rewards/mean:-0.006 - critic/rewards/max:-0.002 - critic/rewards/min:-0.055 - critic/advantages/mean:-0.600 - critic/advantages/max:0.859 - critic/advantages/min:-1.500 - critic/returns/mean:-0.600 - critic/returns/max:0.859 - critic/returns/min:-1.500 - response_length/mean:6.625 - response_length/max:56.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.625 - prompt_length/max:729.000 - prompt_length/min:525.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:6.003 - timing_s/reward:0.980 - timing_s/old_log_prob:2.337 - timing_s/adv:0.001 - timing_s/update_actor:17.775 - timing_s/step:27.101 - timing_per_token_ms/gen:28.317 - timing_per_token_ms/update_actor:0.854 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20808.000 - perf/time_per_step:27.101 - perf/throughput:95.976 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:6.625 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.006 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:6.625 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.006 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:6.625 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.006 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:6.625 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.006 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:6.625 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.006 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:6.625 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.006 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:6.625 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.006 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:6.625 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.006 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:6.625 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.006 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:6.625 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.006 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:6.625 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.006 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:6.625 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.006 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 1-bit SR latch with enable. (TaskRunner pid=54845) step:76 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2214.000 - global_seqlen/max:3555.000 - global_seqlen/minmax_diff:1341.000 - global_seqlen/balanced_min:2696.000 - global_seqlen/balanced_max:2754.000 - global_seqlen/mean:2725.625 - actor/entropy:0.000 - actor/pg_loss:0.123 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:208.276 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.521 - actor/lr:0.000 - critic/score/mean:-0.006 - critic/score/max:-0.002 - critic/score/min:-0.014 - critic/rewards/mean:-0.006 - critic/rewards/max:-0.002 - critic/rewards/min:-0.014 - critic/advantages/mean:-0.278 - critic/advantages/max:1.500 - critic/advantages/min:-1.492 - critic/returns/mean:-0.278 - critic/returns/max:1.500 - critic/returns/min:-1.492 - response_length/mean:6.281 - response_length/max:14.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:675.125 - prompt_length/max:883.000 - prompt_length/min:542.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:5.006 - timing_s/reward:1.003 - timing_s/old_log_prob:2.384 - timing_s/adv:0.001 - timing_s/update_actor:17.254 - timing_s/step:25.651 - timing_per_token_ms/gen:24.905 - timing_per_token_ms/update_actor:0.791 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21805.000 - perf/time_per_step:25.651 - perf/throughput:106.256 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:6.281 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.006 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:6.281 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.006 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:6.281 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.006 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:6.281 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.006 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:6.281 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.006 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:6.281 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.006 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:6.281 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.006 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:6.281 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.006 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:6.281 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.006 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:6.281 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.006 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:6.281 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.006 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:6.281 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.006 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 38%|███▊ | 76/200 [34:41<59:22, 28.73s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 38%|███▊ | 77/200 [35:05<55:55, 27.28s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) SR latch (TaskRunner pid=54845) step:77 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2382.000 - global_seqlen/max:2848.000 - global_seqlen/minmax_diff:466.000 - global_seqlen/balanced_min:2501.000 - global_seqlen/balanced_max:2547.000 - global_seqlen/mean:2523.375 - actor/entropy:0.000 - actor/pg_loss:0.375 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:197.534 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.527 - actor/lr:0.000 - critic/score/mean:-0.004 - critic/score/max:-0.002 - critic/score/min:-0.011 - critic/rewards/mean:-0.004 - critic/rewards/max:-0.002 - critic/rewards/min:-0.011 - critic/advantages/mean:-0.038 - critic/advantages/max:0.499 - critic/advantages/min:-1.498 - critic/returns/mean:-0.038 - critic/returns/max:0.499 - critic/returns/min:-1.498 - response_length/mean:3.719 - response_length/max:11.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:627.125 - prompt_length/max:708.000 - prompt_length/min:586.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.170 - timing_s/reward:1.018 - timing_s/old_log_prob:2.369 - timing_s/adv:0.001 - timing_s/update_actor:17.325 - timing_s/step:23.887 - timing_per_token_ms/gen:26.641 - timing_per_token_ms/update_actor:0.858 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20187.000 - perf/time_per_step:23.887 - perf/throughput:105.638 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:3.719 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.004 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:3.719 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.004 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:3.719 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.004 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:3.719 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.004 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:3.719 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.004 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:3.719 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.004 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:3.719 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.004 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:3.719 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.004 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:3.719 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.004 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:3.719 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.004 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:3.719 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.004 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:3.719 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.004 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 32 (TaskRunner pid=54845) step:78 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2088.000 - global_seqlen/max:3112.000 - global_seqlen/minmax_diff:1024.000 - global_seqlen/balanced_min:2562.000 - global_seqlen/balanced_max:2596.000 - global_seqlen/mean:2579.750 - actor/entropy:0.000 - actor/pg_loss:0.097 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:427.005 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.490 - actor/lr:0.000 - critic/score/mean:-0.005 - critic/score/max:-0.002 - critic/score/min:-0.011 - critic/rewards/mean:-0.005 - critic/rewards/max:-0.002 - critic/rewards/min:-0.011 - critic/advantages/mean:-0.167 - critic/advantages/max:1.499 - critic/advantages/min:-1.497 - critic/returns/mean:-0.167 - critic/returns/max:1.499 - critic/returns/min:-1.497 - response_length/mean:4.812 - response_length/max:11.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:640.125 - prompt_length/max:775.000 - prompt_length/min:520.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.986 - timing_s/reward:0.996 - timing_s/old_log_prob:2.389 - timing_s/adv:0.001 - timing_s/update_actor:17.763 - timing_s/step:25.139 - timing_per_token_ms/gen:25.884 - timing_per_token_ms/update_actor:0.861 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20638.000 - perf/time_per_step:25.139 - perf/throughput:102.621 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:4.812 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.005 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:4.812 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.005 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:4.812 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.005 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:4.812 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.005 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:4.812 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.005 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:4.812 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.005 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:4.812 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.005 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:4.812 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.005 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:4.812 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.005 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:4.812 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.005 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:4.812 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.005 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:4.812 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.005 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 39%|███▉ | 78/200 [35:30<54:10, 26.64s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 40%|███▉ | 79/200 [35:55<52:32, 26.05s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 12 (TaskRunner pid=54845) step:79 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2188.000 - global_seqlen/max:4045.000 - global_seqlen/minmax_diff:1857.000 - global_seqlen/balanced_min:2575.000 - global_seqlen/balanced_max:2750.000 - global_seqlen/mean:2663.000 - actor/entropy:0.000 - actor/pg_loss:0.374 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:101.446 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.485 - actor/lr:0.000 - critic/score/mean:-0.003 - critic/score/max:-0.002 - critic/score/min:-0.010 - critic/rewards/mean:-0.003 - critic/rewards/max:-0.002 - critic/rewards/min:-0.010 - critic/advantages/mean:-0.136 - critic/advantages/max:0.500 - critic/advantages/min:-1.500 - critic/returns/mean:-0.136 - critic/returns/max:0.500 - critic/returns/min:-1.500 - response_length/mean:2.750 - response_length/max:10.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:663.000 - prompt_length/max:1009.000 - prompt_length/min:545.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.931 - timing_s/reward:1.061 - timing_s/old_log_prob:2.380 - timing_s/adv:0.001 - timing_s/update_actor:17.283 - timing_s/step:24.660 - timing_per_token_ms/gen:44.671 - timing_per_token_ms/update_actor:0.811 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21304.000 - perf/time_per_step:24.660 - perf/throughput:107.987 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.750 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.003 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.750 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.003 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.750 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.003 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.750 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.003 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.750 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.003 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.750 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.003 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.750 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.003 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.750 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.003 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.750 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.003 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.750 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.003 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.750 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.003 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.750 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.003 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 10 (TaskRunner pid=54845) step:80 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2220.000 - global_seqlen/max:3032.000 - global_seqlen/minmax_diff:812.000 - global_seqlen/balanced_min:2491.000 - global_seqlen/balanced_max:2501.000 - global_seqlen/mean:2494.875 - actor/entropy:0.000 - actor/pg_loss:0.541 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:196.831 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.494 - actor/lr:0.000 - critic/score/mean:-0.003 - critic/score/max:-0.002 - critic/score/min:-0.007 - critic/rewards/mean:-0.003 - critic/rewards/max:-0.002 - critic/rewards/min:-0.007 - critic/advantages/mean:-0.207 - critic/advantages/max:0.866 - critic/advantages/min:-1.498 - critic/returns/mean:-0.207 - critic/returns/max:0.866 - critic/returns/min:-1.498 - response_length/mean:3.219 - response_length/max:7.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:620.500 - prompt_length/max:755.000 - prompt_length/min:553.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.014 - timing_s/reward:1.006 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:17.726 - timing_s/step:24.134 - timing_per_token_ms/gen:29.263 - timing_per_token_ms/update_actor:0.888 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19959.000 - perf/time_per_step:24.134 - perf/throughput:103.377 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:3.219 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.003 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:3.219 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.003 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:3.219 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.003 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:3.219 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.003 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:3.219 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.003 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:3.219 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.003 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:3.219 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.003 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:3.219 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.003 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:3.219 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.003 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:3.219 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.003 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:3.219 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.003 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:3.219 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.003 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 40%|████ | 80/200 [36:19<50:58, 25.49s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 40%|████ | 81/200 [36:44<50:13, 25.32s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 5 (TaskRunner pid=54845) step:81 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2248.000 - global_seqlen/max:3924.000 - global_seqlen/minmax_diff:1676.000 - global_seqlen/balanced_min:2907.000 - global_seqlen/balanced_max:2922.000 - global_seqlen/mean:2914.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:164.285 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.508 - actor/lr:0.000 - critic/score/mean:-0.003 - critic/score/max:-0.002 - critic/score/min:-0.007 - critic/rewards/mean:-0.003 - critic/rewards/max:-0.002 - critic/rewards/min:-0.007 - critic/advantages/mean:-0.118 - critic/advantages/max:0.866 - critic/advantages/min:-0.866 - critic/returns/mean:-0.118 - critic/returns/max:0.866 - critic/returns/min:-0.866 - response_length/mean:2.750 - response_length/max:7.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:725.875 - prompt_length/max:979.000 - prompt_length/min:560.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.754 - timing_s/reward:0.980 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:17.812 - timing_s/step:24.933 - timing_per_token_ms/gen:42.659 - timing_per_token_ms/update_actor:0.764 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23316.000 - perf/time_per_step:24.933 - perf/throughput:116.891 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.750 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.003 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.750 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.003 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.750 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.003 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.750 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.003 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.750 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.003 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.750 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.003 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.750 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.003 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.750 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.003 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.750 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.003 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.750 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.003 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.750 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.003 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.750 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.003 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 4 (TaskRunner pid=54845) step:82 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2528.000 - global_seqlen/max:3001.000 - global_seqlen/minmax_diff:473.000 - global_seqlen/balanced_min:2711.000 - global_seqlen/balanced_max:2727.000 - global_seqlen/mean:2719.375 - actor/entropy:0.000 - actor/pg_loss:0.091 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:193.696 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.506 - actor/lr:0.000 - critic/score/mean:-0.003 - critic/score/max:-0.002 - critic/score/min:-0.006 - critic/rewards/mean:-0.003 - critic/rewards/max:-0.002 - critic/rewards/min:-0.006 - critic/advantages/mean:-0.091 - critic/advantages/max:1.497 - critic/advantages/min:-1.497 - critic/returns/mean:-0.091 - critic/returns/max:1.497 - critic/returns/min:-1.497 - response_length/mean:2.594 - response_length/max:6.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:677.250 - prompt_length/max:745.000 - prompt_length/min:630.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.723 - timing_s/reward:1.031 - timing_s/old_log_prob:2.375 - timing_s/adv:0.001 - timing_s/update_actor:17.494 - timing_s/step:24.628 - timing_per_token_ms/gen:44.860 - timing_per_token_ms/update_actor:0.804 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21755.000 - perf/time_per_step:24.628 - perf/throughput:110.418 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.594 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.003 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.594 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.003 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.594 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.003 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.594 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.003 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.594 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.003 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.594 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.003 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.594 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.003 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.594 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.003 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.594 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.003 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.594 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.003 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.594 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.003 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.594 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.003 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 41%|████ | 82/200 [37:09<49:24, 25.12s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 42%|████▏ | 83/200 [37:33<48:36, 24.93s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 12 (TaskRunner pid=54845) step:83 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2192.000 - global_seqlen/max:3068.000 - global_seqlen/minmax_diff:876.000 - global_seqlen/balanced_min:2306.000 - global_seqlen/balanced_max:2449.000 - global_seqlen/mean:2377.500 - actor/entropy:0.000 - actor/pg_loss:0.374 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:158.110 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.517 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.004 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.004 - critic/advantages/mean:-0.059 - critic/advantages/max:0.864 - critic/advantages/min:-1.497 - critic/returns/mean:-0.059 - critic/returns/max:0.864 - critic/returns/min:-1.497 - response_length/mean:2.500 - response_length/max:4.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:591.875 - prompt_length/max:765.000 - prompt_length/min:546.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.787 - timing_s/reward:1.000 - timing_s/old_log_prob:2.354 - timing_s/adv:0.001 - timing_s/update_actor:17.321 - timing_s/step:24.467 - timing_per_token_ms/gen:47.335 - timing_per_token_ms/update_actor:0.911 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19020.000 - perf/time_per_step:24.467 - perf/throughput:97.173 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.500 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.500 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.500 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.500 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.500 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.500 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.500 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.500 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.500 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.500 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.500 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.500 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 2 (TaskRunner pid=54845) step:84 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2248.000 - global_seqlen/max:3084.000 - global_seqlen/minmax_diff:836.000 - global_seqlen/balanced_min:2566.000 - global_seqlen/balanced_max:2570.000 - global_seqlen/mean:2568.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.522 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:640.000 - prompt_length/max:769.000 - prompt_length/min:560.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:4.494 - timing_s/reward:1.012 - timing_s/old_log_prob:2.294 - timing_s/adv:0.001 - timing_s/update_actor:17.609 - timing_s/step:25.414 - timing_per_token_ms/gen:70.222 - timing_per_token_ms/update_actor:0.857 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20544.000 - perf/time_per_step:25.414 - perf/throughput:101.045 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 42%|████▏ | 84/200 [37:59<48:29, 25.08s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 42%|████▎ | 85/200 [38:24<48:01, 25.06s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 8 (TaskRunner pid=54845) step:85 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2040.000 - global_seqlen/max:3276.000 - global_seqlen/minmax_diff:1236.000 - global_seqlen/balanced_min:2549.000 - global_seqlen/balanced_max:2575.000 - global_seqlen/mean:2562.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.527 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:638.500 - prompt_length/max:817.000 - prompt_length/min:508.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.974 - timing_s/reward:0.994 - timing_s/old_log_prob:2.379 - timing_s/adv:0.001 - timing_s/update_actor:17.653 - timing_s/step:25.005 - timing_per_token_ms/gen:62.091 - timing_per_token_ms/update_actor:0.861 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20496.000 - perf/time_per_step:25.005 - perf/throughput:102.461 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 5 (TaskRunner pid=54845) step:86 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2212.000 - global_seqlen/max:4624.000 - global_seqlen/minmax_diff:2412.000 - global_seqlen/balanced_min:2728.000 - global_seqlen/balanced_max:2967.000 - global_seqlen/mean:2847.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.506 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:709.875 - prompt_length/max:1154.000 - prompt_length/min:551.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.484 - timing_s/reward:1.020 - timing_s/old_log_prob:2.371 - timing_s/adv:0.001 - timing_s/update_actor:17.074 - timing_s/step:23.954 - timing_per_token_ms/gen:54.431 - timing_per_token_ms/update_actor:0.750 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22780.000 - perf/time_per_step:23.954 - perf/throughput:118.874 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 43%|████▎ | 86/200 [38:48<46:59, 24.73s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 44%|████▎ | 87/200 [39:12<46:22, 24.62s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 3 (TaskRunner pid=54845) step:87 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2284.000 - global_seqlen/max:4223.000 - global_seqlen/minmax_diff:1939.000 - global_seqlen/balanced_min:2596.000 - global_seqlen/balanced_max:2884.000 - global_seqlen/mean:2740.125 - actor/entropy:0.000 - actor/pg_loss:0.125 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:627.916 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.521 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.003 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.003 - critic/advantages/mean:-0.047 - critic/advantages/max:1.497 - critic/advantages/min:-0.864 - critic/returns/mean:-0.047 - critic/returns/max:1.497 - critic/returns/min:-0.864 - response_length/mean:2.156 - response_length/max:3.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:682.875 - prompt_length/max:1053.000 - prompt_length/min:569.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.218 - timing_s/reward:1.013 - timing_s/old_log_prob:2.377 - timing_s/adv:0.001 - timing_s/update_actor:17.738 - timing_s/step:24.350 - timing_per_token_ms/gen:46.639 - timing_per_token_ms/update_actor:0.809 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21921.000 - perf/time_per_step:24.350 - perf/throughput:112.529 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.156 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.156 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.156 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.156 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.156 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.156 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.156 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.156 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.156 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.156 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.156 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.156 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) Positive (TaskRunner pid=54845) step:88 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2256.000 - global_seqlen/max:3644.000 - global_seqlen/minmax_diff:1388.000 - global_seqlen/balanced_min:2601.000 - global_seqlen/balanced_max:2712.000 - global_seqlen/mean:2656.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.531 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:662.125 - prompt_length/max:909.000 - prompt_length/min:562.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.425 - timing_s/reward:1.010 - timing_s/old_log_prob:2.393 - timing_s/adv:0.001 - timing_s/update_actor:17.671 - timing_s/step:24.504 - timing_per_token_ms/gen:53.517 - timing_per_token_ms/update_actor:0.832 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21252.000 - perf/time_per_step:24.504 - perf/throughput:108.410 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 44%|████▍ | 88/200 [39:37<45:54, 24.60s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 44%|████▍ | 89/200 [40:01<45:21, 24.52s/it] (WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 3 (TaskRunner pid=54845) step:89 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:3072.000 - global_seqlen/minmax_diff:924.000 - global_seqlen/balanced_min:2528.000 - global_seqlen/balanced_max:2538.000 - global_seqlen/mean:2533.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.534 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:631.250 - prompt_length/max:766.000 - prompt_length/min:535.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.628 - timing_s/reward:1.021 - timing_s/old_log_prob:2.380 - timing_s/adv:0.001 - timing_s/update_actor:17.279 - timing_s/step:24.313 - timing_per_token_ms/gen:56.687 - timing_per_token_ms/update_actor:0.853 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20264.000 - perf/time_per_step:24.313 - perf/throughput:104.183 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 5 (TaskRunner pid=54845) local_global_step_folder: /data/save/python/global_step_90 (WorkerDict pid=56161) [rank-0]: Saving model to /data/save/python/global_step_90/actor/model_world_size_8_rank_0.pt (WorkerDict pid=56161) [rank-0]: Saving checkpoint to /data/save/python/global_step_90/actor/model_world_size_8_rank_0.pt (WorkerDict pid=56161) [rank-0]: Saving extra_state to /data/save/python/global_step_90/actor/extra_state_world_size_8_rank_0.pt (TaskRunner pid=54845) step:90 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2056.000 - global_seqlen/max:3116.000 - global_seqlen/minmax_diff:1060.000 - global_seqlen/balanced_min:2514.000 - global_seqlen/balanced_max:2515.000 - global_seqlen/mean:2514.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.515 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:626.625 - prompt_length/max:777.000 - prompt_length/min:512.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.663 - timing_s/reward:1.021 - timing_s/old_log_prob:2.316 - timing_s/adv:0.001 - timing_s/update_actor:17.683 - timing_s/save_checkpoint:14.828 - timing_s/step:39.517 - timing_per_token_ms/gen:57.240 - timing_per_token_ms/update_actor:0.879 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20116.000 - perf/time_per_step:39.517 - perf/throughput:63.630 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 45%|████▌ | 90/200 [40:40<53:12, 29.02s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 46%|████▌ | 91/200 [41:02<48:48, 26.86s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (WorkerDict pid=56484) [rank-1]: Saving model to /data/save/python/global_step_90/actor/model_world_size_8_rank_1.pt [repeated 7x across cluster] (WorkerDict pid=56484) [rank-1]: Saving checkpoint to /data/save/python/global_step_90/actor/model_world_size_8_rank_1.pt [repeated 7x across cluster] (WorkerDict pid=56484) [rank-1]: Saving extra_state to /data/save/python/global_step_90/actor/extra_state_world_size_8_rank_1.pt [repeated 7x across cluster] (TaskRunner pid=54845) 5 (TaskRunner pid=54845) step:91 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:4296.000 - global_seqlen/minmax_diff:2148.000 - global_seqlen/balanced_min:2839.000 - global_seqlen/balanced_max:2870.000 - global_seqlen/mean:2854.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.653 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.002 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.002 - critic/rewards/min:-0.002 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:2.000 - response_length/max:2.000 - response_length/min:2.000 - response_length/clip_ratio:0.000 - prompt_length/mean:711.625 - prompt_length/max:1072.000 - prompt_length/min:535.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:1.110 - timing_s/reward:1.067 - timing_s/old_log_prob:2.426 - timing_s/adv:0.001 - timing_s/update_actor:17.215 - timing_s/step:21.823 - timing_per_token_ms/gen:17.340 - timing_per_token_ms/update_actor:0.754 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22836.000 - perf/time_per_step:21.823 - perf/throughput:130.801 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:2.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:2.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:2.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:2.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:2.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:2.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:2.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:2.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:2.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:2.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:2.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:2.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 3 (TaskRunner pid=54845) step:92 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2131.000 - global_seqlen/max:3440.000 - global_seqlen/minmax_diff:1309.000 - global_seqlen/balanced_min:2646.000 - global_seqlen/balanced_max:2758.000 - global_seqlen/mean:2701.875 - actor/entropy:0.000 - actor/pg_loss:0.125 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:249.016 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.677 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.001 - critic/score/min:-0.002 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.001 - critic/rewards/min:-0.002 - critic/advantages/mean:-0.024 - critic/advantages/max:1.497 - critic/advantages/min:-0.499 - critic/returns/mean:-0.024 - critic/returns/max:1.497 - critic/returns/min:-0.499 - response_length/mean:1.969 - response_length/max:2.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:673.500 - prompt_length/max:858.000 - prompt_length/min:531.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.756 - timing_s/reward:0.976 - timing_s/old_log_prob:2.328 - timing_s/adv:0.001 - timing_s/update_actor:17.305 - timing_s/step:23.369 - timing_per_token_ms/gen:43.739 - timing_per_token_ms/update_actor:0.801 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21615.000 - perf/time_per_step:23.369 - perf/throughput:115.617 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.969 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.969 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.969 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.969 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.969 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.969 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.969 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.969 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.969 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.969 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.969 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.969 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 46%|████▌ | 92/200 [41:26<46:28, 25.82s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 46%|████▋ | 93/200 [41:51<45:45, 25.66s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) 5 (TaskRunner pid=54845) step:93 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2348.000 - global_seqlen/max:5624.000 - global_seqlen/minmax_diff:3276.000 - global_seqlen/balanced_min:3101.000 - global_seqlen/balanced_max:3269.000 - global_seqlen/mean:3185.750 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:171.561 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.665 - actor/lr:0.000 - critic/score/mean:-0.002 - critic/score/max:-0.001 - critic/score/min:-0.007 - critic/rewards/mean:-0.002 - critic/rewards/max:-0.001 - critic/rewards/min:-0.007 - critic/advantages/mean:-0.155 - critic/advantages/max:0.500 - critic/advantages/min:-1.499 - critic/returns/mean:-0.155 - critic/returns/max:0.500 - critic/returns/min:-1.499 - response_length/mean:1.812 - response_length/max:7.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:794.625 - prompt_length/max:1404.000 - prompt_length/min:585.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:4.314 - timing_s/reward:0.991 - timing_s/old_log_prob:2.354 - timing_s/adv:0.001 - timing_s/update_actor:17.608 - timing_s/step:25.271 - timing_per_token_ms/gen:74.373 - timing_per_token_ms/update_actor:0.691 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25486.000 - perf/time_per_step:25.271 - perf/throughput:126.063 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.812 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.002 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.812 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.002 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.812 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.002 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.812 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.002 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.812 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.002 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.812 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.002 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.812 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.002 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.812 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.002 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.812 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.002 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.812 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.002 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.812 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.002 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.812 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.002 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:94 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2272.000 - global_seqlen/max:3656.000 - global_seqlen/minmax_diff:1384.000 - global_seqlen/balanced_min:2642.000 - global_seqlen/balanced_max:2761.000 - global_seqlen/mean:2701.125 - actor/entropy:0.000 - actor/pg_loss:0.374 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:139.534 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.666 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.002 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.002 - critic/advantages/mean:-0.045 - critic/advantages/max:0.499 - critic/advantages/min:-1.497 - critic/returns/mean:-0.045 - critic/returns/max:0.499 - critic/returns/min:-1.497 - response_length/mean:1.031 - response_length/max:2.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:674.250 - prompt_length/max:913.000 - prompt_length/min:567.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.505 - timing_s/reward:1.036 - timing_s/old_log_prob:2.335 - timing_s/adv:0.001 - timing_s/update_actor:17.419 - timing_s/step:24.300 - timing_per_token_ms/gen:106.202 - timing_per_token_ms/update_actor:0.806 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21609.000 - perf/time_per_step:24.300 - perf/throughput:111.157 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.031 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.031 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.031 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.031 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.031 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.031 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.031 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.031 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.031 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.031 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.031 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.031 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 47%|████▋ | 94/200 [42:15<44:37, 25.26s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 48%|████▊ | 95/200 [42:39<43:38, 24.94s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:95 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2200.000 - global_seqlen/max:2840.000 - global_seqlen/minmax_diff:640.000 - global_seqlen/balanced_min:2586.000 - global_seqlen/balanced_max:2611.000 - global_seqlen/mean:2598.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.676 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:648.625 - prompt_length/max:709.000 - prompt_length/min:549.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.527 - timing_s/reward:1.009 - timing_s/old_log_prob:2.339 - timing_s/adv:0.001 - timing_s/update_actor:17.298 - timing_s/step:24.178 - timing_per_token_ms/gen:110.234 - timing_per_token_ms/update_actor:0.832 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20788.000 - perf/time_per_step:24.178 - perf/throughput:107.473 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:96 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:5432.000 - global_seqlen/minmax_diff:3284.000 - global_seqlen/balanced_min:3081.000 - global_seqlen/balanced_max:3323.000 - global_seqlen/mean:3202.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.675 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:799.500 - prompt_length/max:1357.000 - prompt_length/min:536.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.530 - timing_s/reward:0.991 - timing_s/old_log_prob:2.397 - timing_s/adv:0.001 - timing_s/update_actor:17.445 - timing_s/step:24.368 - timing_per_token_ms/gen:110.327 - timing_per_token_ms/update_actor:0.681 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25616.000 - perf/time_per_step:24.368 - perf/throughput:131.400 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 48%|████▊ | 96/200 [43:04<42:56, 24.77s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 48%|████▊ | 97/200 [43:28<42:26, 24.72s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:97 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2204.000 - global_seqlen/max:3368.000 - global_seqlen/minmax_diff:1164.000 - global_seqlen/balanced_min:2802.000 - global_seqlen/balanced_max:2818.000 - global_seqlen/mean:2810.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.667 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:701.500 - prompt_length/max:841.000 - prompt_length/min:550.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.550 - timing_s/reward:1.063 - timing_s/old_log_prob:2.411 - timing_s/adv:0.001 - timing_s/update_actor:17.550 - timing_s/step:24.579 - timing_per_token_ms/gen:110.939 - timing_per_token_ms/update_actor:0.781 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22480.000 - perf/time_per_step:24.579 - perf/throughput:114.327 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:98 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:5668.000 - global_seqlen/minmax_diff:3520.000 - global_seqlen/balanced_min:3115.000 - global_seqlen/balanced_max:3407.000 - global_seqlen/mean:3261.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.647 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:814.250 - prompt_length/max:1416.000 - prompt_length/min:536.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.728 - timing_s/reward:0.992 - timing_s/old_log_prob:2.412 - timing_s/adv:0.001 - timing_s/update_actor:17.483 - timing_s/step:23.620 - timing_per_token_ms/gen:85.264 - timing_per_token_ms/update_actor:0.670 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26088.000 - perf/time_per_step:23.620 - perf/throughput:138.059 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 49%|████▉ | 98/200 [43:52<41:28, 24.39s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 50%|████▉ | 99/200 [44:16<41:05, 24.41s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:99 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2068.000 - global_seqlen/max:3872.000 - global_seqlen/minmax_diff:1804.000 - global_seqlen/balanced_min:2676.000 - global_seqlen/balanced_max:2693.000 - global_seqlen/mean:2684.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.645 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:670.125 - prompt_length/max:967.000 - prompt_length/min:516.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.166 - timing_s/reward:0.992 - timing_s/old_log_prob:2.387 - timing_s/adv:0.001 - timing_s/update_actor:17.888 - timing_s/step:24.438 - timing_per_token_ms/gen:98.922 - timing_per_token_ms/update_actor:0.833 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21476.000 - perf/time_per_step:24.438 - perf/throughput:109.851 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:100 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1976.000 - global_seqlen/max:4104.000 - global_seqlen/minmax_diff:2128.000 - global_seqlen/balanced_min:2841.000 - global_seqlen/balanced_max:2856.000 - global_seqlen/mean:2848.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.661 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:711.125 - prompt_length/max:1025.000 - prompt_length/min:493.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.001 - timing_s/reward:1.043 - timing_s/old_log_prob:2.440 - timing_s/adv:0.001 - timing_s/update_actor:17.887 - timing_s/step:24.375 - timing_per_token_ms/gen:93.778 - timing_per_token_ms/update_actor:0.785 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22788.000 - perf/time_per_step:24.375 - perf/throughput:116.859 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 50%|█████ | 100/200 [44:41<40:40, 24.40s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 50%|█████ | 101/200 [45:06<40:26, 24.51s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:101 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2048.000 - global_seqlen/max:3092.000 - global_seqlen/minmax_diff:1044.000 - global_seqlen/balanced_min:2431.000 - global_seqlen/balanced_max:2448.000 - global_seqlen/mean:2439.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.669 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:608.875 - prompt_length/max:772.000 - prompt_length/min:511.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.526 - timing_s/reward:1.015 - timing_s/old_log_prob:2.389 - timing_s/adv:0.001 - timing_s/update_actor:17.816 - timing_s/step:24.751 - timing_per_token_ms/gen:110.202 - timing_per_token_ms/update_actor:0.913 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19516.000 - perf/time_per_step:24.751 - perf/throughput:98.562 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:102 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1980.000 - global_seqlen/max:5516.000 - global_seqlen/minmax_diff:3536.000 - global_seqlen/balanced_min:2544.000 - global_seqlen/balanced_max:3143.000 - global_seqlen/mean:2843.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.668 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:709.875 - prompt_length/max:1378.000 - prompt_length/min:494.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.041 - timing_s/reward:0.941 - timing_s/old_log_prob:2.373 - timing_s/adv:0.001 - timing_s/update_actor:18.085 - timing_s/step:24.445 - timing_per_token_ms/gen:95.035 - timing_per_token_ms/update_actor:0.795 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22748.000 - perf/time_per_step:24.445 - perf/throughput:116.324 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 51%|█████ | 102/200 [45:30<40:00, 24.50s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 52%|█████▏ | 103/200 [45:55<39:44, 24.58s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:103 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:3428.000 - global_seqlen/minmax_diff:1280.000 - global_seqlen/balanced_min:2639.000 - global_seqlen/balanced_max:2657.000 - global_seqlen/mean:2648.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.684 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:661.000 - prompt_length/max:856.000 - prompt_length/min:536.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.159 - timing_s/reward:0.986 - timing_s/old_log_prob:2.383 - timing_s/adv:0.001 - timing_s/update_actor:18.211 - timing_s/step:24.745 - timing_per_token_ms/gen:98.730 - timing_per_token_ms/update_actor:0.860 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21184.000 - perf/time_per_step:24.745 - perf/throughput:107.013 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:104 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2280.000 - global_seqlen/max:3216.000 - global_seqlen/minmax_diff:936.000 - global_seqlen/balanced_min:2555.000 - global_seqlen/balanced_max:2601.000 - global_seqlen/mean:2578.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.673 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.500 - prompt_length/max:803.000 - prompt_length/min:569.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.645 - timing_s/reward:0.965 - timing_s/old_log_prob:2.394 - timing_s/adv:0.001 - timing_s/update_actor:18.107 - timing_s/step:24.115 - timing_per_token_ms/gen:82.655 - timing_per_token_ms/update_actor:0.878 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20624.000 - perf/time_per_step:24.115 - perf/throughput:106.906 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 52%|█████▏ | 104/200 [46:19<39:06, 24.45s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 52%|█████▎ | 105/200 [46:43<38:42, 24.44s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:105 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2140.000 - global_seqlen/max:3044.000 - global_seqlen/minmax_diff:904.000 - global_seqlen/balanced_min:2675.000 - global_seqlen/balanced_max:2754.000 - global_seqlen/mean:2714.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.668 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:677.625 - prompt_length/max:760.000 - prompt_length/min:534.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.155 - timing_s/reward:0.968 - timing_s/old_log_prob:2.391 - timing_s/adv:0.001 - timing_s/update_actor:17.906 - timing_s/step:24.424 - timing_per_token_ms/gen:98.583 - timing_per_token_ms/update_actor:0.825 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21716.000 - perf/time_per_step:24.424 - perf/throughput:111.139 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:106 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2244.000 - global_seqlen/max:5192.000 - global_seqlen/minmax_diff:2948.000 - global_seqlen/balanced_min:2934.000 - global_seqlen/balanced_max:3109.000 - global_seqlen/mean:3021.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.661 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:754.375 - prompt_length/max:1297.000 - prompt_length/min:560.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.066 - timing_s/reward:1.011 - timing_s/old_log_prob:2.391 - timing_s/adv:0.001 - timing_s/update_actor:17.849 - timing_s/step:24.323 - timing_per_token_ms/gen:95.827 - timing_per_token_ms/update_actor:0.738 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24172.000 - perf/time_per_step:24.323 - perf/throughput:124.225 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 53%|█████▎ | 106/200 [47:08<38:14, 24.41s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 54%|█████▎ | 107/200 [47:32<37:46, 24.37s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:107 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2020.000 - global_seqlen/max:3464.000 - global_seqlen/minmax_diff:1444.000 - global_seqlen/balanced_min:2458.000 - global_seqlen/balanced_max:2501.000 - global_seqlen/mean:2479.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.641 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:618.875 - prompt_length/max:865.000 - prompt_length/min:504.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.469 - timing_s/reward:0.947 - timing_s/old_log_prob:2.336 - timing_s/adv:0.001 - timing_s/update_actor:17.504 - timing_s/step:24.261 - timing_per_token_ms/gen:108.406 - timing_per_token_ms/update_actor:0.882 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19836.000 - perf/time_per_step:24.261 - perf/throughput:102.200 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:108 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2168.000 - global_seqlen/max:3416.000 - global_seqlen/minmax_diff:1248.000 - global_seqlen/balanced_min:2839.000 - global_seqlen/balanced_max:2840.000 - global_seqlen/mean:2839.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.653 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:708.875 - prompt_length/max:853.000 - prompt_length/min:541.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.117 - timing_s/reward:0.958 - timing_s/old_log_prob:2.400 - timing_s/adv:0.001 - timing_s/update_actor:18.053 - timing_s/step:24.533 - timing_per_token_ms/gen:97.420 - timing_per_token_ms/update_actor:0.795 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22716.000 - perf/time_per_step:24.533 - perf/throughput:115.743 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 54%|█████▍ | 108/200 [47:57<37:26, 24.42s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 55%|█████▍ | 109/200 [48:21<37:04, 24.44s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:109 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2180.000 - global_seqlen/max:3880.000 - global_seqlen/minmax_diff:1700.000 - global_seqlen/balanced_min:2539.000 - global_seqlen/balanced_max:2720.000 - global_seqlen/mean:2629.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.666 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:656.375 - prompt_length/max:969.000 - prompt_length/min:544.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.500 - timing_s/reward:0.958 - timing_s/old_log_prob:2.323 - timing_s/adv:0.001 - timing_s/update_actor:17.690 - timing_s/step:24.476 - timing_per_token_ms/gen:109.372 - timing_per_token_ms/update_actor:0.841 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21036.000 - perf/time_per_step:24.476 - perf/throughput:107.434 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:110 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2044.000 - global_seqlen/max:3156.000 - global_seqlen/minmax_diff:1112.000 - global_seqlen/balanced_min:2569.000 - global_seqlen/balanced_max:2583.000 - global_seqlen/mean:2576.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.647 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.000 - prompt_length/max:788.000 - prompt_length/min:510.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.596 - timing_s/reward:0.938 - timing_s/old_log_prob:2.359 - timing_s/adv:0.001 - timing_s/update_actor:17.527 - timing_s/step:23.425 - timing_per_token_ms/gen:81.114 - timing_per_token_ms/update_actor:0.850 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20608.000 - perf/time_per_step:23.425 - perf/throughput:109.967 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 55%|█████▌ | 110/200 [48:45<36:12, 24.14s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 56%|█████▌ | 111/200 [49:09<36:07, 24.36s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:111 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2140.000 - global_seqlen/max:4276.000 - global_seqlen/minmax_diff:2136.000 - global_seqlen/balanced_min:2788.000 - global_seqlen/balanced_max:2817.000 - global_seqlen/mean:2802.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.644 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:699.625 - prompt_length/max:1068.000 - prompt_length/min:534.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.613 - timing_s/reward:0.968 - timing_s/old_log_prob:2.396 - timing_s/adv:0.001 - timing_s/update_actor:17.848 - timing_s/step:24.831 - timing_per_token_ms/gen:112.918 - timing_per_token_ms/update_actor:0.796 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22420.000 - perf/time_per_step:24.831 - perf/throughput:112.864 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:112 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2160.000 - global_seqlen/max:2956.000 - global_seqlen/minmax_diff:796.000 - global_seqlen/balanced_min:2330.000 - global_seqlen/balanced_max:2429.000 - global_seqlen/mean:2379.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.651 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:593.875 - prompt_length/max:738.000 - prompt_length/min:539.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.145 - timing_s/reward:0.925 - timing_s/old_log_prob:2.335 - timing_s/adv:0.001 - timing_s/update_actor:17.480 - timing_s/step:23.891 - timing_per_token_ms/gen:98.291 - timing_per_token_ms/update_actor:0.918 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19036.000 - perf/time_per_step:23.891 - perf/throughput:99.599 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 56%|█████▌ | 112/200 [49:33<35:31, 24.22s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 56%|█████▋ | 113/200 [49:57<35:05, 24.20s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:113 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2128.000 - global_seqlen/max:3892.000 - global_seqlen/minmax_diff:1764.000 - global_seqlen/balanced_min:2875.000 - global_seqlen/balanced_max:2875.000 - global_seqlen/mean:2875.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.681 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:717.750 - prompt_length/max:972.000 - prompt_length/min:531.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.022 - timing_s/reward:0.935 - timing_s/old_log_prob:2.363 - timing_s/adv:0.001 - timing_s/update_actor:17.803 - timing_s/step:24.129 - timing_per_token_ms/gen:94.446 - timing_per_token_ms/update_actor:0.774 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23000.000 - perf/time_per_step:24.129 - perf/throughput:119.151 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:114 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2024.000 - global_seqlen/max:3024.000 - global_seqlen/minmax_diff:1000.000 - global_seqlen/balanced_min:2562.000 - global_seqlen/balanced_max:2571.000 - global_seqlen/mean:2566.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.666 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:640.625 - prompt_length/max:755.000 - prompt_length/min:505.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.560 - timing_s/reward:0.987 - timing_s/old_log_prob:2.409 - timing_s/adv:0.001 - timing_s/update_actor:17.715 - timing_s/step:24.676 - timing_per_token_ms/gen:111.251 - timing_per_token_ms/update_actor:0.863 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20532.000 - perf/time_per_step:24.676 - perf/throughput:104.008 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 57%|█████▋ | 114/200 [50:22<34:53, 24.35s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 57%|█████▊ | 115/200 [50:46<34:28, 24.33s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:115 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2272.000 - global_seqlen/max:3824.000 - global_seqlen/minmax_diff:1552.000 - global_seqlen/balanced_min:2954.000 - global_seqlen/balanced_max:2959.000 - global_seqlen/mean:2956.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.656 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:738.125 - prompt_length/max:955.000 - prompt_length/min:567.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.381 - timing_s/reward:0.939 - timing_s/old_log_prob:2.378 - timing_s/adv:0.001 - timing_s/update_actor:17.589 - timing_s/step:24.292 - timing_per_token_ms/gen:105.649 - timing_per_token_ms/update_actor:0.744 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23652.000 - perf/time_per_step:24.292 - perf/throughput:121.708 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:116 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2088.000 - global_seqlen/max:3152.000 - global_seqlen/minmax_diff:1064.000 - global_seqlen/balanced_min:2546.000 - global_seqlen/balanced_max:2551.000 - global_seqlen/mean:2548.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.686 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:636.125 - prompt_length/max:787.000 - prompt_length/min:521.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.031 - timing_s/reward:0.955 - timing_s/old_log_prob:2.399 - timing_s/adv:0.001 - timing_s/update_actor:17.668 - timing_s/step:24.059 - timing_per_token_ms/gen:94.734 - timing_per_token_ms/update_actor:0.867 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20388.000 - perf/time_per_step:24.059 - perf/throughput:105.927 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 58%|█████▊ | 116/200 [51:11<33:57, 24.26s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 58%|█████▊ | 117/200 [51:35<33:39, 24.33s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:117 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2220.000 - global_seqlen/max:4600.000 - global_seqlen/minmax_diff:2380.000 - global_seqlen/balanced_min:2977.000 - global_seqlen/balanced_max:2992.000 - global_seqlen/mean:2984.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.664 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:745.125 - prompt_length/max:1149.000 - prompt_length/min:554.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.187 - timing_s/reward:1.012 - timing_s/old_log_prob:2.393 - timing_s/adv:0.001 - timing_s/update_actor:17.882 - timing_s/step:24.478 - timing_per_token_ms/gen:99.597 - timing_per_token_ms/update_actor:0.749 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23876.000 - perf/time_per_step:24.478 - perf/throughput:121.924 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:118 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2040.000 - global_seqlen/max:3768.000 - global_seqlen/minmax_diff:1728.000 - global_seqlen/balanced_min:2739.000 - global_seqlen/balanced_max:2766.000 - global_seqlen/mean:2752.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.666 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:687.125 - prompt_length/max:941.000 - prompt_length/min:509.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.534 - timing_s/reward:0.943 - timing_s/old_log_prob:2.325 - timing_s/adv:0.001 - timing_s/update_actor:17.462 - timing_s/step:24.268 - timing_per_token_ms/gen:110.449 - timing_per_token_ms/update_actor:0.793 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22020.000 - perf/time_per_step:24.268 - perf/throughput:113.419 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 59%|█████▉ | 118/200 [51:59<33:13, 24.31s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 60%|█████▉ | 119/200 [52:23<32:29, 24.07s/it] (WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:119 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2084.000 - global_seqlen/max:3104.000 - global_seqlen/minmax_diff:1020.000 - global_seqlen/balanced_min:2678.000 - global_seqlen/balanced_max:2727.000 - global_seqlen/mean:2702.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.669 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:674.625 - prompt_length/max:775.000 - prompt_length/min:520.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.619 - timing_s/reward:0.943 - timing_s/old_log_prob:2.365 - timing_s/adv:0.001 - timing_s/update_actor:17.536 - timing_s/step:23.467 - timing_per_token_ms/gen:81.846 - timing_per_token_ms/update_actor:0.811 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21620.000 - perf/time_per_step:23.467 - perf/throughput:115.160 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) local_global_step_folder: /data/save/python/global_step_120 (WorkerDict pid=56485) [rank-2]: Saving model to /data/save/python/global_step_120/actor/model_world_size_8_rank_2.pt (WorkerDict pid=56485) [rank-2]: Saving checkpoint to /data/save/python/global_step_120/actor/model_world_size_8_rank_2.pt (WorkerDict pid=56485) [rank-2]: Saving extra_state to /data/save/python/global_step_120/actor/extra_state_world_size_8_rank_2.pt (TaskRunner pid=54845) step:120 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2140.000 - global_seqlen/max:3160.000 - global_seqlen/minmax_diff:1020.000 - global_seqlen/balanced_min:2551.000 - global_seqlen/balanced_max:2574.000 - global_seqlen/mean:2562.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.667 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:639.625 - prompt_length/max:789.000 - prompt_length/min:534.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.162 - timing_s/reward:0.970 - timing_s/old_log_prob:2.332 - timing_s/adv:0.001 - timing_s/update_actor:17.750 - timing_s/save_checkpoint:15.004 - timing_s/step:39.222 - timing_per_token_ms/gen:98.821 - timing_per_token_ms/update_actor:0.866 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20500.000 - perf/time_per_step:39.222 - perf/throughput:65.333 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 60%|██████ | 120/200 [53:02<38:09, 28.62s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 60%|██████ | 121/200 [53:24<35:01, 26.60s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (WorkerDict pid=56490) [rank-7]: Saving model to /data/save/python/global_step_120/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving checkpoint to /data/save/python/global_step_120/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving extra_state to /data/save/python/global_step_120/actor/extra_state_world_size_8_rank_7.pt [repeated 7x across cluster] (TaskRunner pid=54845) (TaskRunner pid=54845) step:121 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2028.000 - global_seqlen/max:3252.000 - global_seqlen/minmax_diff:1224.000 - global_seqlen/balanced_min:2484.000 - global_seqlen/balanced_max:2563.000 - global_seqlen/mean:2523.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.680 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:629.875 - prompt_length/max:812.000 - prompt_length/min:506.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:0.998 - timing_s/reward:0.960 - timing_s/old_log_prob:2.408 - timing_s/adv:0.001 - timing_s/update_actor:17.508 - timing_s/step:21.879 - timing_per_token_ms/gen:31.172 - timing_per_token_ms/update_actor:0.867 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20188.000 - perf/time_per_step:21.879 - perf/throughput:115.341 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:122 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2000.000 - global_seqlen/max:3992.000 - global_seqlen/minmax_diff:1992.000 - global_seqlen/balanced_min:2873.000 - global_seqlen/balanced_max:2967.000 - global_seqlen/mean:2920.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.726 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:729.000 - prompt_length/max:997.000 - prompt_length/min:499.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.088 - timing_s/reward:0.958 - timing_s/old_log_prob:2.332 - timing_s/adv:0.001 - timing_s/update_actor:17.455 - timing_s/step:23.839 - timing_per_token_ms/gen:96.514 - timing_per_token_ms/update_actor:0.747 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23360.000 - perf/time_per_step:23.839 - perf/throughput:122.490 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 61%|██████ | 122/200 [53:48<33:30, 25.78s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 62%|██████▏ | 123/200 [54:12<32:26, 25.28s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:123 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2048.000 - global_seqlen/max:3228.000 - global_seqlen/minmax_diff:1180.000 - global_seqlen/balanced_min:2463.000 - global_seqlen/balanced_max:2477.000 - global_seqlen/mean:2470.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.716 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:616.500 - prompt_length/max:806.000 - prompt_length/min:511.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.147 - timing_s/reward:0.958 - timing_s/old_log_prob:2.340 - timing_s/adv:0.001 - timing_s/update_actor:17.663 - timing_s/step:24.112 - timing_per_token_ms/gen:98.341 - timing_per_token_ms/update_actor:0.894 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19760.000 - perf/time_per_step:24.112 - perf/throughput:102.437 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:124 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2308.000 - global_seqlen/max:3548.000 - global_seqlen/minmax_diff:1240.000 - global_seqlen/balanced_min:2696.000 - global_seqlen/balanced_max:2772.000 - global_seqlen/mean:2734.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.716 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:682.500 - prompt_length/max:886.000 - prompt_length/min:576.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.808 - timing_s/reward:0.990 - timing_s/old_log_prob:2.339 - timing_s/adv:0.001 - timing_s/update_actor:17.618 - timing_s/step:23.761 - timing_per_token_ms/gen:87.761 - timing_per_token_ms/update_actor:0.806 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21872.000 - perf/time_per_step:23.761 - perf/throughput:115.061 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 62%|██████▏ | 124/200 [54:36<31:27, 24.83s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 62%|██████▎ | 125/200 [55:00<30:41, 24.55s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:125 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2212.000 - global_seqlen/max:3984.000 - global_seqlen/minmax_diff:1772.000 - global_seqlen/balanced_min:2992.000 - global_seqlen/balanced_max:2997.000 - global_seqlen/mean:2994.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.712 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:747.625 - prompt_length/max:995.000 - prompt_length/min:552.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.076 - timing_s/reward:0.947 - timing_s/old_log_prob:2.319 - timing_s/adv:0.001 - timing_s/update_actor:17.546 - timing_s/step:23.894 - timing_per_token_ms/gen:96.137 - timing_per_token_ms/update_actor:0.732 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23956.000 - perf/time_per_step:23.894 - perf/throughput:125.327 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:126 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2036.000 - global_seqlen/max:3508.000 - global_seqlen/minmax_diff:1472.000 - global_seqlen/balanced_min:2779.000 - global_seqlen/balanced_max:2805.000 - global_seqlen/mean:2792.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.712 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:697.000 - prompt_length/max:876.000 - prompt_length/min:508.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.946 - timing_s/reward:0.955 - timing_s/old_log_prob:2.382 - timing_s/adv:0.001 - timing_s/update_actor:17.640 - timing_s/step:23.928 - timing_per_token_ms/gen:92.056 - timing_per_token_ms/update_actor:0.790 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22336.000 - perf/time_per_step:23.928 - perf/throughput:116.684 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 63%|██████▎ | 126/200 [55:24<30:03, 24.37s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 64%|██████▎ | 127/200 [55:47<29:18, 24.09s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:127 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2164.000 - global_seqlen/max:4880.000 - global_seqlen/minmax_diff:2716.000 - global_seqlen/balanced_min:3172.000 - global_seqlen/balanced_max:3180.000 - global_seqlen/mean:3176.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.720 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:793.000 - prompt_length/max:1219.000 - prompt_length/min:540.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.627 - timing_s/reward:0.961 - timing_s/old_log_prob:2.342 - timing_s/adv:0.001 - timing_s/update_actor:17.492 - timing_s/step:23.427 - timing_per_token_ms/gen:82.082 - timing_per_token_ms/update_actor:0.688 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25408.000 - perf/time_per_step:23.427 - perf/throughput:135.572 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:128 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2012.000 - global_seqlen/max:2876.000 - global_seqlen/minmax_diff:864.000 - global_seqlen/balanced_min:2377.000 - global_seqlen/balanced_max:2378.000 - global_seqlen/mean:2377.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.705 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:593.375 - prompt_length/max:718.000 - prompt_length/min:502.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.114 - timing_s/reward:0.989 - timing_s/old_log_prob:2.343 - timing_s/adv:0.001 - timing_s/update_actor:17.539 - timing_s/step:23.990 - timing_per_token_ms/gen:97.319 - timing_per_token_ms/update_actor:0.922 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19020.000 - perf/time_per_step:23.990 - perf/throughput:99.102 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 64%|██████▍ | 128/200 [56:11<28:52, 24.07s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 64%|██████▍ | 129/200 [56:35<28:28, 24.07s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:129 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1928.000 - global_seqlen/max:3044.000 - global_seqlen/minmax_diff:1116.000 - global_seqlen/balanced_min:2665.000 - global_seqlen/balanced_max:2668.000 - global_seqlen/mean:2666.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.680 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:665.625 - prompt_length/max:760.000 - prompt_length/min:481.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.678 - timing_s/reward:0.940 - timing_s/old_log_prob:2.377 - timing_s/adv:0.001 - timing_s/update_actor:18.042 - timing_s/step:24.041 - timing_per_token_ms/gen:83.673 - timing_per_token_ms/update_actor:0.846 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21332.000 - perf/time_per_step:24.041 - perf/throughput:110.914 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:130 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2248.000 - global_seqlen/max:2932.000 - global_seqlen/minmax_diff:684.000 - global_seqlen/balanced_min:2568.000 - global_seqlen/balanced_max:2580.000 - global_seqlen/mean:2574.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.667 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:642.500 - prompt_length/max:732.000 - prompt_length/min:561.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.498 - timing_s/reward:0.938 - timing_s/old_log_prob:2.381 - timing_s/adv:0.001 - timing_s/update_actor:18.008 - timing_s/step:24.830 - timing_per_token_ms/gen:109.318 - timing_per_token_ms/update_actor:0.875 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20592.000 - perf/time_per_step:24.830 - perf/throughput:103.663 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 65%|██████▌ | 130/200 [57:00<28:20, 24.30s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 66%|██████▌ | 131/200 [57:25<28:10, 24.50s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:131 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2132.000 - global_seqlen/max:3908.000 - global_seqlen/minmax_diff:1776.000 - global_seqlen/balanced_min:2675.000 - global_seqlen/balanced_max:2822.000 - global_seqlen/mean:2748.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.715 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:686.125 - prompt_length/max:976.000 - prompt_length/min:532.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.578 - timing_s/reward:1.004 - timing_s/old_log_prob:2.387 - timing_s/adv:0.001 - timing_s/update_actor:17.980 - timing_s/step:24.954 - timing_per_token_ms/gen:111.803 - timing_per_token_ms/update_actor:0.818 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21988.000 - perf/time_per_step:24.954 - perf/throughput:110.143 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:132 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2140.000 - global_seqlen/max:3364.000 - global_seqlen/minmax_diff:1224.000 - global_seqlen/balanced_min:2487.000 - global_seqlen/balanced_max:2542.000 - global_seqlen/mean:2514.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.681 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:627.625 - prompt_length/max:840.000 - prompt_length/min:534.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.008 - timing_s/reward:0.962 - timing_s/old_log_prob:2.348 - timing_s/adv:0.001 - timing_s/update_actor:17.779 - timing_s/step:24.102 - timing_per_token_ms/gen:93.991 - timing_per_token_ms/update_actor:0.884 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20116.000 - perf/time_per_step:24.102 - perf/throughput:104.329 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 66%|██████▌ | 132/200 [57:49<27:38, 24.38s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 66%|██████▋ | 133/200 [58:13<27:03, 24.23s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:133 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2100.000 - global_seqlen/max:3544.000 - global_seqlen/minmax_diff:1444.000 - global_seqlen/balanced_min:2683.000 - global_seqlen/balanced_max:2686.000 - global_seqlen/mean:2684.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.706 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:670.125 - prompt_length/max:885.000 - prompt_length/min:524.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.610 - timing_s/reward:0.946 - timing_s/old_log_prob:2.436 - timing_s/adv:0.001 - timing_s/update_actor:17.854 - timing_s/step:23.850 - timing_per_token_ms/gen:81.565 - timing_per_token_ms/update_actor:0.831 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21476.000 - perf/time_per_step:23.850 - perf/throughput:112.557 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:134 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2056.000 - global_seqlen/max:3892.000 - global_seqlen/minmax_diff:1836.000 - global_seqlen/balanced_min:2524.000 - global_seqlen/balanced_max:2657.000 - global_seqlen/mean:2590.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.723 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:646.625 - prompt_length/max:972.000 - prompt_length/min:513.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.557 - timing_s/reward:0.961 - timing_s/old_log_prob:2.384 - timing_s/adv:0.001 - timing_s/update_actor:18.083 - timing_s/step:24.991 - timing_per_token_ms/gen:111.148 - timing_per_token_ms/update_actor:0.873 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20724.000 - perf/time_per_step:24.991 - perf/throughput:103.659 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 67%|██████▋ | 134/200 [58:38<26:54, 24.46s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 68%|██████▊ | 135/200 [59:03<26:39, 24.60s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:135 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2184.000 - global_seqlen/max:3820.000 - global_seqlen/minmax_diff:1636.000 - global_seqlen/balanced_min:2812.000 - global_seqlen/balanced_max:2831.000 - global_seqlen/mean:2821.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.725 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:704.375 - prompt_length/max:954.000 - prompt_length/min:545.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.553 - timing_s/reward:0.964 - timing_s/old_log_prob:2.384 - timing_s/adv:0.001 - timing_s/update_actor:18.007 - timing_s/step:24.913 - timing_per_token_ms/gen:111.026 - timing_per_token_ms/update_actor:0.798 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22572.000 - perf/time_per_step:24.913 - perf/throughput:113.254 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:136 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2240.000 - global_seqlen/max:3100.000 - global_seqlen/minmax_diff:860.000 - global_seqlen/balanced_min:2601.000 - global_seqlen/balanced_max:2621.000 - global_seqlen/mean:2611.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.716 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:651.750 - prompt_length/max:774.000 - prompt_length/min:559.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.134 - timing_s/reward:0.947 - timing_s/old_log_prob:2.367 - timing_s/adv:0.001 - timing_s/update_actor:17.535 - timing_s/step:23.987 - timing_per_token_ms/gen:97.942 - timing_per_token_ms/update_actor:0.839 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20888.000 - perf/time_per_step:23.987 - perf/throughput:108.850 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 68%|██████▊ | 136/200 [59:27<26:03, 24.43s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 68%|██████▊ | 137/200 [59:52<25:46, 24.56s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:137 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2236.000 - global_seqlen/max:2996.000 - global_seqlen/minmax_diff:760.000 - global_seqlen/balanced_min:2530.000 - global_seqlen/balanced_max:2534.000 - global_seqlen/mean:2532.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.697 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:632.000 - prompt_length/max:748.000 - prompt_length/min:558.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.586 - timing_s/reward:0.993 - timing_s/old_log_prob:2.391 - timing_s/adv:0.001 - timing_s/update_actor:17.867 - timing_s/step:24.841 - timing_per_token_ms/gen:112.055 - timing_per_token_ms/update_actor:0.882 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20256.000 - perf/time_per_step:24.841 - perf/throughput:101.929 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:138 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2348.000 - global_seqlen/max:3764.000 - global_seqlen/minmax_diff:1416.000 - global_seqlen/balanced_min:2729.000 - global_seqlen/balanced_max:2878.000 - global_seqlen/mean:2803.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.685 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:699.875 - prompt_length/max:940.000 - prompt_length/min:586.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.517 - timing_s/reward:0.943 - timing_s/old_log_prob:2.341 - timing_s/adv:0.002 - timing_s/update_actor:17.571 - timing_s/step:24.378 - timing_per_token_ms/gen:109.912 - timing_per_token_ms/update_actor:0.783 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22428.000 - perf/time_per_step:24.378 - perf/throughput:115.003 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 69%|██████▉ | 138/200 [1:00:16<25:19, 24.51s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 70%|██████▉ | 139/200 [1:00:39<24:33, 24.15s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:139 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2112.000 - global_seqlen/max:3464.000 - global_seqlen/minmax_diff:1352.000 - global_seqlen/balanced_min:2306.000 - global_seqlen/balanced_max:2538.000 - global_seqlen/mean:2422.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.695 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:604.500 - prompt_length/max:865.000 - prompt_length/min:527.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.643 - timing_s/reward:0.944 - timing_s/old_log_prob:2.340 - timing_s/adv:0.001 - timing_s/update_actor:17.373 - timing_s/step:23.304 - timing_per_token_ms/gen:82.580 - timing_per_token_ms/update_actor:0.897 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19376.000 - perf/time_per_step:23.304 - perf/throughput:103.931 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:140 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2196.000 - global_seqlen/max:5092.000 - global_seqlen/minmax_diff:2896.000 - global_seqlen/balanced_min:3004.000 - global_seqlen/balanced_max:3087.000 - global_seqlen/mean:3045.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.696 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:760.375 - prompt_length/max:1272.000 - prompt_length/min:548.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.803 - timing_s/reward:0.969 - timing_s/old_log_prob:2.339 - timing_s/adv:0.001 - timing_s/update_actor:17.427 - timing_s/step:23.543 - timing_per_token_ms/gen:87.589 - timing_per_token_ms/update_actor:0.715 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24364.000 - perf/time_per_step:23.543 - perf/throughput:129.360 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 70%|███████ | 140/200 [1:01:03<23:58, 23.97s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 70%|███████ | 141/200 [1:01:27<23:39, 24.06s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:141 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2184.000 - global_seqlen/max:2740.000 - global_seqlen/minmax_diff:556.000 - global_seqlen/balanced_min:2380.000 - global_seqlen/balanced_max:2391.000 - global_seqlen/mean:2385.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.702 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:595.375 - prompt_length/max:684.000 - prompt_length/min:545.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.326 - timing_s/reward:0.987 - timing_s/old_log_prob:2.399 - timing_s/adv:0.001 - timing_s/update_actor:17.550 - timing_s/step:24.266 - timing_per_token_ms/gen:103.924 - timing_per_token_ms/update_actor:0.920 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19084.000 - perf/time_per_step:24.266 - perf/throughput:98.304 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:142 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2200.000 - global_seqlen/max:4400.000 - global_seqlen/minmax_diff:2200.000 - global_seqlen/balanced_min:2664.000 - global_seqlen/balanced_max:2941.000 - global_seqlen/mean:2802.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.704 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:699.625 - prompt_length/max:1099.000 - prompt_length/min:549.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.110 - timing_s/reward:0.958 - timing_s/old_log_prob:2.511 - timing_s/adv:0.001 - timing_s/update_actor:17.276 - timing_s/step:23.859 - timing_per_token_ms/gen:97.176 - timing_per_token_ms/update_actor:0.771 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22420.000 - perf/time_per_step:23.859 - perf/throughput:117.462 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 71%|███████ | 142/200 [1:01:51<23:12, 24.01s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 72%|███████▏ | 143/200 [1:02:15<22:50, 24.05s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:143 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1936.000 - global_seqlen/max:3596.000 - global_seqlen/minmax_diff:1660.000 - global_seqlen/balanced_min:2433.000 - global_seqlen/balanced_max:2575.000 - global_seqlen/mean:2504.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.717 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:625.000 - prompt_length/max:898.000 - prompt_length/min:483.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.558 - timing_s/reward:0.989 - timing_s/old_log_prob:2.341 - timing_s/adv:0.001 - timing_s/update_actor:17.239 - timing_s/step:24.131 - timing_per_token_ms/gen:111.173 - timing_per_token_ms/update_actor:0.861 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20032.000 - perf/time_per_step:24.131 - perf/throughput:103.767 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:144 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2160.000 - global_seqlen/max:3816.000 - global_seqlen/minmax_diff:1656.000 - global_seqlen/balanced_min:2830.000 - global_seqlen/balanced_max:2835.000 - global_seqlen/mean:2832.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.693 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:707.125 - prompt_length/max:953.000 - prompt_length/min:539.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.108 - timing_s/reward:0.974 - timing_s/old_log_prob:2.335 - timing_s/adv:0.001 - timing_s/update_actor:17.476 - timing_s/step:23.898 - timing_per_token_ms/gen:97.116 - timing_per_token_ms/update_actor:0.771 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22660.000 - perf/time_per_step:23.898 - perf/throughput:118.525 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 72%|███████▏ | 144/200 [1:02:39<22:24, 24.01s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 72%|███████▎ | 145/200 [1:03:03<22:01, 24.02s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:145 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2224.000 - global_seqlen/max:4688.000 - global_seqlen/minmax_diff:2464.000 - global_seqlen/balanced_min:3087.000 - global_seqlen/balanced_max:3101.000 - global_seqlen/mean:3094.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.694 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:772.500 - prompt_length/max:1171.000 - prompt_length/min:555.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.514 - timing_s/reward:0.946 - timing_s/old_log_prob:2.380 - timing_s/adv:0.001 - timing_s/update_actor:17.193 - timing_s/step:24.038 - timing_per_token_ms/gen:109.798 - timing_per_token_ms/update_actor:0.695 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24752.000 - perf/time_per_step:24.038 - perf/throughput:128.714 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:146 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2012.000 - global_seqlen/max:2668.000 - global_seqlen/minmax_diff:656.000 - global_seqlen/balanced_min:2287.000 - global_seqlen/balanced_max:2302.000 - global_seqlen/mean:2294.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.698 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:572.625 - prompt_length/max:666.000 - prompt_length/min:502.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.541 - timing_s/reward:0.960 - timing_s/old_log_prob:2.441 - timing_s/adv:0.001 - timing_s/update_actor:17.450 - timing_s/step:24.397 - timing_per_token_ms/gen:110.647 - timing_per_token_ms/update_actor:0.951 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:18356.000 - perf/time_per_step:24.397 - perf/throughput:94.049 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 73%|███████▎ | 146/200 [1:03:28<21:43, 24.14s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 74%|███████▎ | 147/200 [1:03:52<21:26, 24.27s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:147 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2216.000 - global_seqlen/max:3160.000 - global_seqlen/minmax_diff:944.000 - global_seqlen/balanced_min:2643.000 - global_seqlen/balanced_max:2650.000 - global_seqlen/mean:2646.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.701 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:660.625 - prompt_length/max:789.000 - prompt_length/min:553.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.571 - timing_s/reward:1.004 - timing_s/old_log_prob:2.410 - timing_s/adv:0.001 - timing_s/update_actor:17.561 - timing_s/step:24.551 - timing_per_token_ms/gen:111.582 - timing_per_token_ms/update_actor:0.829 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21172.000 - perf/time_per_step:24.551 - perf/throughput:107.797 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:148 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2028.000 - global_seqlen/max:4068.000 - global_seqlen/minmax_diff:2040.000 - global_seqlen/balanced_min:2832.000 - global_seqlen/balanced_max:2862.000 - global_seqlen/mean:2847.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.693 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:710.750 - prompt_length/max:1016.000 - prompt_length/min:506.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.535 - timing_s/reward:0.950 - timing_s/old_log_prob:2.442 - timing_s/adv:0.001 - timing_s/update_actor:17.515 - timing_s/step:24.447 - timing_per_token_ms/gen:110.477 - timing_per_token_ms/update_actor:0.769 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22776.000 - perf/time_per_step:24.447 - perf/throughput:116.458 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 74%|███████▍ | 148/200 [1:04:17<21:04, 24.33s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 74%|███████▍ | 149/200 [1:04:40<20:28, 24.10s/it] (WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:149 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2384.000 - global_seqlen/max:3704.000 - global_seqlen/minmax_diff:1320.000 - global_seqlen/balanced_min:2707.000 - global_seqlen/balanced_max:2827.000 - global_seqlen/mean:2767.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.697 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:690.750 - prompt_length/max:925.000 - prompt_length/min:595.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.640 - timing_s/reward:0.934 - timing_s/old_log_prob:2.419 - timing_s/adv:0.001 - timing_s/update_actor:17.553 - timing_s/step:23.550 - timing_per_token_ms/gen:82.493 - timing_per_token_ms/update_actor:0.793 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22136.000 - perf/time_per_step:23.550 - perf/throughput:117.493 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) local_global_step_folder: /data/save/python/global_step_150 (WorkerDict pid=56489) [rank-6]: Saving model to /data/save/python/global_step_150/actor/model_world_size_8_rank_6.pt (WorkerDict pid=56489) [rank-6]: Saving checkpoint to /data/save/python/global_step_150/actor/model_world_size_8_rank_6.pt (WorkerDict pid=56489) [rank-6]: Saving extra_state to /data/save/python/global_step_150/actor/extra_state_world_size_8_rank_6.pt (TaskRunner pid=54845) step:150 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1948.000 - global_seqlen/max:4924.000 - global_seqlen/minmax_diff:2976.000 - global_seqlen/balanced_min:2886.000 - global_seqlen/balanced_max:3033.000 - global_seqlen/mean:2959.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.706 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:738.875 - prompt_length/max:1230.000 - prompt_length/min:486.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.605 - timing_s/reward:0.984 - timing_s/old_log_prob:2.452 - timing_s/adv:0.001 - timing_s/update_actor:17.886 - timing_s/save_checkpoint:14.609 - timing_s/step:39.541 - timing_per_token_ms/gen:112.656 - timing_per_token_ms/update_actor:0.755 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23676.000 - perf/time_per_step:39.541 - perf/throughput:74.846 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 75%|███████▌ | 150/200 [1:05:20<23:56, 28.73s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 76%|███████▌ | 151/200 [1:05:42<21:50, 26.75s/it] (WorkerDict pid=56490) [rank-7]: Saving model to /data/save/python/global_step_150/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving checkpoint to /data/save/python/global_step_150/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving extra_state to /data/save/python/global_step_150/actor/extra_state_world_size_8_rank_7.pt [repeated 7x across cluster] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:151 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2036.000 - global_seqlen/max:4480.000 - global_seqlen/minmax_diff:2444.000 - global_seqlen/balanced_min:2883.000 - global_seqlen/balanced_max:2896.000 - global_seqlen/mean:2889.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.692 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:721.375 - prompt_length/max:1119.000 - prompt_length/min:508.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:1.113 - timing_s/reward:0.958 - timing_s/old_log_prob:2.440 - timing_s/adv:0.001 - timing_s/update_actor:17.570 - timing_s/step:22.086 - timing_per_token_ms/gen:34.787 - timing_per_token_ms/update_actor:0.760 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23116.000 - perf/time_per_step:22.086 - perf/throughput:130.828 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:152 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2120.000 - global_seqlen/max:5956.000 - global_seqlen/minmax_diff:3836.000 - global_seqlen/balanced_min:3057.000 - global_seqlen/balanced_max:3331.000 - global_seqlen/mean:3194.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.744 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:797.500 - prompt_length/max:1488.000 - prompt_length/min:529.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.090 - timing_s/reward:0.949 - timing_s/old_log_prob:2.337 - timing_s/adv:0.001 - timing_s/update_actor:17.628 - timing_s/step:24.009 - timing_per_token_ms/gen:96.571 - timing_per_token_ms/update_actor:0.690 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:25552.000 - perf/time_per_step:24.009 - perf/throughput:133.034 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 76%|███████▌ | 152/200 [1:06:06<20:44, 25.93s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 76%|███████▋ | 153/200 [1:06:30<19:52, 25.36s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:153 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2220.000 - global_seqlen/max:3520.000 - global_seqlen/minmax_diff:1300.000 - global_seqlen/balanced_min:2443.000 - global_seqlen/balanced_max:2589.000 - global_seqlen/mean:2516.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.734 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:628.000 - prompt_length/max:879.000 - prompt_length/min:554.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.160 - timing_s/reward:0.952 - timing_s/old_log_prob:2.343 - timing_s/adv:0.001 - timing_s/update_actor:17.560 - timing_s/step:24.020 - timing_per_token_ms/gen:98.744 - timing_per_token_ms/update_actor:0.872 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20128.000 - perf/time_per_step:24.020 - perf/throughput:104.744 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:154 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2192.000 - global_seqlen/max:3000.000 - global_seqlen/minmax_diff:808.000 - global_seqlen/balanced_min:2478.000 - global_seqlen/balanced_max:2511.000 - global_seqlen/mean:2494.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.739 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:622.625 - prompt_length/max:749.000 - prompt_length/min:547.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.784 - timing_s/reward:0.969 - timing_s/old_log_prob:2.337 - timing_s/adv:0.001 - timing_s/update_actor:17.838 - timing_s/step:24.933 - timing_per_token_ms/gen:118.263 - timing_per_token_ms/update_actor:0.894 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19956.000 - perf/time_per_step:24.933 - perf/throughput:100.049 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 77%|███████▋ | 154/200 [1:06:55<19:20, 25.24s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 78%|███████▊ | 155/200 [1:07:19<18:42, 24.96s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:155 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2012.000 - global_seqlen/max:3332.000 - global_seqlen/minmax_diff:1320.000 - global_seqlen/balanced_min:2562.000 - global_seqlen/balanced_max:2567.000 - global_seqlen/mean:2564.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.735 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:640.125 - prompt_length/max:832.000 - prompt_length/min:502.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.117 - timing_s/reward:0.972 - timing_s/old_log_prob:2.337 - timing_s/adv:0.001 - timing_s/update_actor:17.852 - timing_s/step:24.283 - timing_per_token_ms/gen:97.404 - timing_per_token_ms/update_actor:0.870 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20516.000 - perf/time_per_step:24.283 - perf/throughput:105.608 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:156 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2512.000 - global_seqlen/max:6452.000 - global_seqlen/minmax_diff:3940.000 - global_seqlen/balanced_min:2978.000 - global_seqlen/balanced_max:3555.000 - global_seqlen/mean:3266.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.739 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:815.625 - prompt_length/max:1612.000 - prompt_length/min:627.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.502 - timing_s/reward:0.941 - timing_s/old_log_prob:2.343 - timing_s/adv:0.001 - timing_s/update_actor:17.537 - timing_s/step:24.327 - timing_per_token_ms/gen:109.437 - timing_per_token_ms/update_actor:0.671 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:26132.000 - perf/time_per_step:24.327 - perf/throughput:134.272 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 78%|███████▊ | 156/200 [1:07:43<18:09, 24.77s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 78%|███████▊ | 157/200 [1:08:08<17:39, 24.63s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:157 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2232.000 - global_seqlen/max:3404.000 - global_seqlen/minmax_diff:1172.000 - global_seqlen/balanced_min:2708.000 - global_seqlen/balanced_max:2737.000 - global_seqlen/mean:2722.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.743 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:679.625 - prompt_length/max:850.000 - prompt_length/min:557.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.170 - timing_s/reward:0.988 - timing_s/old_log_prob:2.362 - timing_s/adv:0.001 - timing_s/update_actor:17.775 - timing_s/step:24.300 - timing_per_token_ms/gen:99.072 - timing_per_token_ms/update_actor:0.816 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21780.000 - perf/time_per_step:24.300 - perf/throughput:112.036 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:158 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2148.000 - global_seqlen/max:5748.000 - global_seqlen/minmax_diff:3600.000 - global_seqlen/balanced_min:2763.000 - global_seqlen/balanced_max:3333.000 - global_seqlen/mean:3048.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.708 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:761.000 - prompt_length/max:1436.000 - prompt_length/min:536.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.069 - timing_s/reward:0.964 - timing_s/old_log_prob:2.425 - timing_s/adv:0.001 - timing_s/update_actor:17.926 - timing_s/step:24.388 - timing_per_token_ms/gen:95.898 - timing_per_token_ms/update_actor:0.735 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24384.000 - perf/time_per_step:24.388 - perf/throughput:124.979 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 79%|███████▉ | 158/200 [1:08:32<17:11, 24.56s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 80%|███████▉ | 159/200 [1:08:57<16:47, 24.58s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:159 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1964.000 - global_seqlen/max:3184.000 - global_seqlen/minmax_diff:1220.000 - global_seqlen/balanced_min:2470.000 - global_seqlen/balanced_max:2481.000 - global_seqlen/mean:2475.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.719 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:617.875 - prompt_length/max:795.000 - prompt_length/min:490.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.519 - timing_s/reward:0.968 - timing_s/old_log_prob:2.397 - timing_s/adv:0.001 - timing_s/update_actor:17.721 - timing_s/step:24.610 - timing_per_token_ms/gen:109.962 - timing_per_token_ms/update_actor:0.895 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19804.000 - perf/time_per_step:24.610 - perf/throughput:100.589 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:160 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2052.000 - global_seqlen/max:3656.000 - global_seqlen/minmax_diff:1604.000 - global_seqlen/balanced_min:2585.000 - global_seqlen/balanced_max:2614.000 - global_seqlen/mean:2599.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.704 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:648.875 - prompt_length/max:913.000 - prompt_length/min:512.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.415 - timing_s/reward:0.996 - timing_s/old_log_prob:2.380 - timing_s/adv:0.001 - timing_s/update_actor:17.970 - timing_s/step:24.766 - timing_per_token_ms/gen:106.714 - timing_per_token_ms/update_actor:0.864 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20796.000 - perf/time_per_step:24.766 - perf/throughput:104.963 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 80%|████████ | 160/200 [1:09:22<16:25, 24.65s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 80%|████████ | 161/200 [1:09:47<16:05, 24.75s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:161 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1840.000 - global_seqlen/max:2960.000 - global_seqlen/minmax_diff:1120.000 - global_seqlen/balanced_min:2480.000 - global_seqlen/balanced_max:2548.000 - global_seqlen/mean:2514.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.717 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:627.500 - prompt_length/max:739.000 - prompt_length/min:459.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.507 - timing_s/reward:0.946 - timing_s/old_log_prob:2.509 - timing_s/adv:0.001 - timing_s/update_actor:18.021 - timing_s/step:24.988 - timing_per_token_ms/gen:109.605 - timing_per_token_ms/update_actor:0.896 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20112.000 - perf/time_per_step:24.988 - perf/throughput:100.609 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:162 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2044.000 - global_seqlen/max:3496.000 - global_seqlen/minmax_diff:1452.000 - global_seqlen/balanced_min:2663.000 - global_seqlen/balanced_max:2685.000 - global_seqlen/mean:2674.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.730 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:667.500 - prompt_length/max:873.000 - prompt_length/min:510.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.239 - timing_s/reward:0.983 - timing_s/old_log_prob:2.439 - timing_s/adv:0.001 - timing_s/update_actor:18.316 - timing_s/step:24.983 - timing_per_token_ms/gen:101.215 - timing_per_token_ms/update_actor:0.856 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21392.000 - perf/time_per_step:24.983 - perf/throughput:107.035 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 81%|████████ | 162/200 [1:10:12<15:43, 24.83s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 82%|████████▏ | 163/200 [1:10:36<15:16, 24.76s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:163 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2104.000 - global_seqlen/max:3504.000 - global_seqlen/minmax_diff:1400.000 - global_seqlen/balanced_min:2670.000 - global_seqlen/balanced_max:2679.000 - global_seqlen/mean:2674.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.714 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:667.625 - prompt_length/max:875.000 - prompt_length/min:525.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.081 - timing_s/reward:0.952 - timing_s/old_log_prob:2.393 - timing_s/adv:0.001 - timing_s/update_actor:18.152 - timing_s/step:24.582 - timing_per_token_ms/gen:96.275 - timing_per_token_ms/update_actor:0.848 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21396.000 - perf/time_per_step:24.582 - perf/throughput:108.800 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:164 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2172.000 - global_seqlen/max:3492.000 - global_seqlen/minmax_diff:1320.000 - global_seqlen/balanced_min:2606.000 - global_seqlen/balanced_max:2630.000 - global_seqlen/mean:2618.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.700 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:653.500 - prompt_length/max:872.000 - prompt_length/min:542.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.495 - timing_s/reward:0.932 - timing_s/old_log_prob:2.381 - timing_s/adv:0.001 - timing_s/update_actor:18.441 - timing_s/step:25.254 - timing_per_token_ms/gen:109.207 - timing_per_token_ms/update_actor:0.881 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20944.000 - perf/time_per_step:25.254 - perf/throughput:103.667 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 82%|████████▏ | 164/200 [1:11:01<14:56, 24.91s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 82%|████████▎ | 165/200 [1:11:27<14:39, 25.13s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:165 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2120.000 - global_seqlen/max:3788.000 - global_seqlen/minmax_diff:1668.000 - global_seqlen/balanced_min:2753.000 - global_seqlen/balanced_max:2830.000 - global_seqlen/mean:2791.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.729 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:696.875 - prompt_length/max:946.000 - prompt_length/min:529.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.582 - timing_s/reward:0.985 - timing_s/old_log_prob:2.414 - timing_s/adv:0.001 - timing_s/update_actor:18.640 - timing_s/step:25.626 - timing_per_token_ms/gen:111.935 - timing_per_token_ms/update_actor:0.835 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22332.000 - perf/time_per_step:25.626 - perf/throughput:108.930 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:166 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2216.000 - global_seqlen/max:3688.000 - global_seqlen/minmax_diff:1472.000 - global_seqlen/balanced_min:2911.000 - global_seqlen/balanced_max:2938.000 - global_seqlen/mean:2924.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.737 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:730.125 - prompt_length/max:921.000 - prompt_length/min:553.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.083 - timing_s/reward:0.979 - timing_s/old_log_prob:2.442 - timing_s/adv:0.001 - timing_s/update_actor:18.385 - timing_s/step:24.894 - timing_per_token_ms/gen:96.345 - timing_per_token_ms/update_actor:0.786 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23396.000 - perf/time_per_step:24.894 - perf/throughput:117.476 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 83%|████████▎ | 166/200 [1:11:52<14:12, 25.06s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 84%|████████▎ | 167/200 [1:12:17<13:50, 25.18s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:167 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2132.000 - global_seqlen/max:3100.000 - global_seqlen/minmax_diff:968.000 - global_seqlen/balanced_min:2675.000 - global_seqlen/balanced_max:2678.000 - global_seqlen/mean:2676.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.746 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:668.125 - prompt_length/max:774.000 - prompt_length/min:532.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.553 - timing_s/reward:0.973 - timing_s/old_log_prob:2.405 - timing_s/adv:0.001 - timing_s/update_actor:18.478 - timing_s/step:25.415 - timing_per_token_ms/gen:111.039 - timing_per_token_ms/update_actor:0.863 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21412.000 - perf/time_per_step:25.415 - perf/throughput:105.311 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:168 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2120.000 - global_seqlen/max:3500.000 - global_seqlen/minmax_diff:1380.000 - global_seqlen/balanced_min:2627.000 - global_seqlen/balanced_max:2756.000 - global_seqlen/mean:2691.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.735 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:671.875 - prompt_length/max:874.000 - prompt_length/min:529.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.520 - timing_s/reward:0.952 - timing_s/old_log_prob:2.386 - timing_s/adv:0.001 - timing_s/update_actor:18.476 - timing_s/step:25.340 - timing_per_token_ms/gen:110.003 - timing_per_token_ms/update_actor:0.858 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21532.000 - perf/time_per_step:25.340 - perf/throughput:106.216 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 84%|████████▍ | 168/200 [1:12:43<13:27, 25.23s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 84%|████████▍ | 169/200 [1:13:08<13:01, 25.20s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:169 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2292.000 - global_seqlen/max:4512.000 - global_seqlen/minmax_diff:2220.000 - global_seqlen/balanced_min:3024.000 - global_seqlen/balanced_max:3125.000 - global_seqlen/mean:3074.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.736 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:767.625 - prompt_length/max:1127.000 - prompt_length/min:572.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.162 - timing_s/reward:0.996 - timing_s/old_log_prob:2.427 - timing_s/adv:0.001 - timing_s/update_actor:18.515 - timing_s/step:25.106 - timing_per_token_ms/gen:98.816 - timing_per_token_ms/update_actor:0.753 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:24596.000 - perf/time_per_step:25.106 - perf/throughput:122.459 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:170 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2188.000 - global_seqlen/max:3364.000 - global_seqlen/minmax_diff:1176.000 - global_seqlen/balanced_min:2548.000 - global_seqlen/balanced_max:2603.000 - global_seqlen/mean:2575.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.711 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:642.875 - prompt_length/max:840.000 - prompt_length/min:546.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.564 - timing_s/reward:0.964 - timing_s/old_log_prob:2.310 - timing_s/adv:0.001 - timing_s/update_actor:18.165 - timing_s/step:25.007 - timing_per_token_ms/gen:111.373 - timing_per_token_ms/update_actor:0.882 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20604.000 - perf/time_per_step:25.007 - perf/throughput:102.993 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) Training Progress: 85%|████████▌ | 170/200 [1:13:33<12:34, 25.15s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 86%|████████▌ | 171/200 [1:13:58<12:04, 24.98s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:171 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1860.000 - global_seqlen/max:3160.000 - global_seqlen/minmax_diff:1300.000 - global_seqlen/balanced_min:2613.000 - global_seqlen/balanced_max:2700.000 - global_seqlen/mean:2656.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.739 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:663.125 - prompt_length/max:789.000 - prompt_length/min:464.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.072 - timing_s/reward:1.010 - timing_s/old_log_prob:2.386 - timing_s/adv:0.001 - timing_s/update_actor:18.121 - timing_s/step:24.594 - timing_per_token_ms/gen:96.011 - timing_per_token_ms/update_actor:0.853 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21252.000 - perf/time_per_step:24.594 - perf/throughput:108.014 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:172 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2120.000 - global_seqlen/max:2876.000 - global_seqlen/minmax_diff:756.000 - global_seqlen/balanced_min:2465.000 - global_seqlen/balanced_max:2467.000 - global_seqlen/mean:2466.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.752 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:615.500 - prompt_length/max:718.000 - prompt_length/min:529.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.023 - timing_s/reward:0.959 - timing_s/old_log_prob:2.346 - timing_s/adv:0.001 - timing_s/update_actor:17.460 - timing_s/step:23.793 - timing_per_token_ms/gen:94.472 - timing_per_token_ms/update_actor:0.885 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19728.000 - perf/time_per_step:23.793 - perf/throughput:103.643 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 86%|████████▌ | 172/200 [1:14:21<11:29, 24.63s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 86%|████████▋ | 173/200 [1:14:46<11:03, 24.57s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:173 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2096.000 - global_seqlen/max:3212.000 - global_seqlen/minmax_diff:1116.000 - global_seqlen/balanced_min:2593.000 - global_seqlen/balanced_max:2597.000 - global_seqlen/mean:2595.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.725 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:647.750 - prompt_length/max:802.000 - prompt_length/min:523.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.035 - timing_s/reward:0.959 - timing_s/old_log_prob:2.461 - timing_s/adv:0.001 - timing_s/update_actor:17.970 - timing_s/step:24.431 - timing_per_token_ms/gen:94.857 - timing_per_token_ms/update_actor:0.866 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20760.000 - perf/time_per_step:24.431 - perf/throughput:106.218 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:174 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2172.000 - global_seqlen/max:5268.000 - global_seqlen/minmax_diff:3096.000 - global_seqlen/balanced_min:2845.000 - global_seqlen/balanced_max:3137.000 - global_seqlen/mean:2991.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.718 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:746.750 - prompt_length/max:1316.000 - prompt_length/min:542.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.763 - timing_s/reward:1.024 - timing_s/old_log_prob:2.376 - timing_s/adv:0.001 - timing_s/update_actor:18.053 - timing_s/step:24.221 - timing_per_token_ms/gen:86.352 - timing_per_token_ms/update_actor:0.754 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:23928.000 - perf/time_per_step:24.221 - perf/throughput:123.490 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 87%|████████▋ | 174/200 [1:15:10<10:36, 24.47s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 88%|████████▊ | 175/200 [1:15:34<10:10, 24.40s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:175 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2232.000 - global_seqlen/max:2800.000 - global_seqlen/minmax_diff:568.000 - global_seqlen/balanced_min:2459.000 - global_seqlen/balanced_max:2478.000 - global_seqlen/mean:2468.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.733 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:616.125 - prompt_length/max:699.000 - prompt_length/min:557.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.721 - timing_s/reward:0.949 - timing_s/old_log_prob:2.396 - timing_s/adv:0.001 - timing_s/update_actor:18.143 - timing_s/step:24.214 - timing_per_token_ms/gen:85.025 - timing_per_token_ms/update_actor:0.919 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19748.000 - perf/time_per_step:24.214 - perf/throughput:101.945 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:176 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2076.000 - global_seqlen/max:5184.000 - global_seqlen/minmax_diff:3108.000 - global_seqlen/balanced_min:2551.000 - global_seqlen/balanced_max:3040.000 - global_seqlen/mean:2795.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.755 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:697.875 - prompt_length/max:1295.000 - prompt_length/min:518.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.105 - timing_s/reward:0.957 - timing_s/old_log_prob:2.350 - timing_s/adv:0.001 - timing_s/update_actor:17.712 - timing_s/step:24.129 - timing_per_token_ms/gen:97.035 - timing_per_token_ms/update_actor:0.792 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22364.000 - perf/time_per_step:24.129 - perf/throughput:115.855 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 88%|████████▊ | 176/200 [1:15:58<09:43, 24.33s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 88%|████████▊ | 177/200 [1:16:23<09:19, 24.31s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:177 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2188.000 - global_seqlen/max:4348.000 - global_seqlen/minmax_diff:2160.000 - global_seqlen/balanced_min:2847.000 - global_seqlen/balanced_max:2880.000 - global_seqlen/mean:2863.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.722 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:714.875 - prompt_length/max:1086.000 - prompt_length/min:546.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.879 - timing_s/reward:0.993 - timing_s/old_log_prob:2.345 - timing_s/adv:0.001 - timing_s/update_actor:18.022 - timing_s/step:24.244 - timing_per_token_ms/gen:89.961 - timing_per_token_ms/update_actor:0.787 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22908.000 - perf/time_per_step:24.244 - perf/throughput:118.113 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:178 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2068.000 - global_seqlen/max:3240.000 - global_seqlen/minmax_diff:1172.000 - global_seqlen/balanced_min:2557.000 - global_seqlen/balanced_max:2596.000 - global_seqlen/mean:2576.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.726 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.125 - prompt_length/max:809.000 - prompt_length/min:516.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.518 - timing_s/reward:0.954 - timing_s/old_log_prob:2.389 - timing_s/adv:0.001 - timing_s/update_actor:18.073 - timing_s/step:24.939 - timing_per_token_ms/gen:109.950 - timing_per_token_ms/update_actor:0.877 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20612.000 - perf/time_per_step:24.939 - perf/throughput:103.311 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 89%|████████▉ | 178/200 [1:16:48<08:59, 24.50s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 90%|████████▉ | 179/200 [1:17:12<08:32, 24.43s/it] (WorkerDict pid=56161) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. (WorkerDict pid=56161) return func(*args, **kwargs) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:179 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2272.000 - global_seqlen/max:3272.000 - global_seqlen/minmax_diff:1000.000 - global_seqlen/balanced_min:2758.000 - global_seqlen/balanced_max:2765.000 - global_seqlen/mean:2761.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.721 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:689.375 - prompt_length/max:817.000 - prompt_length/min:567.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.595 - timing_s/reward:0.944 - timing_s/old_log_prob:2.427 - timing_s/adv:0.001 - timing_s/update_actor:18.267 - timing_s/step:24.238 - timing_per_token_ms/gen:81.090 - timing_per_token_ms/update_actor:0.827 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22092.000 - perf/time_per_step:24.238 - perf/throughput:113.934 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) local_global_step_folder: /data/save/python/global_step_180 (WorkerDict pid=56487) [rank-4]: Saving model to /data/save/python/global_step_180/actor/model_world_size_8_rank_4.pt (WorkerDict pid=56487) [rank-4]: Saving checkpoint to /data/save/python/global_step_180/actor/model_world_size_8_rank_4.pt (WorkerDict pid=56487) [rank-4]: Saving extra_state to /data/save/python/global_step_180/actor/extra_state_world_size_8_rank_4.pt (TaskRunner pid=54845) step:180 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2212.000 - global_seqlen/max:3732.000 - global_seqlen/minmax_diff:1520.000 - global_seqlen/balanced_min:2614.000 - global_seqlen/balanced_max:2699.000 - global_seqlen/mean:2656.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.736 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:663.125 - prompt_length/max:932.000 - prompt_length/min:552.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.015 - timing_s/reward:0.991 - timing_s/old_log_prob:2.394 - timing_s/adv:0.001 - timing_s/update_actor:18.584 - timing_s/save_checkpoint:15.121 - timing_s/step:40.110 - timing_per_token_ms/gen:94.223 - timing_per_token_ms/update_actor:0.874 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21252.000 - perf/time_per_step:40.110 - perf/throughput:66.230 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 90%|█████████ | 180/200 [1:17:52<09:42, 29.14s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 90%|█████████ | 181/200 [1:18:15<08:36, 27.19s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (WorkerDict pid=56490) [rank-7]: Saving model to /data/save/python/global_step_180/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving checkpoint to /data/save/python/global_step_180/actor/model_world_size_8_rank_7.pt [repeated 7x across cluster] (WorkerDict pid=56490) [rank-7]: Saving extra_state to /data/save/python/global_step_180/actor/extra_state_world_size_8_rank_7.pt [repeated 7x across cluster] (TaskRunner pid=54845) (TaskRunner pid=54845) step:181 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2228.000 - global_seqlen/max:3376.000 - global_seqlen/minmax_diff:1148.000 - global_seqlen/balanced_min:2720.000 - global_seqlen/balanced_max:2763.000 - global_seqlen/mean:2741.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.720 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:684.375 - prompt_length/max:843.000 - prompt_length/min:556.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:1.019 - timing_s/reward:0.971 - timing_s/old_log_prob:2.435 - timing_s/adv:0.001 - timing_s/update_actor:18.202 - timing_s/step:22.630 - timing_per_token_ms/gen:31.832 - timing_per_token_ms/update_actor:0.830 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21932.000 - perf/time_per_step:22.630 - perf/throughput:121.142 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:182 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2188.000 - global_seqlen/max:3732.000 - global_seqlen/minmax_diff:1544.000 - global_seqlen/balanced_min:2840.000 - global_seqlen/balanced_max:2849.000 - global_seqlen/mean:2844.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.757 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:710.125 - prompt_length/max:932.000 - prompt_length/min:546.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.618 - timing_s/reward:0.966 - timing_s/old_log_prob:2.309 - timing_s/adv:0.001 - timing_s/update_actor:17.678 - timing_s/step:23.576 - timing_per_token_ms/gen:81.814 - timing_per_token_ms/update_actor:0.777 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22756.000 - perf/time_per_step:23.576 - perf/throughput:120.652 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 91%|█████████ | 182/200 [1:18:38<07:49, 26.11s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 92%|█████████▏| 183/200 [1:19:03<07:15, 25.64s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:183 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2224.000 - global_seqlen/max:3248.000 - global_seqlen/minmax_diff:1024.000 - global_seqlen/balanced_min:2500.000 - global_seqlen/balanced_max:2586.000 - global_seqlen/mean:2543.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.771 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:634.750 - prompt_length/max:811.000 - prompt_length/min:555.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.104 - timing_s/reward:0.965 - timing_s/old_log_prob:2.317 - timing_s/adv:0.001 - timing_s/update_actor:18.128 - timing_s/step:24.519 - timing_per_token_ms/gen:97.003 - timing_per_token_ms/update_actor:0.891 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20344.000 - perf/time_per_step:24.519 - perf/throughput:103.716 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:184 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2108.000 - global_seqlen/max:3296.000 - global_seqlen/minmax_diff:1188.000 - global_seqlen/balanced_min:2431.000 - global_seqlen/balanced_max:2569.000 - global_seqlen/mean:2500.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.774 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:624.000 - prompt_length/max:823.000 - prompt_length/min:526.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.227 - timing_s/reward:0.978 - timing_s/old_log_prob:2.344 - timing_s/adv:0.001 - timing_s/update_actor:17.723 - timing_s/step:24.277 - timing_per_token_ms/gen:100.843 - timing_per_token_ms/update_actor:0.886 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20000.000 - perf/time_per_step:24.277 - perf/throughput:102.980 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 92%|█████████▏| 184/200 [1:19:27<06:43, 25.24s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 92%|█████████▎| 185/200 [1:19:52<06:16, 25.11s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:185 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2116.000 - global_seqlen/max:3312.000 - global_seqlen/minmax_diff:1196.000 - global_seqlen/balanced_min:2596.000 - global_seqlen/balanced_max:2604.000 - global_seqlen/mean:2600.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.763 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:649.000 - prompt_length/max:827.000 - prompt_length/min:528.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.580 - timing_s/reward:0.959 - timing_s/old_log_prob:2.346 - timing_s/adv:0.001 - timing_s/update_actor:17.914 - timing_s/step:24.805 - timing_per_token_ms/gen:111.887 - timing_per_token_ms/update_actor:0.861 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20800.000 - perf/time_per_step:24.805 - perf/throughput:104.820 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:186 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2252.000 - global_seqlen/max:3932.000 - global_seqlen/minmax_diff:1680.000 - global_seqlen/balanced_min:2615.000 - global_seqlen/balanced_max:2807.000 - global_seqlen/mean:2711.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.758 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:676.750 - prompt_length/max:982.000 - prompt_length/min:562.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.960 - timing_s/reward:0.945 - timing_s/old_log_prob:2.387 - timing_s/adv:0.001 - timing_s/update_actor:17.830 - timing_s/step:24.126 - timing_per_token_ms/gen:92.495 - timing_per_token_ms/update_actor:0.822 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21688.000 - perf/time_per_step:24.126 - perf/throughput:112.369 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 93%|█████████▎| 186/200 [1:20:16<05:47, 24.82s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 94%|█████████▎| 187/200 [1:20:41<05:22, 24.80s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:187 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2000.000 - global_seqlen/max:3392.000 - global_seqlen/minmax_diff:1392.000 - global_seqlen/balanced_min:2769.000 - global_seqlen/balanced_max:2776.000 - global_seqlen/mean:2772.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.753 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:692.125 - prompt_length/max:847.000 - prompt_length/min:499.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.185 - timing_s/reward:0.969 - timing_s/old_log_prob:2.431 - timing_s/adv:0.001 - timing_s/update_actor:18.152 - timing_s/step:24.742 - timing_per_token_ms/gen:99.522 - timing_per_token_ms/update_actor:0.818 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22180.000 - perf/time_per_step:24.742 - perf/throughput:112.057 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:188 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2332.000 - global_seqlen/max:3756.000 - global_seqlen/minmax_diff:1424.000 - global_seqlen/balanced_min:2778.000 - global_seqlen/balanced_max:2875.000 - global_seqlen/mean:2826.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.770 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:705.625 - prompt_length/max:938.000 - prompt_length/min:582.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.097 - timing_s/reward:0.960 - timing_s/old_log_prob:2.351 - timing_s/adv:0.001 - timing_s/update_actor:17.784 - timing_s/step:24.196 - timing_per_token_ms/gen:96.772 - timing_per_token_ms/update_actor:0.786 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22612.000 - perf/time_per_step:24.196 - perf/throughput:116.815 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 94%|█████████▍| 188/200 [1:21:05<04:55, 24.62s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 94%|█████████▍| 189/200 [1:21:30<04:33, 24.84s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:189 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2072.000 - global_seqlen/max:3820.000 - global_seqlen/minmax_diff:1748.000 - global_seqlen/balanced_min:2592.000 - global_seqlen/balanced_max:2685.000 - global_seqlen/mean:2638.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.778 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:658.625 - prompt_length/max:954.000 - prompt_length/min:517.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:4.053 - timing_s/reward:1.005 - timing_s/old_log_prob:2.344 - timing_s/adv:0.001 - timing_s/update_actor:17.912 - timing_s/step:25.319 - timing_per_token_ms/gen:126.644 - timing_per_token_ms/update_actor:0.849 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21108.000 - perf/time_per_step:25.319 - perf/throughput:104.212 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:190 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2012.000 - global_seqlen/max:3104.000 - global_seqlen/minmax_diff:1092.000 - global_seqlen/balanced_min:2426.000 - global_seqlen/balanced_max:2440.000 - global_seqlen/mean:2433.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.742 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:607.250 - prompt_length/max:775.000 - prompt_length/min:502.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.509 - timing_s/reward:0.986 - timing_s/old_log_prob:2.423 - timing_s/adv:0.001 - timing_s/update_actor:18.169 - timing_s/step:25.092 - timing_per_token_ms/gen:109.663 - timing_per_token_ms/update_actor:0.933 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:19464.000 - perf/time_per_step:25.092 - perf/throughput:96.962 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 95%|█████████▌| 190/200 [1:21:55<04:09, 24.92s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 96%|█████████▌| 191/200 [1:22:20<03:43, 24.85s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:191 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1976.000 - global_seqlen/max:2552.000 - global_seqlen/minmax_diff:576.000 - global_seqlen/balanced_min:2349.000 - global_seqlen/balanced_max:2396.000 - global_seqlen/mean:2372.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.744 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:592.125 - prompt_length/max:637.000 - prompt_length/min:493.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.965 - timing_s/reward:0.963 - timing_s/old_log_prob:2.432 - timing_s/adv:0.001 - timing_s/update_actor:18.302 - timing_s/step:24.667 - timing_per_token_ms/gen:92.663 - timing_per_token_ms/update_actor:0.964 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:18980.000 - perf/time_per_step:24.667 - perf/throughput:96.179 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:192 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:1956.000 - global_seqlen/max:3540.000 - global_seqlen/minmax_diff:1584.000 - global_seqlen/balanced_min:2530.000 - global_seqlen/balanced_max:2639.000 - global_seqlen/mean:2584.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.802 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:645.125 - prompt_length/max:884.000 - prompt_length/min:488.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.072 - timing_s/reward:0.974 - timing_s/old_log_prob:2.355 - timing_s/adv:0.001 - timing_s/update_actor:18.229 - timing_s/step:24.634 - timing_per_token_ms/gen:95.989 - timing_per_token_ms/update_actor:0.882 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20676.000 - perf/time_per_step:24.634 - perf/throughput:104.917 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 96%|█████████▌| 192/200 [1:22:45<03:18, 24.79s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 96%|█████████▋| 193/200 [1:23:10<02:53, 24.78s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:193 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2044.000 - global_seqlen/max:3624.000 - global_seqlen/minmax_diff:1580.000 - global_seqlen/balanced_min:2431.000 - global_seqlen/balanced_max:2583.000 - global_seqlen/mean:2507.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.743 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:625.750 - prompt_length/max:905.000 - prompt_length/min:510.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.522 - timing_s/reward:0.976 - timing_s/old_log_prob:2.389 - timing_s/adv:0.001 - timing_s/update_actor:17.850 - timing_s/step:24.741 - timing_per_token_ms/gen:110.070 - timing_per_token_ms/update_actor:0.890 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20056.000 - perf/time_per_step:24.741 - perf/throughput:101.330 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:194 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2376.000 - global_seqlen/max:3676.000 - global_seqlen/minmax_diff:1300.000 - global_seqlen/balanced_min:2656.000 - global_seqlen/balanced_max:2772.000 - global_seqlen/mean:2714.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.762 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:677.500 - prompt_length/max:918.000 - prompt_length/min:593.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.211 - timing_s/reward:0.955 - timing_s/old_log_prob:2.369 - timing_s/adv:0.001 - timing_s/update_actor:18.268 - timing_s/step:24.809 - timing_per_token_ms/gen:100.353 - timing_per_token_ms/update_actor:0.841 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21712.000 - perf/time_per_step:24.809 - perf/throughput:109.398 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 97%|█████████▋| 194/200 [1:23:34<02:28, 24.79s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 98%|█████████▊| 195/200 [1:24:00<02:04, 24.95s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:195 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2164.000 - global_seqlen/max:3040.000 - global_seqlen/minmax_diff:876.000 - global_seqlen/balanced_min:2572.000 - global_seqlen/balanced_max:2586.000 - global_seqlen/mean:2579.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.739 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:643.750 - prompt_length/max:759.000 - prompt_length/min:540.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.416 - timing_s/reward:1.022 - timing_s/old_log_prob:2.366 - timing_s/adv:0.001 - timing_s/update_actor:18.476 - timing_s/step:25.285 - timing_per_token_ms/gen:106.757 - timing_per_token_ms/update_actor:0.895 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:20632.000 - perf/time_per_step:25.285 - perf/throughput:101.996 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:196 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2204.000 - global_seqlen/max:3416.000 - global_seqlen/minmax_diff:1212.000 - global_seqlen/balanced_min:2766.000 - global_seqlen/balanced_max:2767.000 - global_seqlen/mean:2766.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.745 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:690.625 - prompt_length/max:853.000 - prompt_length/min:550.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.551 - timing_s/reward:0.969 - timing_s/old_log_prob:2.293 - timing_s/adv:0.001 - timing_s/update_actor:18.035 - timing_s/step:24.852 - timing_per_token_ms/gen:110.965 - timing_per_token_ms/update_actor:0.815 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22132.000 - perf/time_per_step:24.852 - perf/throughput:111.320 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 98%|█████████▊| 196/200 [1:24:25<01:39, 24.92s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 98%|█████████▊| 197/200 [1:24:50<01:15, 25.02s/it] (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:197 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2112.000 - global_seqlen/max:3592.000 - global_seqlen/minmax_diff:1480.000 - global_seqlen/balanced_min:2765.000 - global_seqlen/balanced_max:2782.000 - global_seqlen/mean:2773.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.749 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:692.375 - prompt_length/max:897.000 - prompt_length/min:527.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.406 - timing_s/reward:1.024 - timing_s/old_log_prob:2.452 - timing_s/adv:0.001 - timing_s/update_actor:18.344 - timing_s/step:25.231 - timing_per_token_ms/gen:106.444 - timing_per_token_ms/update_actor:0.827 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22188.000 - perf/time_per_step:25.231 - perf/throughput:109.926 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:198 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2140.000 - global_seqlen/max:4412.000 - global_seqlen/minmax_diff:2272.000 - global_seqlen/balanced_min:2732.000 - global_seqlen/balanced_max:2852.000 - global_seqlen/mean:2792.000 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.748 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:697.000 - prompt_length/max:1102.000 - prompt_length/min:534.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:2.889 - timing_s/reward:0.952 - timing_s/old_log_prob:2.390 - timing_s/adv:0.001 - timing_s/update_actor:18.401 - timing_s/step:24.636 - timing_per_token_ms/gen:90.284 - timing_per_token_ms/update_actor:0.824 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:22336.000 - perf/time_per_step:24.636 - perf/throughput:113.329 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000(TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 99%|█████████▉| 198/200 [1:25:14<00:49, 24.91s/it] (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice. (TaskRunner pid=54845) return _methods._mean(a, axis=axis, dtype=dtype, (TaskRunner pid=54845) /usr/local/lib/python3.12/dist-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in divide (TaskRunner pid=54845) ret = ret.dtype.type(ret / rcount) (TaskRunner pid=54845) Training Progress: 100%|█████████▉| 199/200 [1:25:39<00:24, 24.85s/it] (TaskRunner pid=54845) wandb: (TaskRunner pid=54845) wandb: Run history: (TaskRunner pid=54845) wandb: actor/entropy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: actor/grad_norm ▁▁▁▂▂▂▂▂▂▂▆█▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: actor/lr ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: actor/pg_clipfrac ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: actor/pg_loss ▅▅▃▂▄▁▄▆▃▃▆██▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅ (TaskRunner pid=54845) wandb: actor/ppo_kl ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: critic/advantages/max ▇██████▇█████▅▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: critic/advantages/mean ▆▆▆▇▇▃▆▁████████████████████████████████ (TaskRunner pid=54845) wandb: critic/advantages/min ▁▁▁▁▁▁▁▁▄█▁▁████████████████████████████ (TaskRunner pid=54845) wandb: critic/kl ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ (TaskRunner pid=54845) wandb: +141 ... (TaskRunner pid=54845) wandb: (TaskRunner pid=54845) wandb: Run summary: (TaskRunner pid=54845) wandb: actor/entropy 0 (TaskRunner pid=54845) wandb: actor/grad_norm 0 (TaskRunner pid=54845) wandb: actor/lr 0.0 (TaskRunner pid=54845) wandb: actor/pg_clipfrac 0 (TaskRunner pid=54845) wandb: actor/pg_loss 0 (TaskRunner pid=54845) wandb: actor/ppo_kl 0 (TaskRunner pid=54845) wandb: critic/advantages/max 0 (TaskRunner pid=54845) wandb: critic/advantages/mean 0 (TaskRunner pid=54845) wandb: critic/advantages/min 0 (TaskRunner pid=54845) wandb: critic/kl 0 (TaskRunner pid=54845) wandb: +141 ... (TaskRunner pid=54845) wandb: (TaskRunner pid=54845) wandb: You can sync this run to the cloud by running: (TaskRunner pid=54845) wandb: wandb sync /opt/codev-r1/verl/wandb/offline-run-20260505_062245-cbplxyr2 (TaskRunner pid=54845) wandb: Find logs at: ./wandb/offline-run-20260505_062245-cbplxyr2/logs (TaskRunner pid=54845) Training Progress: 100%|█████████▉| 199/200 [1:26:12<00:30, 30.60s/it] (TaskRunner pid=54845) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_generate_sequences() (pid=56488, ip=10.4.130.138, actor_id=539c6e6dd3bc116cb357d98601000000, repr=) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/ray/base.py", line 419, in func (TaskRunner pid=54845) return getattr(self.worker_dict[key], name)(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/base/decorator.py", line 404, in inner (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/fsdp_workers.py", line 518, in generate_sequences (TaskRunner pid=54845) output = self.rollout.generate_sequences(prompts=prompts) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 46, in generate_sequences (TaskRunner pid=54845) output = [self._generate_minibatch(p) for p in batch_prompts] (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 98, in _generate_minibatch (TaskRunner pid=54845) output = self.module.generate( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2535, in generate (TaskRunner pid=54845) result = decoding_method( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2728, in _sample (TaskRunner pid=54845) outputs = self._prefill( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 3776, in _prefill (TaskRunner pid=54845) return self(**model_inputs, return_dict=True) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 843, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1845, in forward (TaskRunner pid=54845) outputs: BaseModelOutputWithPast = self.model( (TaskRunner pid=54845) ^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 917, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/output_capturing.py", line 253, in wrapper (TaskRunner pid=54845) outputs = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward (TaskRunner pid=54845) hidden_states = decoder_layer( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 93, in __call__ (TaskRunner pid=54845) return super().__call__(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 857, in forward (TaskRunner pid=54845) hidden_states = self.linear_attn( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 593, in forward (TaskRunner pid=54845) core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 374, in torch_chunk_gated_delta_rule (TaskRunner pid=54845) k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) (TaskRunner pid=54845) ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ (TaskRunner pid=54845) torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 139.72 GiB of which 2.16 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 132.30 GiB is allocated by PyTorch, and 3.61 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) (TaskRunner pid=54845) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_generate_sequences() (pid=56161, ip=10.4.130.138, actor_id=6bbf7c467f6f4bdbc52ca4e701000000, repr=) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/ray/base.py", line 419, in func (TaskRunner pid=54845) return getattr(self.worker_dict[key], name)(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/base/decorator.py", line 404, in inner (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/fsdp_workers.py", line 518, in generate_sequences (TaskRunner pid=54845) output = self.rollout.generate_sequences(prompts=prompts) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 46, in generate_sequences (TaskRunner pid=54845) output = [self._generate_minibatch(p) for p in batch_prompts] (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 98, in _generate_minibatch (TaskRunner pid=54845) output = self.module.generate( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2535, in generate (TaskRunner pid=54845) result = decoding_method( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2728, in _sample (TaskRunner pid=54845) outputs = self._prefill( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 3776, in _prefill (TaskRunner pid=54845) return self(**model_inputs, return_dict=True) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 843, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1845, in forward (TaskRunner pid=54845) outputs: BaseModelOutputWithPast = self.model( (TaskRunner pid=54845) ^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 917, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/output_capturing.py", line 253, in wrapper (TaskRunner pid=54845) outputs = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward (TaskRunner pid=54845) hidden_states = decoder_layer( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 93, in __call__ (TaskRunner pid=54845) return super().__call__(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 857, in forward (TaskRunner pid=54845) hidden_states = self.linear_attn( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 593, in forward (TaskRunner pid=54845) core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 374, in torch_chunk_gated_delta_rule (TaskRunner pid=54845) k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) (TaskRunner pid=54845) ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ (TaskRunner pid=54845) torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 139.72 GiB of which 2.16 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 132.30 GiB is allocated by PyTorch, and 3.61 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) step:199 - critic/kl:0.000 - critic/kl_coeff:0.000 - global_seqlen/min:2264.000 - global_seqlen/max:3452.000 - global_seqlen/minmax_diff:1188.000 - global_seqlen/balanced_min:2708.000 - global_seqlen/balanced_max:2713.000 - global_seqlen/mean:2710.500 - actor/entropy:0.000 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:0.000 - perf/mfu/actor:0.000 - perf/max_memory_allocated_gb:71.355 - perf/max_memory_reserved_gb:113.098 - perf/cpu_memory_used_gb:231.760 - actor/lr:0.000 - critic/score/mean:-0.001 - critic/score/max:-0.001 - critic/score/min:-0.001 - critic/rewards/mean:-0.001 - critic/rewards/max:-0.001 - critic/rewards/min:-0.001 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - response_length/mean:1.000 - response_length/max:1.000 - response_length/min:1.000 - response_length/clip_ratio:0.000 - prompt_length/mean:676.625 - prompt_length/max:862.000 - prompt_length/min:565.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:3.014 - timing_s/reward:1.011 - timing_s/old_log_prob:2.446 - timing_s/adv:0.001 - timing_s/update_actor:18.233 - timing_s/step:24.709 - timing_per_token_ms/gen:94.181 - timing_per_token_ms/update_actor:0.841 - timing_per_token_ms/adv:0.000 - perf/total_num_tokens:21684.000 - perf/time_per_step:24.709 - perf/throughput:109.696 - reflection/any_word_frequency:0.000 - reflection_verify/word_verify_frequency:0.000 - reflection_verify/with_verify_length_mean:nan - reflection_verify/without_verify_length_mean:1.000 - reflection_verify/with_verify_correct_ratio:nan - reflection_verify/without_verify_correct_ratio:0.000 - reflection_verify/with_verify_reward_mean:nan - reflection_verify/without_verify_reward_mean:-0.001 - reflection_check/word_check_frequency:0.000 - reflection_check/with_check_length_mean:nan - reflection_check/without_check_length_mean:1.000 - reflection_check/with_check_correct_ratio:nan - reflection_check/without_check_correct_ratio:0.000 - reflection_check/with_check_reward_mean:nan - reflection_check/without_check_reward_mean:-0.001 - reflection_confirm/word_confirm_frequency:0.000 - reflection_confirm/with_confirm_length_mean:nan - reflection_confirm/without_confirm_length_mean:1.000 - reflection_confirm/with_confirm_correct_ratio:nan - reflection_confirm/without_confirm_correct_ratio:0.000 - reflection_confirm/with_confirm_reward_mean:nan - reflection_confirm/without_confirm_reward_mean:-0.001 - reflection_however/word_however_frequency:0.000 - reflection_however/with_however_length_mean:nan - reflection_however/without_however_length_mean:1.000 - reflection_however/with_however_correct_ratio:nan - reflection_however/without_however_correct_ratio:0.000 - reflection_however/with_however_reward_mean:nan - reflection_however/without_however_reward_mean:-0.001 - reflection_reflect/word_reflect_frequency:0.000 - reflection_reflect/with_reflect_length_mean:nan - reflection_reflect/without_reflect_length_mean:1.000 - reflection_reflect/with_reflect_correct_ratio:nan - reflection_reflect/without_reflect_correct_ratio:0.000 - reflection_reflect/with_reflect_reward_mean:nan - reflection_reflect/without_reflect_reward_mean:-0.001 - reflection_wait/word_wait_frequency:0.000 - reflection_wait/with_wait_length_mean:nan - reflection_wait/without_wait_length_mean:1.000 - reflection_wait/with_wait_correct_ratio:nan - reflection_wait/without_wait_correct_ratio:0.000 - reflection_wait/with_wait_reward_mean:nan - reflection_wait/without_wait_reward_mean:-0.001 - reflection_correct/word_correct_frequency:0.000 - reflection_correct/with_correct_length_mean:nan - reflection_correct/without_correct_length_mean:1.000 - reflection_correct/with_correct_correct_ratio:nan - reflection_correct/without_correct_correct_ratio:0.000 - reflection_correct/with_correct_reward_mean:nan - reflection_correct/without_correct_reward_mean:-0.001 - reflection_revise/word_revise_frequency:0.000 - reflection_revise/with_revise_length_mean:nan - reflection_revise/without_revise_length_mean:1.000 - reflection_revise/with_revise_correct_ratio:nan - reflection_revise/without_revise_correct_ratio:0.000 - reflection_revise/with_revise_reward_mean:nan - reflection_revise/without_revise_reward_mean:-0.001 - reflection_adjust/word_adjust_frequency:0.000 - reflection_adjust/with_adjust_length_mean:nan - reflection_adjust/without_adjust_length_mean:1.000 - reflection_adjust/with_adjust_correct_ratio:nan - reflection_adjust/without_adjust_correct_ratio:0.000 - reflection_adjust/with_adjust_reward_mean:nan - reflection_adjust/without_adjust_reward_mean:-0.001 - reflection_re-evaluate/word_re-evaluate_frequency:0.000 - reflection_re-evaluate/with_re-evaluate_length_mean:nan - reflection_re-evaluate/without_re-evaluate_length_mean:1.000 - reflection_re-evaluate/with_re-evaluate_correct_ratio:nan - reflection_re-evaluate/without_re-evaluate_correct_ratio:0.000 - reflection_re-evaluate/with_re-evaluate_reward_mean:nan - reflection_re-evaluate/without_re-evaluate_reward_mean:-0.001 - reflection_re-examine/word_re-examine_frequency:0.000 - reflection_re-examine/with_re-examine_length_mean:nan - reflection_re-examine/without_re-examine_length_mean:1.000 - reflection_re-examine/with_re-examine_correct_ratio:nan - reflection_re-examine/without_re-examine_correct_ratio:0.000 - reflection_re-examine/with_re-examine_reward_mean:nan - reflection_re-examine/without_re-examine_reward_mean:-0.001 - reflection_yet/word_yet_frequency:0.000 - reflection_yet/with_yet_length_mean:nan - reflection_yet/without_yet_length_mean:1.000 - reflection_yet/with_yet_correct_ratio:nan - reflection_yet/without_yet_correct_ratio:0.000 - reflection_yet/with_yet_reward_mean:nan - reflection_yet/without_yet_reward_mean:-0.001 - language_mix/frequency:0.000 - language_mix/ratio:0.000 - train/num_gen_batches:1.000 (TaskRunner pid=54845) Length of batch_part and new_batch: 13 8 (TaskRunner pid=54845) len(new_batch) is 8 (TaskRunner pid=54845) (TaskRunner pid=54845) test_gen_batch meta info: {'eos_token_id': 248046, 'pad_token_id': 248044, 'recompute_log_prob': False, 'do_sample': True, 'validate': True} Error executing job with overrides: ['algorithm.adv_estimator=grpo', 'data.train_files=/data/data/python/train.parquet', 'data.val_files=/data/data/python/val.parquet', 'data.train_batch_size=8', 'data.val_batch_size=128', 'data.max_prompt_length=2048', 'data.max_response_length=1024', 'algorithm.filter_groups.enable=False', 'algorithm.filter_groups.max_num_gen_batches=999', 'algorithm.filter_groups.metric=acc', 'algorithm.filter_groups.accelerate=True', 'data.gen_batch_size=8', 'actor_rollout_ref.model.path=/data/ckpt/checkpoint-570', '+actor_rollout_ref.model.override_config.attention_dropout=0.', '+actor_rollout_ref.model.override_config.embd_pdrop=0.', '+actor_rollout_ref.model.override_config.resid_pdrop=0.', 'actor_rollout_ref.model.enable_gradient_checkpointing=True', '+actor_rollout_ref.model.use_liger=True', 'actor_rollout_ref.actor.optim.lr=1e-6', 'actor_rollout_ref.actor.optim.weight_decay=0.0', 'actor_rollout_ref.actor.use_dynamic_bsz=False', 'actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1', 'actor_rollout_ref.model.use_remove_padding=False', 'actor_rollout_ref.actor.clip_ratio_low=0.2', 'actor_rollout_ref.actor.clip_ratio_high=0.28', 'actor_rollout_ref.actor.ppo_mini_batch_size=8', 'actor_rollout_ref.actor.use_kl_loss=False', 'actor_rollout_ref.actor.kl_loss_coef=0.0', 'actor_rollout_ref.actor.kl_loss_type=low_var_kl', 'actor_rollout_ref.actor.entropy_coeff=0.001', 'actor_rollout_ref.actor.grad_clip=0.5', 'actor_rollout_ref.actor.use_token_level_loss=True', 'actor_rollout_ref.actor.fsdp_config.param_offload=False', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=True', 'actor_rollout_ref.actor.fsdp_config.wrap_policy.min_num_params=100000000', 'actor_rollout_ref.ref.fsdp_config.wrap_policy.min_num_params=100000000', 'actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1', 'actor_rollout_ref.rollout.tensor_model_parallel_size=2', 'actor_rollout_ref.rollout.name=hf', 'actor_rollout_ref.rollout.n=4', 'actor_rollout_ref.rollout.val_kwargs.n=4', 'actor_rollout_ref.rollout.temperature=1.0', 'actor_rollout_ref.rollout.val_kwargs.temperature=1.0', 'actor_rollout_ref.rollout.val_kwargs.do_sample=True', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.85', 'actor_rollout_ref.rollout.enforce_eager=False', 'actor_rollout_ref.rollout.free_cache_engine=False', 'reward_model.reward_manager=prime', 'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'custom_reward_function.overlong_buffer.enable=True', 'custom_reward_function.overlong_buffer.len=1024', 'custom_reward_function.overlong_buffer.penalty_factor=1.0', 'custom_reward_function.train.path=verl/utils/reward_score/codev_py.py', 'custom_reward_function.train.name=compute_score_wrapper', 'algorithm.kl_ctrl.kl_coef=0.0', 'trainer.critic_warmup=0', 'trainer.logger=[console,wandb]', 'trainer.project_name=codev-r1-qwen35-9b-full', 'trainer.experiment_name=dapo-qwen35-9b-python-full', 'trainer.n_gpus_per_node=8', 'trainer.nnodes=1', '+trainer.val_before_train=False', 'trainer.default_local_dir=/data/save/python', 'trainer.resume_mode=auto', 'trainer.default_hdfs_dir=null', 'trainer.save_freq=30', 'trainer.test_freq=999999', '+trainer.total_training_steps=200', 'trainer.total_epochs=20', 'actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False', 'actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False', 'actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1'] Traceback (most recent call last): File "/opt/codev-r1/verl/verl/trainer/main_ppo.py", line 53, in main run_ppo(config) File "/opt/codev-r1/verl/verl/trainer/main_ppo.py", line 71, in run_ppo ray.get(runner.run.remote(config)) File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2981, in get values, debugger_breakpoint = worker.get_objects( ^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1012, in get_objects raise value.as_instanceof_cause() ray.exceptions.RayTaskError(OutOfMemoryError): ray::TaskRunner.run() (pid=54845, ip=10.4.130.138, actor_id=7574dfcc020fadedb0a7264801000000, repr=) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/trainer/main_ppo.py", line 187, in run trainer.fit() File "/opt/codev-r1/verl/verl/trainer/ppo/ray_trainer.py", line 1149, in fit val_metrics: dict = self._validate() ^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/trainer/ppo/ray_trainer.py", line 582, in _validate test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/single_controller/ray/base.py", line 42, in func output = ray.get(output) ^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^ ray.exceptions.RayTaskError(OutOfMemoryError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=56487, ip=10.4.130.138, actor_id=05f843592380f9837a84492601000000, repr=) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/single_controller/ray/base.py", line 419, in func return getattr(self.worker_dict[key], name)(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/single_controller/base/decorator.py", line 404, in inner return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/workers/fsdp_workers.py", line 518, in generate_sequences output = self.rollout.generate_sequences(prompts=prompts) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 46, in generate_sequences output = [self._generate_minibatch(p) for p in batch_prompts] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 98, in _generate_minibatch output = self.module.generate( ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2535, in generate result = decoding_method( ^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2728, in _sample outputs = self._prefill( ^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 3776, in _prefill return self(**model_inputs, return_dict=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 843, in wrapper output = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1845, in forward outputs: BaseModelOutputWithPast = self.model( ^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 917, in wrapper output = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/utils/output_capturing.py", line 253, in wrapper outputs = func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward hidden_states = decoder_layer( ^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 93, in __call__ return super().__call__(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 857, in forward hidden_states = self.linear_attn( ^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 593, in forward core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 374, in torch_chunk_gated_delta_rule k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 139.72 GiB of which 2.16 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 132.30 GiB is allocated by PyTorch, and 3.61 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace. (TaskRunner pid=54845) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_generate_sequences() (pid=56485, ip=10.4.130.138, actor_id=5a7e8c2a5fc7f5de6f6b7b6f01000000, repr=) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/ray/base.py", line 419, in func (TaskRunner pid=54845) return getattr(self.worker_dict[key], name)(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/single_controller/base/decorator.py", line 404, in inner (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/fsdp_workers.py", line 518, in generate_sequences (TaskRunner pid=54845) output = self.rollout.generate_sequences(prompts=prompts) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 46, in generate_sequences (TaskRunner pid=54845) output = [self._generate_minibatch(p) for p in batch_prompts] (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/opt/codev-r1/verl/verl/workers/rollout/hf_rollout.py", line 98, in _generate_minibatch (TaskRunner pid=54845) output = self.module.generate( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context (TaskRunner pid=54845) return func(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2535, in generate (TaskRunner pid=54845) result = decoding_method( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2728, in _sample (TaskRunner pid=54845) outputs = self._prefill( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 3776, in _prefill (TaskRunner pid=54845) return self(**model_inputs, return_dict=True) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 843, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1845, in forward (TaskRunner pid=54845) outputs: BaseModelOutputWithPast = self.model( (TaskRunner pid=54845) ^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 917, in wrapper (TaskRunner pid=54845) output = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/output_capturing.py", line 253, in wrapper (TaskRunner pid=54845) outputs = func(self, *args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1372, in forward (TaskRunner pid=54845) hidden_states = decoder_layer( (TaskRunner pid=54845) ^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 93, in __call__ (TaskRunner pid=54845) return super().__call__(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 857, in forward (TaskRunner pid=54845) hidden_states = self.linear_attn( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl (TaskRunner pid=54845) return self._call_impl(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl (TaskRunner pid=54845) return forward_call(*args, **kwargs) (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 593, in forward (TaskRunner pid=54845) core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( (TaskRunner pid=54845) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (TaskRunner pid=54845) File "/usr/local/lib/python3.12/dist-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 374, in torch_chunk_gated_delta_rule (TaskRunner pid=54845) k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) (TaskRunner pid=54845) ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ (TaskRunner pid=54845) torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 0 has a total capacity of 139.72 GiB of which 1.74 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 132.30 GiB is allocated by PyTorch, and 4.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)