name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
name top5-mlplayer | device cuda | compile True | data_dir data/shakespeare | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 1 | learning_rate 0.001 | warmup_steps 750 | max_steps 20000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'topk.mlpblock.shakespeare_64x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'ascii_64x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 128, 'n_layer': 4, 'n_head': 4, 'n_embd': 64, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (512, 512, 512, 512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.TOPK: 'topk'>, 'top_k': (5, 5, 5, 5, 5, 5, 5, 5), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.0, 0.0, 0.0, 0.0), 'regularization': None, 'downstream': None, 'bandwidth': None}
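For context, the sae_config logged above describes top-k sparse autoencoders (sae_variant=topk, top_k=5) with 512 features each, attached to residual-stream hook points of a 4-layer, 64-dimensional GPT (n_embd=64). The following is a minimal sketch of what such a top-k SAE forward pass could look like given those logged dimensions; the class and variable names are illustrative assumptions, not the repository's actual implementation.

import torch
import torch.nn as nn


class TopKSAE(nn.Module):
    """Illustrative top-k SAE matching the logged shapes: d_model=64, n_features=512, k=5."""

    def __init__(self, d_model: int = 64, n_features: int = 512, k: int = 5):
        super().__init__()
        self.k = k
        self.encoder = nn.Linear(d_model, n_features)
        self.decoder = nn.Linear(n_features, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Encode, then keep only the k largest feature pre-activations per token;
        # every other feature is zeroed, so at most k features are active.
        acts = self.encoder(x)
        topk = torch.topk(acts, self.k, dim=-1)
        sparse = torch.zeros_like(acts).scatter_(-1, topk.indices, topk.values)
        return self.decoder(sparse)


# Example: reconstruct a batch of residual-stream activations
# (batch_size=128 and n_embd=64, as in the logged config).
sae = TopKSAE()
x = torch.randn(128, 64)
x_hat = sae(x)  # reconstruction built from at most 5 active features per row

Because sparsity is enforced structurally by the top-k selection, the sparsity loss coefficients in the log can be left at 0.0, consistent with the loss_coefficients entries above.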