| name jln.mlpblock.gpt2-sparse-5.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 250 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 8 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 'regularization': None, 'downstream': None, 'bandwidth': None} | |
| name jln.mlpblock.gpt2-sparse-5.0e-01 | device cuda | compile True | data_dir data/fineweb_edu_10b | should_randomize True | log_interval 1 | eval_interval 250 | eval_steps 100 | batch_size 4 | gradient_accumulation_steps 8 | learning_rate 0.0005 | warmup_steps 750 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'jln.mlpblock.gpt2', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'gpt2', 'device': device(type='cuda'), 'compile': True, 'block_size': 1024, 'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'norm_strategy': <NormalizationStrategy.LAYER_NORM: 'LayerNorm'>, 'alpha_attn': 2.0, 'alpha_mlp': 2.0}, 'n_features': (24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576, 24576), 'sae_variant': <SAEVariant.JSAE_BLOCK: 'jsae_block'>, 'top_k': (32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32), 'sae_keys': ('0_residmid', '0_residpost', '1_residmid', '1_residpost', '2_residmid', '2_residpost', '3_residmid', '3_residpost', '4_residmid', '4_residpost', '5_residmid', '5_residpost', '6_residmid', '6_residpost', '7_residmid', '7_residpost', '8_residmid', '8_residpost', '9_residmid', '9_residpost', '10_residmid', '10_residpost', '11_residmid', '11_residpost')} | trainable_layers None | loss_coefficients {'sparsity': (0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 'regularization': None, 'downstream': None, 'bandwidth': None} | |