{
  "_name_or_path": "runs/f6oomb7y/SavedModels/Model_e1_300000",
  "ap_mae_preload_name": null,
  "architectures": [
    "APMAE"
  ],
  "attention_scaler": "log_normalize",
  "base_learning_rate": 0.00015,
  "correct_only": true,
  "dataset_location": "fahamu/ioi",
  "dataset_name": "",
  "dataset_split_seed": 42,
  "dataset_test_split": "train[1000000:1010000]",
  "dataset_train_split": "train[0:1000000]",
  "decoder_device": "cuda:0",
  "decoder_dim": 512,
  "decoder_dim_head": 64,
  "decoder_heads": 8,
  "decoder_layers": 8,
  "decoder_mlp_dim": 2048,
  "encoder_device": "cuda:0",
  "encoder_dim": 512,
  "encoder_dim_head": 64,
  "encoder_dropout": 0.0,
  "encoder_emb_dropout": 0.0,
  "encoder_heads": 16,
  "encoder_layers": 24,
  "encoder_mlp_dim": 2048,
  "encoder_pool": "cls",
  "hf_home": "./huggingface",
  "hidden_act": "gelu",
  "initial_seed": 45,
  "iter_loader_workers": 8,
  "lang": "java",
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.5,
  "max_epochs": 1,
  "max_length": 32,
  "min_length": 16,
  "model_type": "ap_mae",
  "patch_size": 2,
  "qkv_bias": false,
  "queries": [
    "ABBA",
    "ABAB"
  ],
  "save_model_frequency": 15000,
  "target_model_device": "cuda:0",
  "target_model_name": "openai-community/gpt2",
  "test_batch_size": 1,
  "test_head_selection_strategy": "all",
  "torch_dtype": "float32",
  "train_batch_size": 60,
  "train_batches": 150000,
  "train_head_selection_strategy": [
    "layerwise",
    1
  ],
  "transformers_version": "4.47.1",
  "val_batches": 3840,
  "visualize_frequency": 2000,
  "visualize_norm": null
}