{
  "_name_or_path": "runs/f6oomb7y/SavedModels/Model_e1_300000",
  "ap_mae_preload_name": null,
  "architectures": [
    "APMAE"
  ],
  "attention_scaler": "log_normalize",
  "base_learning_rate": 0.00015,
  "correct_only": true,
  "dataset_location": "fahamu/ioi",
  "dataset_name": "",
  "dataset_split_seed": 42,
  "dataset_test_split": "train[1000000:1010000]",
  "dataset_train_split": "train[0:1000000]",
  "decoder_device": "cuda:0",
  "decoder_dim": 512,
  "decoder_dim_head": 64,
  "decoder_heads": 8,
  "decoder_layers": 8,
  "decoder_mlp_dim": 2048,
  "encoder_device": "cuda:0",
  "encoder_dim": 512,
  "encoder_dim_head": 64,
  "encoder_dropout": 0.0,
  "encoder_emb_dropout": 0.0,
  "encoder_heads": 16,
  "encoder_layers": 24,
  "encoder_mlp_dim": 2048,
  "encoder_pool": "cls",
  "hf_home": "./huggingface",
  "hidden_act": "gelu",
  "initial_seed": 45,
  "iter_loader_workers": 8,
  "lang": "java",
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.5,
  "max_epochs": 1,
  "max_length": 32,
  "min_length": 16,
  "model_type": "ap_mae",
  "patch_size": 2,
  "qkv_bias": false,
  "queries": [
    "ABBA",
    "ABAB"
  ],
  "save_model_frequency": 15000,
  "target_model_device": "cuda:0",
  "target_model_name": "openai-community/gpt2",
  "test_batch_size": 1,
  "test_head_selection_strategy": "all",
  "torch_dtype": "float32",
  "train_batch_size": 60,
  "train_batches": 150000,
  "train_head_selection_strategy": [
    "layerwise",
    1
  ],
  "transformers_version": "4.47.1",
  "val_batches": 3840,
  "visualize_frequency": 2000,
  "visualize_norm": null
}