diff --git a/README.md b/README.md index 655670ba66e13ad64c3051b2edc0d4902f98b4f9..d4576fe074287232d3836bf69c21d3f2593290d9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ -The LoRA-finetuned version of LLaVA v1.5 model. +--- +library_name: peft +--- +## Training procedure -Please refer to our work: +### Framework versions -The First to Know: How Token Distributions Reveal Hidden Knowledge in Large Vision-Language Models? -[Paper](https://arxiv.org/abs/2403.09037) [GitHub](https://github.com/Qinyu-Allen-Zhao/LVLM-LP) \ No newline at end of file + +- PEFT 0.4.0 diff --git a/checkpoint-1309/README.md b/checkpoint-1309/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/checkpoint-1309/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1309/adapter_config.json b/checkpoint-1309/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a764b53e4dc8b17af932aa1de32ced6a340469f0 --- /dev/null +++ b/checkpoint-1309/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1309/adapter_model.bin b/checkpoint-1309/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b209ab3f18edbd350052daa979fa3e4db90888fb --- /dev/null +++ b/checkpoint-1309/adapter_model.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:88828a7014b9d91c8cfe2aa979cae73d4c6058271feb92146cf08a3c61fedc24 +size 639786637 diff --git a/checkpoint-1309/global_step1309/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7808b7f0388cc1a5a2162e5f22f1674ca3f3320 --- /dev/null +++ b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323331559b584cd49994dca8e21f2a2deed36c28fc5edcccb4034352ba3322c4 +size 1022391865 diff --git a/checkpoint-1309/global_step1309/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fda99233c326d9b7f45dd0de9d0fb960a15365fd --- /dev/null +++ b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1760d4dbeea339cc8ca4c8d4ac6fce72e7c1d5e513fa4ef54b9f3cfaef116e3 +size 1022391865 diff --git a/checkpoint-1309/global_step1309/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b02b5011bf2d376b34a9a8747af2a283e576ae1 --- /dev/null +++ b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8d214136d86b6ef2d33e533074d2d2e8e617e3b898715c28f9b68515d4f831d +size 1022391865 diff --git a/checkpoint-1309/global_step1309/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..76a48fac06efcf5e4a9fa532a3452b1321113e49 --- /dev/null +++ b/checkpoint-1309/global_step1309/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00bf4e84b238dbb026c057590a6466f9cfb5b171aa0f325fe3e2632190c242d4 +size 1022391865 diff --git a/checkpoint-1309/global_step1309/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1309/global_step1309/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35f3b89f9130cd60dfd98e567b46fd9fe3b4b123 --- /dev/null +++ b/checkpoint-1309/global_step1309/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a60b03d11a6d8ba62089122c91fd67faec0c1e81b09a2ffcce3a6c6908d9fa59 +size 3521982567 diff --git a/checkpoint-1309/global_step1309/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1309/global_step1309/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ddd0f997e99fe3b1c8c16e9c07c76d1d2282e12 --- /dev/null +++ b/checkpoint-1309/global_step1309/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c66dba5b3e4fada79612718519fac2ffd72d5507b6cea2ef40cf9b5a3fc072ce +size 3521982567 diff --git a/checkpoint-1309/global_step1309/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1309/global_step1309/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7c9c9fc21ee222c88d4200a2f7ca16e5038a19c --- /dev/null +++ b/checkpoint-1309/global_step1309/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8267c6cf9d1cda7935b15556d2e5054d2e3bcfa4d1f81984d9e87836cf46e2 +size 3521982567 diff --git a/checkpoint-1309/global_step1309/zero_pp_rank_3_mp_rank_00_model_states.pt 
b/checkpoint-1309/global_step1309/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57e070c90976d63de3ff5dc5f429f53ad1f9612f --- /dev/null +++ b/checkpoint-1309/global_step1309/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd8a05bd7d5c30290c6afc44df374920d01dc89e9e156d7e522acdd613197f9 +size 3521982567 diff --git a/checkpoint-1309/latest b/checkpoint-1309/latest new file mode 100644 index 0000000000000000000000000000000000000000..56745852af9e1563959d9419aa2f50cefd08fd7d --- /dev/null +++ b/checkpoint-1309/latest @@ -0,0 +1 @@ +global_step1309 \ No newline at end of file diff --git a/checkpoint-1309/rng_state_0.pth b/checkpoint-1309/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf09c22723f747e6f58ddacf9fe5e20d2ef1c9b5 --- /dev/null +++ b/checkpoint-1309/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23db06bc98230cf6d720683f244ffa2b074356e7047ac3dfe97a10a96d68c8e2 +size 17655 diff --git a/checkpoint-1309/rng_state_1.pth b/checkpoint-1309/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..71d5204e69f964b9ad8b2d7382906af39c42f8df --- /dev/null +++ b/checkpoint-1309/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f5848a2b7d86463be1d519ca52701a5398f04035245a2c02f73622e09f9b25 +size 17655 diff --git a/checkpoint-1309/rng_state_2.pth b/checkpoint-1309/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b0a37802493f63fcf5d6006dd55fe435f590d31 --- /dev/null +++ b/checkpoint-1309/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65734e7384e7a4005653f863c116c14c6ea2b83edfa188081ee2c5bd555e598c +size 17655 diff --git a/checkpoint-1309/rng_state_3.pth b/checkpoint-1309/rng_state_3.pth new file mode 100644 index 
0000000000000000000000000000000000000000..e71e026d70153eadec554460f8ff9e9bad90355b --- /dev/null +++ b/checkpoint-1309/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd3a9a776579eb7bdccfea05bacb499ec24ad3ccd3557f625a9b31446412f75 +size 17655 diff --git a/checkpoint-1309/special_tokens_map.json b/checkpoint-1309/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-1309/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1309/tokenizer.model b/checkpoint-1309/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1309/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1309/tokenizer_config.json b/checkpoint-1309/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..740756b4bef305e27d0bb4d2e1a40dd8847797f7 --- /dev/null +++ b/checkpoint-1309/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false + }, + "legacy": false, + "model_max_length": 2048, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1309/trainer_state.json b/checkpoint-1309/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..740a2b84303876421bb6163261d8a8182e12920d --- /dev/null +++ b/checkpoint-1309/trainer_state.json @@ -0,0 +1,7870 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9988545246277205, + "global_step": 1309, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.946, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.060606060606061e-06, + "loss": 1.908, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1083, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.2121212121212122e-05, + "loss": 2.3218, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.8338, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.1212121212121215e-05, + "loss": 2.1332, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.8593, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.5359, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7252, + "step": 11 + }, + { + "epoch": 0.03, + "learning_rate": 3.6363636363636364e-05, + "loss": 
1.4351, + "step": 12 + }, + { + "epoch": 0.03, + "learning_rate": 3.939393939393939e-05, + "loss": 1.2774, + "step": 13 + }, + { + "epoch": 0.03, + "learning_rate": 4.242424242424243e-05, + "loss": 1.5145, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1529, + "step": 15 + }, + { + "epoch": 0.04, + "learning_rate": 4.848484848484849e-05, + "loss": 1.0047, + "step": 16 + }, + { + "epoch": 0.04, + "learning_rate": 5.151515151515152e-05, + "loss": 1.3872, + "step": 17 + }, + { + "epoch": 0.04, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1229, + "step": 18 + }, + { + "epoch": 0.04, + "learning_rate": 5.757575757575758e-05, + "loss": 1.3386, + "step": 19 + }, + { + "epoch": 0.05, + "learning_rate": 6.060606060606061e-05, + "loss": 1.2493, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1427, + "step": 21 + }, + { + "epoch": 0.05, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0895, + "step": 22 + }, + { + "epoch": 0.05, + "learning_rate": 6.96969696969697e-05, + "loss": 1.1989, + "step": 23 + }, + { + "epoch": 0.05, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0438, + "step": 24 + }, + { + "epoch": 0.06, + "learning_rate": 7.575757575757576e-05, + "loss": 1.176, + "step": 25 + }, + { + "epoch": 0.06, + "learning_rate": 7.878787878787879e-05, + "loss": 1.1372, + "step": 26 + }, + { + "epoch": 0.06, + "learning_rate": 8.181818181818183e-05, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.06, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9371, + "step": 28 + }, + { + "epoch": 0.07, + "learning_rate": 8.787878787878789e-05, + "loss": 1.2299, + "step": 29 + }, + { + "epoch": 0.07, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9441, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0011, + "step": 31 + }, + { + "epoch": 0.07, + "learning_rate": 9.696969696969698e-05, + "loss": 1.1704, + "step": 32 
+ }, + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 1.1193, + "step": 33 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010303030303030303, + "loss": 1.1559, + "step": 34 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010606060606060606, + "loss": 0.8677, + "step": 35 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010909090909090909, + "loss": 1.0865, + "step": 36 + }, + { + "epoch": 0.08, + "learning_rate": 0.00011212121212121212, + "loss": 1.0922, + "step": 37 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011515151515151516, + "loss": 0.9434, + "step": 38 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001181818181818182, + "loss": 0.9144, + "step": 39 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012121212121212122, + "loss": 0.9546, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012424242424242425, + "loss": 1.0654, + "step": 41 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012727272727272728, + "loss": 0.8077, + "step": 42 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001303030303030303, + "loss": 1.0758, + "step": 43 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013333333333333334, + "loss": 1.1512, + "step": 44 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013636363636363637, + "loss": 0.84, + "step": 45 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001393939393939394, + "loss": 1.0567, + "step": 46 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014242424242424243, + "loss": 1.0165, + "step": 47 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014545454545454546, + "loss": 0.8678, + "step": 48 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014848484848484849, + "loss": 1.055, + "step": 49 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015151515151515152, + "loss": 1.0669, + "step": 50 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015454545454545454, + "loss": 0.9915, + "step": 51 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015757575757575757, + "loss": 0.993, + "step": 52 + }, + { + "epoch": 0.12, 
+ "learning_rate": 0.0001606060606060606, + "loss": 1.1085, + "step": 53 + }, + { + "epoch": 0.12, + "learning_rate": 0.00016363636363636366, + "loss": 0.9391, + "step": 54 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001666666666666667, + "loss": 0.975, + "step": 55 + }, + { + "epoch": 0.13, + "learning_rate": 0.00016969696969696972, + "loss": 1.0697, + "step": 56 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017272727272727275, + "loss": 0.9462, + "step": 57 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017575757575757578, + "loss": 1.1209, + "step": 58 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001787878787878788, + "loss": 1.0648, + "step": 59 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018181818181818183, + "loss": 0.9964, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018484848484848484, + "loss": 0.8451, + "step": 61 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001878787878787879, + "loss": 0.8437, + "step": 62 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019090909090909092, + "loss": 1.1271, + "step": 63 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019393939393939395, + "loss": 1.161, + "step": 64 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019696969696969698, + "loss": 1.0032, + "step": 65 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999988957695886, + "loss": 0.9543, + "step": 67 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999955830807923, + "loss": 1.0274, + "step": 68 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999900619409279, + "loss": 0.9334, + "step": 69 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999982332362188, + "loss": 1.0398, + "step": 70 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999723943616433, + "loss": 0.9049, + "step": 71 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999602479612417, + "loss": 0.7452, + "step": 72 + }, + { + "epoch": 0.17, + "learning_rate": 
0.00019999458931878073, + "loss": 0.8762, + "step": 73 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999293300730427, + "loss": 1.0941, + "step": 74 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999105586535268, + "loss": 0.7713, + "step": 75 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998895789707154, + "loss": 0.9233, + "step": 76 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998663910709416, + "loss": 0.8634, + "step": 77 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998409950054146, + "loss": 0.9697, + "step": 78 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998133908302209, + "loss": 1.0816, + "step": 79 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001999783578606323, + "loss": 0.9659, + "step": 80 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997515583995603, + "loss": 0.9644, + "step": 81 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997173302806478, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996808943251773, + "loss": 1.0016, + "step": 83 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999642250613616, + "loss": 0.8951, + "step": 84 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996013992313073, + "loss": 1.0157, + "step": 85 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995583402684694, + "loss": 0.9414, + "step": 86 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995130738201966, + "loss": 0.8097, + "step": 87 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019994655999864582, + "loss": 0.8606, + "step": 88 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001999415918872098, + "loss": 1.0427, + "step": 89 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993640305868352, + "loss": 0.9578, + "step": 90 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993099352452623, + "loss": 1.1097, + "step": 91 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019992536329668478, + "loss": 0.8119, + "step": 92 + }, + { + "epoch": 0.21, + "learning_rate": 
0.00019991951238759325, + "loss": 0.9915, + "step": 93 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001999134408101731, + "loss": 0.838, + "step": 94 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990714857783326, + "loss": 0.8935, + "step": 95 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990063570446984, + "loss": 0.7914, + "step": 96 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019989390220446622, + "loss": 0.8724, + "step": 97 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019988694809269314, + "loss": 1.0374, + "step": 98 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987977338450845, + "loss": 0.9028, + "step": 99 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987237809575723, + "loss": 0.9986, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019986476224277165, + "loss": 1.113, + "step": 101 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985692584237108, + "loss": 0.8395, + "step": 102 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019984886891186184, + "loss": 1.0134, + "step": 103 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001998405914690374, + "loss": 0.8845, + "step": 104 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019983209353217812, + "loss": 0.7507, + "step": 105 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019982337512005138, + "loss": 0.9073, + "step": 106 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019981443625191148, + "loss": 0.9973, + "step": 107 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019980527694749952, + "loss": 1.0733, + "step": 108 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019979589722704346, + "loss": 0.9148, + "step": 109 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019978629711125812, + "loss": 0.8385, + "step": 110 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019977647662134488, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019976643577899195, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.26, + 
"learning_rate": 0.00019975617460637416, + "loss": 0.8754, + "step": 113 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001997456931261529, + "loss": 0.8886, + "step": 114 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019973499136147606, + "loss": 1.0058, + "step": 115 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019972406933597812, + "loss": 0.9276, + "step": 116 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019971292707377991, + "loss": 0.9922, + "step": 117 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019970156459948873, + "loss": 0.9507, + "step": 118 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996899819381981, + "loss": 0.9619, + "step": 119 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019967817911548794, + "loss": 0.8163, + "step": 120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019966615615742424, + "loss": 1.0647, + "step": 121 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996539130905593, + "loss": 0.9348, + "step": 122 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019964144994193142, + "loss": 1.0523, + "step": 123 + }, + { + "epoch": 0.28, + "learning_rate": 0.000199628766739065, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019961586350997033, + "loss": 1.0227, + "step": 125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001996027402831438, + "loss": 1.006, + "step": 126 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958939708756746, + "loss": 0.9082, + "step": 127 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957583395270923, + "loss": 0.8756, + "step": 128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995620509085228, + "loss": 0.8311, + "step": 129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954804798544745, + "loss": 1.0332, + "step": 130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019953382521440815, + "loss": 0.9427, + "step": 131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019951938262681527, + "loss": 0.838, + "step": 132 + }, + { + "epoch": 
0.3, + "learning_rate": 0.0001995047202545647, + "loss": 0.8509, + "step": 133 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019948983813003774, + "loss": 0.8944, + "step": 134 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019947473628610099, + "loss": 0.9569, + "step": 135 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019945941475610623, + "loss": 0.7805, + "step": 136 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019944387357389052, + "loss": 0.9337, + "step": 137 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994281127737759, + "loss": 0.8712, + "step": 138 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994121323905695, + "loss": 0.9264, + "step": 139 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001993959324595634, + "loss": 0.9323, + "step": 140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019937951301653444, + "loss": 0.8331, + "step": 141 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993628740977444, + "loss": 0.902, + "step": 142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993460157399396, + "loss": 0.8676, + "step": 143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019932893798035116, + "loss": 0.8525, + "step": 144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019931164085669456, + "loss": 0.8571, + "step": 145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019929412440716985, + "loss": 1.0006, + "step": 146 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019927638867046142, + "loss": 0.9849, + "step": 147 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019925843368573794, + "loss": 0.9064, + "step": 148 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992402594926523, + "loss": 0.9716, + "step": 149 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992218661313415, + "loss": 0.7553, + "step": 150 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019920325364242654, + "loss": 0.7921, + "step": 151 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019918442206701245, + "loss": 0.7994, + "step": 152 + }, + { 
+ "epoch": 0.35, + "learning_rate": 0.0001991653714466879, + "loss": 0.8296, + "step": 153 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019914610182352548, + "loss": 0.8116, + "step": 154 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019912661324008148, + "loss": 0.9844, + "step": 155 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019910690573939557, + "loss": 0.865, + "step": 156 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019908697936499103, + "loss": 0.959, + "step": 157 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019906683416087448, + "loss": 0.7727, + "step": 158 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019904647017153582, + "loss": 0.707, + "step": 159 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019902588744194813, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019900508601756756, + "loss": 0.9146, + "step": 161 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989840659443332, + "loss": 0.9571, + "step": 162 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989628272686671, + "loss": 0.8537, + "step": 163 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019894137003747403, + "loss": 0.828, + "step": 164 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019891969429814145, + "loss": 0.8055, + "step": 165 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988978000985394, + "loss": 0.8432, + "step": 166 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988756874870203, + "loss": 0.8101, + "step": 167 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019885335651241903, + "loss": 0.9072, + "step": 168 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001988308072240527, + "loss": 0.7862, + "step": 169 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019880803967172047, + "loss": 0.8303, + "step": 170 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019878505390570362, + "loss": 0.9489, + "step": 171 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001987618499767653, + "loss": 1.0125, + "step": 172 
+ }, + { + "epoch": 0.4, + "learning_rate": 0.0001987384279361505, + "loss": 0.809, + "step": 173 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019871478783558587, + "loss": 0.9488, + "step": 174 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986909297272796, + "loss": 0.9664, + "step": 175 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986668536639215, + "loss": 0.9657, + "step": 176 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001986425596986825, + "loss": 0.8123, + "step": 177 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019861804788521493, + "loss": 0.9482, + "step": 178 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019859331827765212, + "loss": 0.879, + "step": 179 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019856837093060848, + "loss": 0.896, + "step": 180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019854320589917927, + "loss": 1.0729, + "step": 181 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019851782323894042, + "loss": 0.9844, + "step": 182 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001984922230059486, + "loss": 0.9131, + "step": 183 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019846640525674082, + "loss": 0.9417, + "step": 184 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019844037004833473, + "loss": 0.9633, + "step": 185 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001984141174382279, + "loss": 0.968, + "step": 186 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019838764748439827, + "loss": 0.8447, + "step": 187 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019836096024530373, + "loss": 0.8638, + "step": 188 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019833405577988195, + "loss": 0.9346, + "step": 189 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001983069341475504, + "loss": 0.8969, + "step": 190 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019827959540820613, + "loss": 0.8499, + "step": 191 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019825203962222572, + "loss": 0.8041, + 
"step": 192 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019822426685046497, + "loss": 0.9216, + "step": 193 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019819627715425903, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198168070595422, + "loss": 0.8969, + "step": 195 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198139647236247, + "loss": 0.7949, + "step": 196 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019811100713950587, + "loss": 0.8996, + "step": 197 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019808215036844917, + "loss": 0.9118, + "step": 198 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001980530769868059, + "loss": 0.7355, + "step": 199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019802378705878354, + "loss": 0.8344, + "step": 200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019799428064906774, + "loss": 0.9639, + "step": 201 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001979645578228222, + "loss": 0.852, + "step": 202 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001979346186456887, + "loss": 0.8493, + "step": 203 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019790446318378665, + "loss": 0.851, + "step": 204 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019787409150371328, + "loss": 0.7161, + "step": 205 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019784350367254322, + "loss": 0.9846, + "step": 206 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001978126997578285, + "loss": 0.7883, + "step": 207 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019778167982759833, + "loss": 0.8691, + "step": 208 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019775044395035907, + "loss": 0.928, + "step": 209 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001977189921950939, + "loss": 0.8244, + "step": 210 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001976873246312628, + "loss": 1.0413, + "step": 211 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976554413288023, + "loss": 
0.8261, + "step": 212 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976233423581255, + "loss": 0.823, + "step": 213 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019759102779012166, + "loss": 0.9386, + "step": 214 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019755849769615628, + "loss": 0.8156, + "step": 215 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019752575214807076, + "loss": 0.8556, + "step": 216 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019749279121818235, + "loss": 0.7769, + "step": 217 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019745961497928406, + "loss": 1.0772, + "step": 218 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019742622350464418, + "loss": 0.8147, + "step": 219 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001973926168680066, + "loss": 0.9529, + "step": 220 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019735879514359018, + "loss": 0.8688, + "step": 221 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019732475840608888, + "loss": 0.9647, + "step": 222 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019729050673067156, + "loss": 0.837, + "step": 223 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019725604019298163, + "loss": 0.9211, + "step": 224 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019722135886913715, + "loss": 0.9434, + "step": 225 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001971864628357304, + "loss": 0.6506, + "step": 226 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019715135216982798, + "loss": 0.8052, + "step": 227 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019711602694897037, + "loss": 0.7852, + "step": 228 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019708048725117192, + "loss": 0.9283, + "step": 229 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001970447331549207, + "loss": 0.9081, + "step": 230 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019700876473917824, + "loss": 0.9036, + "step": 231 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019697258208337934, 
+ "loss": 0.716, + "step": 232 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019693618526743197, + "loss": 0.8192, + "step": 233 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968995743717171, + "loss": 0.9773, + "step": 234 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019686274947708848, + "loss": 0.8698, + "step": 235 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968257106648724, + "loss": 0.9062, + "step": 236 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019678845801686764, + "loss": 0.8984, + "step": 237 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019675099161534521, + "loss": 0.8087, + "step": 238 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019671331154304822, + "loss": 0.8272, + "step": 239 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019667541788319162, + "loss": 0.784, + "step": 240 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019663731071946206, + "loss": 0.8777, + "step": 241 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019659899013601772, + "loss": 0.8534, + "step": 242 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019656045621748808, + "loss": 0.9645, + "step": 243 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019652170904897387, + "loss": 0.9692, + "step": 244 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019648274871604662, + "loss": 0.838, + "step": 245 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019644357530474872, + "loss": 0.7445, + "step": 246 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001964041889015931, + "loss": 0.9065, + "step": 247 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019636458959356316, + "loss": 0.7806, + "step": 248 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019632477746811232, + "loss": 0.7971, + "step": 249 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019628475261316417, + "loss": 0.8409, + "step": 250 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019624451511711198, + "loss": 0.7432, + "step": 251 + }, + { + "epoch": 0.58, + "learning_rate": 
0.00019620406506881875, + "loss": 0.9096, + "step": 252 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019616340255761676, + "loss": 0.8004, + "step": 253 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019612252767330763, + "loss": 0.7978, + "step": 254 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001960814405061619, + "loss": 0.9535, + "step": 255 + }, + { + "epoch": 0.59, + "learning_rate": 0.000196040141146919, + "loss": 0.9945, + "step": 256 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001959986296867869, + "loss": 0.9703, + "step": 257 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019595690621744208, + "loss": 0.9639, + "step": 258 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019591497083102914, + "loss": 0.9312, + "step": 259 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019587282362016083, + "loss": 0.7709, + "step": 260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001958304646779175, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019578789409784727, + "loss": 0.8081, + "step": 262 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019574511197396563, + "loss": 0.8476, + "step": 263 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019570211840075517, + "loss": 0.9658, + "step": 264 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019565891347316552, + "loss": 0.7778, + "step": 265 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001956154972866131, + "loss": 0.9926, + "step": 266 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001955718699369808, + "loss": 0.957, + "step": 267 + }, + { + "epoch": 0.61, + "learning_rate": 0.000195528031520618, + "loss": 0.9396, + "step": 268 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019548398213434007, + "loss": 0.9049, + "step": 269 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019543972187542833, + "loss": 0.9683, + "step": 270 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019539525084162992, + "loss": 0.8555, + "step": 271 + }, + { + "epoch": 0.62, + 
"learning_rate": 0.00019535056913115725, + "loss": 0.8489, + "step": 272 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001953056768426882, + "loss": 0.8728, + "step": 273 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019526057407536564, + "loss": 0.9443, + "step": 274 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019521526092879725, + "loss": 0.8161, + "step": 275 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019516973750305532, + "loss": 0.8936, + "step": 276 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019512400389867657, + "loss": 0.8315, + "step": 277 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019507806021666188, + "loss": 0.9298, + "step": 278 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019503190655847604, + "loss": 0.8235, + "step": 279 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019498554302604766, + "loss": 0.9245, + "step": 280 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001949389697217687, + "loss": 0.8302, + "step": 281 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019489218674849455, + "loss": 0.8488, + "step": 282 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019484519420954354, + "loss": 0.8177, + "step": 283 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019479799220869682, + "loss": 1.0039, + "step": 284 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019475058085019825, + "loss": 0.7685, + "step": 285 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019470296023875387, + "loss": 0.9174, + "step": 286 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194655130479532, + "loss": 1.0997, + "step": 287 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019460709167816274, + "loss": 0.9759, + "step": 288 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001945588439407379, + "loss": 0.9397, + "step": 289 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019451038737381077, + "loss": 1.0367, + "step": 290 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019446172208439574, + "loss": 0.8298, + "step": 291 + }, + { + 
"epoch": 0.67, + "learning_rate": 0.0001944128481799682, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019436376576846423, + "loss": 1.1234, + "step": 293 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431447495828045, + "loss": 0.9103, + "step": 294 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001942649758582737, + "loss": 0.7841, + "step": 295 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019421526857776072, + "loss": 0.8817, + "step": 296 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416535322651818, + "loss": 1.0682, + "step": 297 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019411522991478214, + "loss": 0.9201, + "step": 298 + }, + { + "epoch": 0.68, + "learning_rate": 0.000194064898753248, + "loss": 4.1834, + "step": 299 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019401435985307012, + "loss": 1.0391, + "step": 300 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019396361332586166, + "loss": 2.5015, + "step": 301 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001939126592836944, + "loss": 0.7927, + "step": 302 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001938614978390983, + "loss": 2.2345, + "step": 303 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019381012910506146, + "loss": 0.9311, + "step": 304 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019375855319502962, + "loss": 0.9713, + "step": 305 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019370677022290624, + "loss": 0.8967, + "step": 306 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019365478030305196, + "loss": 3.095, + "step": 307 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001936025835502845, + "loss": 1.1008, + "step": 308 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001935501800798783, + "loss": 1.5409, + "step": 309 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019349757000756444, + "loss": 1.02, + "step": 310 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019344475344953012, + "loss": 1.0101, + "step": 311 + }, 
+ { + "epoch": 0.71, + "learning_rate": 0.0001933917305224187, + "loss": 0.7686, + "step": 312 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001933385013433292, + "loss": 1.1061, + "step": 313 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932850660298162, + "loss": 0.8083, + "step": 314 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932314246998895, + "loss": 1.1942, + "step": 315 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019317757747201384, + "loss": 0.8551, + "step": 316 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019312352446510878, + "loss": 0.9049, + "step": 317 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019306926579854821, + "loss": 0.7072, + "step": 318 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019301480159216028, + "loss": 0.8552, + "step": 319 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019296013196622706, + "loss": 0.8414, + "step": 320 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001929052570414843, + "loss": 0.9198, + "step": 321 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019285017693912107, + "loss": 2.1953, + "step": 322 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019279489178077969, + "loss": 0.851, + "step": 323 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019273940168855518, + "loss": 1.0239, + "step": 324 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019268370678499533, + "loss": 1.5125, + "step": 325 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019262780719310008, + "loss": 0.9171, + "step": 326 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019257170303632148, + "loss": 0.9794, + "step": 327 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019251539443856344, + "loss": 0.9023, + "step": 328 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019245888152418124, + "loss": 1.058, + "step": 329 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019240216441798142, + "loss": 0.9411, + "step": 330 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001923452432452215, + "loss": 1.197, + 
"step": 331 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922881181316097, + "loss": 0.9253, + "step": 332 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922307892033046, + "loss": 1.156, + "step": 333 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019217325658691482, + "loss": 0.9424, + "step": 334 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019211552040949891, + "loss": 1.1147, + "step": 335 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019205758079856498, + "loss": 0.8528, + "step": 336 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001919994378820704, + "loss": 0.8105, + "step": 337 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019194109178842153, + "loss": 0.9279, + "step": 338 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019188254264647337, + "loss": 0.9231, + "step": 339 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019182379058552948, + "loss": 1.0425, + "step": 340 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019176483573534142, + "loss": 0.8794, + "step": 341 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019170567822610873, + "loss": 0.9873, + "step": 342 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001916463181884784, + "loss": 0.8146, + "step": 343 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019158675575354478, + "loss": 1.027, + "step": 344 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019152699105284913, + "loss": 0.8093, + "step": 345 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001914670242183795, + "loss": 0.951, + "step": 346 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019140685538257028, + "loss": 0.9268, + "step": 347 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019134648467830198, + "loss": 1.0205, + "step": 348 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019128591223890092, + "loss": 0.9043, + "step": 349 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019122513819813902, + "loss": 0.7387, + "step": 350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001911641626902333, + "loss": 
0.9422, + "step": 351 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019110298584984578, + "loss": 0.9015, + "step": 352 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001910416078120832, + "loss": 0.7522, + "step": 353 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019098002871249646, + "loss": 0.9722, + "step": 354 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001909182486870806, + "loss": 0.8358, + "step": 355 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019085626787227443, + "loss": 0.9859, + "step": 356 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019079408640496013, + "loss": 0.7796, + "step": 357 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019073170442246302, + "loss": 0.8617, + "step": 358 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906691220625513, + "loss": 0.7727, + "step": 359 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906063394634356, + "loss": 0.8786, + "step": 360 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001905433567637689, + "loss": 0.9117, + "step": 361 + }, + { + "epoch": 0.83, + "learning_rate": 0.000190480174102646, + "loss": 0.9182, + "step": 362 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001904167916196033, + "loss": 0.9706, + "step": 363 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001903532094546186, + "loss": 0.8036, + "step": 364 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001902894277481105, + "loss": 0.902, + "step": 365 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019022544664093854, + "loss": 0.9231, + "step": 366 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019016126627440237, + "loss": 0.9751, + "step": 367 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001900968867902419, + "loss": 0.8373, + "step": 368 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001900323083306367, + "loss": 0.8695, + "step": 369 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001899675310382057, + "loss": 0.8654, + "step": 370 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018990255505600706, + 
"loss": 0.98, + "step": 371 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018983738052753767, + "loss": 0.7454, + "step": 372 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018977200759673295, + "loss": 0.829, + "step": 373 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018970643640796642, + "loss": 0.8262, + "step": 374 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001896406671060495, + "loss": 1.0659, + "step": 375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018957469983623112, + "loss": 0.8551, + "step": 376 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018950853474419742, + "loss": 0.7991, + "step": 377 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001894421719760714, + "loss": 0.8662, + "step": 378 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018937561167841263, + "loss": 0.8817, + "step": 379 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018930885399821693, + "loss": 1.0894, + "step": 380 + }, + { + "epoch": 0.87, + "learning_rate": 0.000189241899082916, + "loss": 0.8225, + "step": 381 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018917474708037718, + "loss": 0.9065, + "step": 382 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018910739813890302, + "loss": 0.8779, + "step": 383 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018903985240723104, + "loss": 0.7909, + "step": 384 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018897211003453328, + "loss": 0.7649, + "step": 385 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018890417117041619, + "loss": 0.9788, + "step": 386 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018883603596492004, + "loss": 0.938, + "step": 387 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018876770456851877, + "loss": 0.9032, + "step": 388 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018869917713211964, + "loss": 0.9059, + "step": 389 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018863045380706274, + "loss": 0.8896, + "step": 390 + }, + { + "epoch": 0.9, + "learning_rate": 
0.0001885615347451209, + "loss": 0.7614, + "step": 391 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884924200984991, + "loss": 0.978, + "step": 392 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884231100198344, + "loss": 0.9406, + "step": 393 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018835360466219533, + "loss": 0.7555, + "step": 394 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001882839041790818, + "loss": 0.9049, + "step": 395 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018821400872442458, + "loss": 0.7041, + "step": 396 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018814391845258505, + "loss": 0.8995, + "step": 397 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001880736335183548, + "loss": 0.7461, + "step": 398 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018800315407695539, + "loss": 0.9954, + "step": 399 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018793248028403788, + "loss": 0.9035, + "step": 400 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001878616122956826, + "loss": 0.9083, + "step": 401 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018779055026839868, + "loss": 0.7286, + "step": 402 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001877192943591239, + "loss": 0.8001, + "step": 403 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018764784472522403, + "loss": 0.8795, + "step": 404 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001875762015244929, + "loss": 0.8912, + "step": 405 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018750436491515163, + "loss": 0.8848, + "step": 406 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018743233505584862, + "loss": 0.8512, + "step": 407 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018736011210565898, + "loss": 0.8537, + "step": 408 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018728769622408423, + "loss": 0.8777, + "step": 409 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018721508757105202, + "loss": 0.7849, + "step": 410 + }, + { + "epoch": 0.94, + 
"learning_rate": 0.00018714228630691576, + "loss": 0.9669, + "step": 411 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001870692925924541, + "loss": 0.9299, + "step": 412 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018699610658887088, + "loss": 1.0188, + "step": 413 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018692272845779448, + "loss": 0.8388, + "step": 414 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018684915836127765, + "loss": 0.7904, + "step": 415 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018677539646179707, + "loss": 0.9689, + "step": 416 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018670144292225297, + "loss": 0.7339, + "step": 417 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018662729790596888, + "loss": 0.7894, + "step": 418 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018655296157669117, + "loss": 0.7163, + "step": 419 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018647843409858869, + "loss": 0.8642, + "step": 420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018640371563625246, + "loss": 0.9281, + "step": 421 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018632880635469526, + "loss": 0.834, + "step": 422 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018625370641935129, + "loss": 0.7316, + "step": 423 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018617841599607586, + "loss": 0.8504, + "step": 424 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018610293525114492, + "loss": 0.8731, + "step": 425 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018602726435125474, + "loss": 0.8803, + "step": 426 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001859514034635215, + "loss": 0.8417, + "step": 427 + }, + { + "epoch": 0.98, + "learning_rate": 0.000185875352755481, + "loss": 0.8947, + "step": 428 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018579911239508827, + "loss": 0.8368, + "step": 429 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018572268255071718, + "loss": 0.8231, + "step": 430 + }, + { + 
"epoch": 0.99, + "learning_rate": 0.00018564606339116, + "loss": 0.8576, + "step": 431 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001855692550856272, + "loss": 0.8753, + "step": 432 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018549225780374685, + "loss": 0.7778, + "step": 433 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018541507171556445, + "loss": 0.7516, + "step": 434 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001853376969915425, + "loss": 0.7466, + "step": 435 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018526013380255999, + "loss": 0.917, + "step": 436 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018518238231991218, + "loss": 0.9042, + "step": 437 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018510444271531022, + "loss": 0.8587, + "step": 438 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018502631516088066, + "loss": 0.9001, + "step": 439 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001849479998291651, + "loss": 0.7977, + "step": 440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018486949689311993, + "loss": 0.8711, + "step": 441 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018479080652611583, + "loss": 0.7192, + "step": 442 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001847119289019373, + "loss": 0.9608, + "step": 443 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018463286419478255, + "loss": 0.7097, + "step": 444 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001845536125792629, + "loss": 0.7354, + "step": 445 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001844741742304024, + "loss": 0.8711, + "step": 446 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018439454932363755, + "loss": 0.8832, + "step": 447 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018431473803481684, + "loss": 0.932, + "step": 448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018423474054020034, + "loss": 0.8394, + "step": 449 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018415455701645942, + "loss": 0.7698, + "step": 450 + }, 
+ { + "epoch": 1.03, + "learning_rate": 0.00018407418764067627, + "loss": 0.8856, + "step": 451 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018399363259034347, + "loss": 0.8529, + "step": 452 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018391289204336368, + "loss": 0.9898, + "step": 453 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018383196617804926, + "loss": 0.8312, + "step": 454 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018375085517312182, + "loss": 0.8234, + "step": 455 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018366955920771184, + "loss": 0.7871, + "step": 456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018358807846135825, + "loss": 0.9814, + "step": 457 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018350641311400812, + "loss": 0.8183, + "step": 458 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001834245633460161, + "loss": 0.8961, + "step": 459 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018334252933814427, + "loss": 0.9166, + "step": 460 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018326031127156148, + "loss": 1.0031, + "step": 461 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018317790932784317, + "loss": 0.8171, + "step": 462 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001830953236889707, + "loss": 0.83, + "step": 463 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018301255453733134, + "loss": 0.8134, + "step": 464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001829296020557174, + "loss": 0.8561, + "step": 465 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001828464664273263, + "loss": 0.8669, + "step": 466 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001827631478357597, + "loss": 1.003, + "step": 467 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018267964646502357, + "loss": 0.8715, + "step": 468 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018259596249952731, + "loss": 0.7434, + "step": 469 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018251209612408373, + "loss": 0.9163, + 
"step": 470 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018242804752390844, + "loss": 1.0639, + "step": 471 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018234381688461942, + "loss": 0.8266, + "step": 472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018225940439223684, + "loss": 0.7582, + "step": 473 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001821748102331823, + "loss": 0.8547, + "step": 474 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001820900345942787, + "loss": 0.7908, + "step": 475 + }, + { + "epoch": 1.09, + "learning_rate": 0.00018200507766274977, + "loss": 0.6203, + "step": 476 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001819199396262195, + "loss": 0.806, + "step": 477 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001818346206727119, + "loss": 0.8016, + "step": 478 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001817491209906506, + "loss": 0.8548, + "step": 479 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018166344076885827, + "loss": 0.9194, + "step": 480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018157758019655634, + "loss": 0.8704, + "step": 481 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018149153946336446, + "loss": 0.8373, + "step": 482 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001814053187593003, + "loss": 0.8229, + "step": 483 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018131891827477884, + "loss": 0.8289, + "step": 484 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018123233820061218, + "loss": 0.7753, + "step": 485 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018114557872800905, + "loss": 1.029, + "step": 486 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001810586400485743, + "loss": 0.6198, + "step": 487 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001809715223543087, + "loss": 0.8418, + "step": 488 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018088422583760813, + "loss": 0.7421, + "step": 489 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001807967506912636, + "loss": 
0.8032, + "step": 490 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018070909710846052, + "loss": 0.7956, + "step": 491 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018062126528277844, + "loss": 0.9013, + "step": 492 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018053325540819045, + "loss": 0.9582, + "step": 493 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018044506767906295, + "loss": 0.6845, + "step": 494 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018035670229015507, + "loss": 0.8731, + "step": 495 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001802681594366183, + "loss": 0.8369, + "step": 496 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018017943931399603, + "loss": 0.6557, + "step": 497 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018009054211822324, + "loss": 0.7997, + "step": 498 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001800014680456259, + "loss": 0.8348, + "step": 499 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001799122172929206, + "loss": 0.9043, + "step": 500 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017982279005721407, + "loss": 0.8499, + "step": 501 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017973318653600293, + "loss": 0.8595, + "step": 502 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017964340692717303, + "loss": 0.9468, + "step": 503 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001795534514289991, + "loss": 0.9848, + "step": 504 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017946332024014434, + "loss": 0.7326, + "step": 505 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017937301355965996, + "loss": 0.8479, + "step": 506 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017928253158698473, + "loss": 0.8669, + "step": 507 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017919187452194454, + "loss": 0.8163, + "step": 508 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017910104256475194, + "loss": 0.926, + "step": 509 + }, + { + "epoch": 1.17, + "learning_rate": 
0.00017901003591600575, + "loss": 0.7956, + "step": 510 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017891885477669064, + "loss": 0.9002, + "step": 511 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017882749934817652, + "loss": 0.787, + "step": 512 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017873596983221832, + "loss": 0.7519, + "step": 513 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001786442664309554, + "loss": 0.8067, + "step": 514 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017855238934691108, + "loss": 0.8824, + "step": 515 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001784603387829923, + "loss": 0.8014, + "step": 516 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017836811494248919, + "loss": 0.6672, + "step": 517 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017827571802907444, + "loss": 0.8516, + "step": 518 + }, + { + "epoch": 1.19, + "learning_rate": 0.000178183148246803, + "loss": 0.8476, + "step": 519 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017809040580011164, + "loss": 0.8493, + "step": 520 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001779974908938184, + "loss": 0.7288, + "step": 521 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017790440373312223, + "loss": 0.7443, + "step": 522 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017781114452360245, + "loss": 0.8767, + "step": 523 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017771771347121842, + "loss": 0.8025, + "step": 524 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001776241107823089, + "loss": 0.8842, + "step": 525 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017753033666359177, + "loss": 0.9648, + "step": 526 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017743639132216353, + "loss": 0.7872, + "step": 527 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001773422749654988, + "loss": 0.9122, + "step": 528 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017724798780144983, + "loss": 0.7688, + "step": 529 + }, + { + "epoch": 1.21, + 
"learning_rate": 0.0001771535300382461, + "loss": 0.8938, + "step": 530 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017705890188449394, + "loss": 0.7152, + "step": 531 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001769641035491759, + "loss": 0.7077, + "step": 532 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017686913524165036, + "loss": 0.8872, + "step": 533 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017677399717165116, + "loss": 0.8775, + "step": 534 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017667868954928694, + "loss": 0.8508, + "step": 535 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017658321258504092, + "loss": 0.8589, + "step": 536 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017648756648977018, + "loss": 0.6499, + "step": 537 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017639175147470538, + "loss": 0.8927, + "step": 538 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017629576775145026, + "loss": 0.8702, + "step": 539 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017619961553198108, + "loss": 0.7958, + "step": 540 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017610329502864625, + "loss": 0.8582, + "step": 541 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017600680645416583, + "loss": 0.7905, + "step": 542 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001759101500216311, + "loss": 0.7574, + "step": 543 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017581332594450392, + "loss": 0.861, + "step": 544 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017571633443661658, + "loss": 0.7682, + "step": 545 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017561917571217093, + "loss": 0.7547, + "step": 546 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017552184998573825, + "loss": 0.7852, + "step": 547 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001754243574722586, + "loss": 0.7635, + "step": 548 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017532669838704035, + "loss": 0.8714, + "step": 549 + }, + { + 
"epoch": 1.26, + "learning_rate": 0.00017522887294575977, + "loss": 0.7839, + "step": 550 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017513088136446054, + "loss": 0.8551, + "step": 551 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503272385955318, + "loss": 0.7367, + "step": 552 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017493440064781475, + "loss": 0.9257, + "step": 553 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017483591194638817, + "loss": 0.8246, + "step": 554 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017473725797278192, + "loss": 0.8319, + "step": 555 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463843894486937, + "loss": 0.8304, + "step": 556 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017453945508088853, + "loss": 0.6536, + "step": 557 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017444030659944138, + "loss": 0.7606, + "step": 558 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017434099371949345, + "loss": 0.7084, + "step": 559 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017424151666037329, + "loss": 0.8891, + "step": 560 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017414187564177217, + "loss": 0.6199, + "step": 561 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017404207088374333, + "loss": 0.8676, + "step": 562 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001739421026067017, + "loss": 0.8477, + "step": 563 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017384197103142328, + "loss": 0.9234, + "step": 564 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001737416763790447, + "loss": 0.9103, + "step": 565 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017364121887106286, + "loss": 0.7859, + "step": 566 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017354059872933415, + "loss": 0.8623, + "step": 567 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017343981617607424, + "loss": 0.6266, + "step": 568 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017333887143385743, + "loss": 0.8105, + "step": 
569 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017323776472561627, + "loss": 0.7752, + "step": 570 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001731364962746409, + "loss": 0.7873, + "step": 571 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001730350663045788, + "loss": 0.8425, + "step": 572 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017293347503943406, + "loss": 0.777, + "step": 573 + }, + { + "epoch": 1.32, + "learning_rate": 0.000172831722703567, + "loss": 0.7348, + "step": 574 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017272980952169365, + "loss": 0.7797, + "step": 575 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001726277357188853, + "loss": 0.8328, + "step": 576 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017252550152056795, + "loss": 0.7109, + "step": 577 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001724231071525218, + "loss": 0.7905, + "step": 578 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017232055284088085, + "loss": 0.7541, + "step": 579 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001722178388121322, + "loss": 0.8954, + "step": 580 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017211496529311582, + "loss": 0.8362, + "step": 581 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017201193251102382, + "loss": 0.8436, + "step": 582 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017190874069340014, + "loss": 0.7594, + "step": 583 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001718053900681397, + "loss": 0.9342, + "step": 584 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017170188086348848, + "loss": 0.8934, + "step": 585 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017159821330804236, + "loss": 0.831, + "step": 586 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001714943876307472, + "loss": 0.8053, + "step": 587 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017139040406089786, + "loss": 0.81, + "step": 588 + }, + { + "epoch": 1.35, + "learning_rate": 0.000171286262828138, + "loss": 0.8245, + 
"step": 589 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017118196416245947, + "loss": 0.8232, + "step": 590 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017107750829420176, + "loss": 0.8244, + "step": 591 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001709728954540516, + "loss": 0.7863, + "step": 592 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017086812587304234, + "loss": 0.8274, + "step": 593 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017076319978255345, + "loss": 0.6595, + "step": 594 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001706581174143101, + "loss": 0.8582, + "step": 595 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017055287900038263, + "loss": 0.6873, + "step": 596 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017044748477318593, + "loss": 0.8673, + "step": 597 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017034193496547902, + "loss": 0.8055, + "step": 598 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017023622981036455, + "loss": 0.8232, + "step": 599 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001701303695412881, + "loss": 0.8745, + "step": 600 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017002435439203808, + "loss": 0.8034, + "step": 601 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016991818459674468, + "loss": 0.9006, + "step": 602 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001698118603898798, + "loss": 0.7828, + "step": 603 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016970538200625622, + "loss": 0.8413, + "step": 604 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016959874968102735, + "loss": 0.8669, + "step": 605 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016949196364968646, + "loss": 0.9277, + "step": 606 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016938502414806634, + "loss": 0.9256, + "step": 607 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016927793141233868, + "loss": 0.8613, + "step": 608 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016917068567901358, + 
"loss": 0.9439, + "step": 609 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016906328718493906, + "loss": 0.8606, + "step": 610 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016895573616730044, + "loss": 0.7483, + "step": 611 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016884803286362, + "loss": 0.8359, + "step": 612 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001687401775117562, + "loss": 0.7764, + "step": 613 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016863217034990342, + "loss": 0.9857, + "step": 614 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001685240116165912, + "loss": 0.8706, + "step": 615 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001684157015506839, + "loss": 0.867, + "step": 616 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016830724039138003, + "loss": 0.7974, + "step": 617 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016819862837821181, + "loss": 0.7835, + "step": 618 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016808986575104465, + "loss": 0.7987, + "step": 619 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001679809527500765, + "loss": 0.7383, + "step": 620 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001678718896158375, + "loss": 0.9224, + "step": 621 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016776267658918928, + "loss": 0.8959, + "step": 622 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016765331391132456, + "loss": 0.6702, + "step": 623 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001675438018237665, + "loss": 0.6911, + "step": 624 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016743414056836825, + "loss": 0.9364, + "step": 625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016732433038731242, + "loss": 0.7902, + "step": 626 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016721437152311054, + "loss": 0.8473, + "step": 627 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016710426421860235, + "loss": 0.8765, + "step": 628 + }, + { + "epoch": 1.44, + "learning_rate": 
0.00016699400871695555, + "loss": 0.7705, + "step": 629 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016688360526166514, + "loss": 0.8653, + "step": 630 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001667730540965528, + "loss": 0.9137, + "step": 631 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016666235546576648, + "loss": 0.9772, + "step": 632 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001665515096137797, + "loss": 0.6433, + "step": 633 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001664405167853912, + "loss": 0.8096, + "step": 634 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016632937722572434, + "loss": 0.7298, + "step": 635 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016621809118022647, + "loss": 0.6841, + "step": 636 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016610665889466838, + "loss": 0.9471, + "step": 637 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016599508061514404, + "loss": 0.8396, + "step": 638 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016588335658806962, + "loss": 0.8769, + "step": 639 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016577148706018328, + "loss": 0.8328, + "step": 640 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001656594722785445, + "loss": 0.8932, + "step": 641 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001655473124905335, + "loss": 0.8203, + "step": 642 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016543500794385084, + "loss": 0.8514, + "step": 643 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016532255888651666, + "loss": 0.7396, + "step": 644 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016520996556687028, + "loss": 0.9178, + "step": 645 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001650972282335697, + "loss": 0.6308, + "step": 646 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016498434713559088, + "loss": 0.9018, + "step": 647 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016487132252222727, + "loss": 0.8658, + "step": 648 + }, + { + "epoch": 1.49, + 
"learning_rate": 0.00016475815464308933, + "loss": 0.8228, + "step": 649 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001646448437481039, + "loss": 0.8944, + "step": 650 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001645313900875136, + "loss": 0.8617, + "step": 651 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016441779391187646, + "loss": 0.9726, + "step": 652 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016430405547206516, + "loss": 0.693, + "step": 653 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016419017501926656, + "loss": 0.8272, + "step": 654 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016407615280498124, + "loss": 0.8523, + "step": 655 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016396198908102272, + "loss": 0.7444, + "step": 656 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016384768409951714, + "loss": 0.8366, + "step": 657 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001637332381129026, + "loss": 0.7441, + "step": 658 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016361865137392854, + "loss": 0.6694, + "step": 659 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001635039241356553, + "loss": 0.8103, + "step": 660 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001633890566514535, + "loss": 0.9135, + "step": 661 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016327404917500346, + "loss": 0.7327, + "step": 662 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016315890196029467, + "loss": 0.8425, + "step": 663 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016304361526162534, + "loss": 0.8812, + "step": 664 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016292818933360151, + "loss": 0.777, + "step": 665 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001628126244311369, + "loss": 0.8864, + "step": 666 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016269692080945198, + "loss": 0.9333, + "step": 667 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016258107872407375, + "loss": 0.906, + "step": 668 + }, + { + "epoch": 
1.53, + "learning_rate": 0.00016246509843083492, + "loss": 0.7346, + "step": 669 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016234898018587337, + "loss": 0.8555, + "step": 670 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016223272424563173, + "loss": 0.8449, + "step": 671 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016211633086685664, + "loss": 0.8559, + "step": 672 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016199980030659838, + "loss": 0.7468, + "step": 673 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016188313282221008, + "loss": 0.7986, + "step": 674 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001617663286713474, + "loss": 0.7757, + "step": 675 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016164938811196757, + "loss": 0.8789, + "step": 676 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016153231140232936, + "loss": 0.5499, + "step": 677 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016141509880099206, + "loss": 0.9319, + "step": 678 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016129775056681513, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001611802669589575, + "loss": 0.8506, + "step": 680 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016106264823687716, + "loss": 0.7242, + "step": 681 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016094489466033043, + "loss": 0.6808, + "step": 682 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016082700648937146, + "loss": 0.8017, + "step": 683 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016070898398435167, + "loss": 0.9109, + "step": 684 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016059082740591915, + "loss": 0.7277, + "step": 685 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016047253701501808, + "loss": 0.8601, + "step": 686 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016035411307288813, + "loss": 0.9118, + "step": 687 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001602355558410639, + "loss": 0.8049, + "step": 688 + 
}, + { + "epoch": 1.58, + "learning_rate": 0.00016011686558137448, + "loss": 0.8174, + "step": 689 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015999804255594258, + "loss": 0.8481, + "step": 690 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001598790870271843, + "loss": 0.7052, + "step": 691 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015975999925780813, + "loss": 0.8208, + "step": 692 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015964077951081485, + "loss": 0.7257, + "step": 693 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015952142804949652, + "loss": 0.858, + "step": 694 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015940194513743624, + "loss": 0.9242, + "step": 695 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001592823310385073, + "loss": 0.7924, + "step": 696 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015916258601687274, + "loss": 0.8788, + "step": 697 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001590427103369848, + "loss": 0.7946, + "step": 698 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015892270426358414, + "loss": 0.8318, + "step": 699 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015880256806169953, + "loss": 0.8983, + "step": 700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015868230199664711, + "loss": 0.8889, + "step": 701 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015856190633402968, + "loss": 0.9692, + "step": 702 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001584413813397364, + "loss": 0.7787, + "step": 703 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015832072727994193, + "loss": 0.6455, + "step": 704 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015819994442110616, + "loss": 1.0006, + "step": 705 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015807903302997317, + "loss": 0.7384, + "step": 706 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015795799337357114, + "loss": 0.8517, + "step": 707 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015783682571921133, + "loss": 0.8446, + 
"step": 708 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015771553033448775, + "loss": 0.8227, + "step": 709 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015759410748727662, + "loss": 0.8374, + "step": 710 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001574725574457354, + "loss": 0.7274, + "step": 711 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015735088047830268, + "loss": 0.8728, + "step": 712 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015722907685369723, + "loss": 1.0569, + "step": 713 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015710714684091762, + "loss": 0.9775, + "step": 714 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001569850907092415, + "loss": 0.6832, + "step": 715 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015686290872822504, + "loss": 0.7358, + "step": 716 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015674060116770236, + "loss": 0.9015, + "step": 717 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015661816829778494, + "loss": 0.8516, + "step": 718 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015649561038886094, + "loss": 0.8911, + "step": 719 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015637292771159472, + "loss": 0.7098, + "step": 720 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015625012053692615, + "loss": 0.955, + "step": 721 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001561271891360701, + "loss": 0.6421, + "step": 722 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001560041337805157, + "loss": 0.8807, + "step": 723 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015588095474202595, + "loss": 0.722, + "step": 724 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015575765229263686, + "loss": 0.8055, + "step": 725 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015563422670465712, + "loss": 0.7822, + "step": 726 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015551067825066728, + "loss": 0.8311, + "step": 727 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015538700720351924, + 
"loss": 0.8519, + "step": 728 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015526321383633568, + "loss": 0.7506, + "step": 729 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001551392984225094, + "loss": 0.8056, + "step": 730 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015501526123570277, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 1.68, + "learning_rate": 0.000154891102549847, + "loss": 0.829, + "step": 732 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001547668226391417, + "loss": 0.6682, + "step": 733 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015464242177805422, + "loss": 0.8295, + "step": 734 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015451790024131895, + "loss": 0.6911, + "step": 735 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015439325830393687, + "loss": 0.6785, + "step": 736 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015426849624117472, + "loss": 0.81, + "step": 737 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015414361432856475, + "loss": 0.9955, + "step": 738 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015401861284190368, + "loss": 0.8433, + "step": 739 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015389349205725242, + "loss": 0.618, + "step": 740 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015376825225093537, + "loss": 0.7747, + "step": 741 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015364289369953967, + "loss": 0.7673, + "step": 742 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001535174166799148, + "loss": 0.8066, + "step": 743 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015339182146917183, + "loss": 0.8392, + "step": 744 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001532661083446829, + "loss": 0.7949, + "step": 745 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015314027758408044, + "loss": 0.8698, + "step": 746 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015301432946525684, + "loss": 0.7715, + "step": 747 + }, + { + "epoch": 1.71, + "learning_rate": 
0.00015288826426636354, + "loss": 0.7583, + "step": 748 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015276208226581064, + "loss": 0.8544, + "step": 749 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015263578374226605, + "loss": 0.8272, + "step": 750 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001525093689746552, + "loss": 0.857, + "step": 751 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015238283824216015, + "loss": 0.9208, + "step": 752 + }, + { + "epoch": 1.73, + "learning_rate": 0.000152256191824219, + "loss": 0.8626, + "step": 753 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015212943000052545, + "loss": 0.9418, + "step": 754 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015200255305102803, + "loss": 0.8087, + "step": 755 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015187556125592945, + "loss": 0.7913, + "step": 756 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015174845489568622, + "loss": 0.8973, + "step": 757 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015162123425100762, + "loss": 0.701, + "step": 758 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015149389960285558, + "loss": 0.898, + "step": 759 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015136645123244366, + "loss": 0.8809, + "step": 760 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015123888942123652, + "loss": 0.7334, + "step": 761 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001511112144509495, + "loss": 0.8506, + "step": 762 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015098342660354775, + "loss": 0.8469, + "step": 763 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001508555261612457, + "loss": 1.0353, + "step": 764 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001507275134065065, + "loss": 0.6269, + "step": 765 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015059938862204127, + "loss": 0.7825, + "step": 766 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001504711520908086, + "loss": 0.8388, + "step": 767 + }, + { + "epoch": 1.76, + 
"learning_rate": 0.00015034280409601385, + "loss": 0.7383, + "step": 768 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015021434492110852, + "loss": 0.8029, + "step": 769 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015008577484978966, + "loss": 0.6527, + "step": 770 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014995709416599926, + "loss": 0.9434, + "step": 771 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014982830315392358, + "loss": 0.753, + "step": 772 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014969940209799248, + "loss": 0.8143, + "step": 773 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014957039128287892, + "loss": 0.8939, + "step": 774 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001494412709934982, + "loss": 0.9265, + "step": 775 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014931204151500747, + "loss": 0.8261, + "step": 776 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014918270313280495, + "loss": 0.8555, + "step": 777 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014905325613252937, + "loss": 0.8191, + "step": 778 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014892370080005936, + "loss": 0.9159, + "step": 779 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014879403742151283, + "loss": 0.7936, + "step": 780 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014866426628324625, + "loss": 0.8782, + "step": 781 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014853438767185412, + "loss": 0.6078, + "step": 782 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001484044018741682, + "loss": 0.7182, + "step": 783 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014827430917725712, + "loss": 0.7528, + "step": 784 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014814410986842543, + "loss": 0.902, + "step": 785 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014801380423521324, + "loss": 0.8765, + "step": 786 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014788339256539544, + "loss": 0.6332, + "step": 787 + }, + { + 
"epoch": 1.81, + "learning_rate": 0.00014775287514698105, + "loss": 0.7258, + "step": 788 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014762225226821273, + "loss": 0.7754, + "step": 789 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014749152421756595, + "loss": 0.7039, + "step": 790 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001473606912837485, + "loss": 0.8563, + "step": 791 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014722975375569978, + "loss": 0.8956, + "step": 792 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014709871192259026, + "loss": 0.8724, + "step": 793 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001469675660738206, + "loss": 0.8885, + "step": 794 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014683631649902132, + "loss": 0.7637, + "step": 795 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014670496348805195, + "loss": 0.7596, + "step": 796 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014657350733100047, + "loss": 0.8221, + "step": 797 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014644194831818266, + "loss": 0.8475, + "step": 798 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014631028674014142, + "loss": 0.7966, + "step": 799 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014617852288764625, + "loss": 0.9186, + "step": 800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014604665705169237, + "loss": 0.9027, + "step": 801 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001459146895235004, + "loss": 0.9357, + "step": 802 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014578262059451537, + "loss": 0.9202, + "step": 803 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014565045055640638, + "loss": 0.9226, + "step": 804 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001455181797010658, + "loss": 0.8416, + "step": 805 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001453858083206086, + "loss": 0.8192, + "step": 806 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001452533367073718, + "loss": 0.8309, + "step": 
807 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014512076515391375, + "loss": 0.7646, + "step": 808 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014498809395301356, + "loss": 0.9335, + "step": 809 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014485532339767037, + "loss": 0.9696, + "step": 810 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014472245378110277, + "loss": 0.7, + "step": 811 + }, + { + "epoch": 1.86, + "learning_rate": 0.000144589485396748, + "loss": 0.8206, + "step": 812 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001444564185382617, + "loss": 0.7417, + "step": 813 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014432325349951667, + "loss": 0.6384, + "step": 814 + }, + { + "epoch": 1.87, + "learning_rate": 0.00014418999057460276, + "loss": 0.7801, + "step": 815 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001440566300578259, + "loss": 0.8459, + "step": 816 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001439231722437075, + "loss": 0.8863, + "step": 817 + }, + { + "epoch": 1.87, + "learning_rate": 0.000143789617426984, + "loss": 0.8502, + "step": 818 + }, + { + "epoch": 1.88, + "learning_rate": 0.000143655965902606, + "loss": 0.8522, + "step": 819 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014352221796573757, + "loss": 0.8612, + "step": 820 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014338837391175582, + "loss": 0.8065, + "step": 821 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001432544340362501, + "loss": 0.8777, + "step": 822 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014312039863502145, + "loss": 0.7731, + "step": 823 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014298626800408166, + "loss": 0.8791, + "step": 824 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014285204243965306, + "loss": 0.9095, + "step": 825 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014271772223816757, + "loss": 0.8846, + "step": 826 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014258330769626606, + "loss": 0.701, + 
"step": 827 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014244879911079779, + "loss": 0.7598, + "step": 828 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014231419677881966, + "loss": 1.0411, + "step": 829 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014217950099759569, + "loss": 0.6915, + "step": 830 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014204471206459628, + "loss": 0.8048, + "step": 831 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001419098302774974, + "loss": 0.7688, + "step": 832 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014177485593418028, + "loss": 0.7863, + "step": 833 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001416397893327304, + "loss": 0.7627, + "step": 834 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014150463077143712, + "loss": 0.7423, + "step": 835 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014136938054879283, + "loss": 0.7236, + "step": 836 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014123403896349227, + "loss": 0.8978, + "step": 837 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014109860631443213, + "loss": 0.9403, + "step": 838 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014096308290071003, + "loss": 0.7267, + "step": 839 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014082746902162414, + "loss": 0.7905, + "step": 840 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014069176497667242, + "loss": 0.8848, + "step": 841 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014055597106555192, + "loss": 0.9057, + "step": 842 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014042008758815818, + "loss": 0.7363, + "step": 843 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014028411484458454, + "loss": 0.8193, + "step": 844 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014014805313512145, + "loss": 0.7387, + "step": 845 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014001190276025593, + "loss": 0.8871, + "step": 846 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001398756640206707, + 
"loss": 0.7342, + "step": 847 + }, + { + "epoch": 1.94, + "learning_rate": 0.00013973933721724363, + "loss": 0.8557, + "step": 848 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001396029226510472, + "loss": 0.8778, + "step": 849 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013946642062334766, + "loss": 0.7844, + "step": 850 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013932983143560433, + "loss": 0.7941, + "step": 851 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013919315538946905, + "loss": 0.7505, + "step": 852 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001390563927867856, + "loss": 0.8371, + "step": 853 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013891954392958878, + "loss": 0.8128, + "step": 854 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001387826091201039, + "loss": 0.7127, + "step": 855 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013864558866074622, + "loss": 0.8165, + "step": 856 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013850848285411994, + "loss": 0.7103, + "step": 857 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013837129200301794, + "loss": 0.8373, + "step": 858 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013823401641042084, + "loss": 0.6908, + "step": 859 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013809665637949637, + "loss": 0.7358, + "step": 860 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013795921221359877, + "loss": 0.7545, + "step": 861 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013782168421626816, + "loss": 0.7681, + "step": 862 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013768407269122967, + "loss": 1.026, + "step": 863 + }, + { + "epoch": 1.98, + "learning_rate": 0.000137546377942393, + "loss": 0.761, + "step": 864 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001374086002738516, + "loss": 0.8442, + "step": 865 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013727073998988202, + "loss": 0.7959, + "step": 866 + }, + { + "epoch": 1.99, + "learning_rate": 
0.00013713279739494333, + "loss": 0.8061, + "step": 867 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013699477279367636, + "loss": 0.7434, + "step": 868 + }, + { + "epoch": 1.99, + "learning_rate": 0.000136856666490903, + "loss": 0.7159, + "step": 869 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013671847879162562, + "loss": 0.867, + "step": 870 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013658021000102636, + "loss": 0.9237, + "step": 871 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001364418604244664, + "loss": 0.8545, + "step": 872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013630343036748535, + "loss": 0.893, + "step": 873 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013616492013580062, + "loss": 0.9858, + "step": 874 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001360263300353066, + "loss": 0.6643, + "step": 875 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001358876603720741, + "loss": 0.8081, + "step": 876 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013574891145234962, + "loss": 0.7287, + "step": 877 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013561008358255468, + "loss": 0.8078, + "step": 878 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001354711770692853, + "loss": 0.6738, + "step": 879 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013533219221931102, + "loss": 0.7508, + "step": 880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001351931293395744, + "loss": 0.8724, + "step": 881 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001350539887371904, + "loss": 0.9317, + "step": 882 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013491477071944557, + "loss": 0.7664, + "step": 883 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013477547559379748, + "loss": 0.8065, + "step": 884 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013463610366787392, + "loss": 0.738, + "step": 885 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013449665524947234, + "loss": 0.7554, + "step": 886 + }, + { + "epoch": 2.03, + 
"learning_rate": 0.00013435713064655912, + "loss": 0.7769, + "step": 887 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013421753016726887, + "loss": 0.6507, + "step": 888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0001340778541199038, + "loss": 0.7293, + "step": 889 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013393810281293292, + "loss": 0.8305, + "step": 890 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013379827655499163, + "loss": 0.7553, + "step": 891 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013365837565488064, + "loss": 0.7724, + "step": 892 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013351840042156565, + "loss": 0.7061, + "step": 893 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013337835116417648, + "loss": 0.7078, + "step": 894 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013323822819200643, + "loss": 0.8201, + "step": 895 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013309803181451156, + "loss": 0.746, + "step": 896 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013295776234131015, + "loss": 0.8276, + "step": 897 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001328174200821817, + "loss": 0.7922, + "step": 898 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001326770053470668, + "loss": 0.7577, + "step": 899 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013253651844606572, + "loss": 0.8217, + "step": 900 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013239595968943832, + "loss": 0.7883, + "step": 901 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013225532938760317, + "loss": 0.9568, + "step": 902 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013211462785113666, + "loss": 0.7348, + "step": 903 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013197385539077275, + "loss": 0.7558, + "step": 904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013183301231740183, + "loss": 0.7066, + "step": 905 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001316920989420703, + "loss": 0.7663, + "step": 906 + }, + { + 
"epoch": 2.08, + "learning_rate": 0.00013155111557597985, + "loss": 0.79, + "step": 907 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013141006253048672, + "loss": 0.8237, + "step": 908 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001312689401171011, + "loss": 0.687, + "step": 909 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013112774864748621, + "loss": 0.8254, + "step": 910 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001309864884334579, + "loss": 0.7641, + "step": 911 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001308451597869839, + "loss": 0.7845, + "step": 912 + }, + { + "epoch": 2.09, + "learning_rate": 0.00013070376302018287, + "loss": 0.8661, + "step": 913 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001305622984453241, + "loss": 0.9001, + "step": 914 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013042076637482654, + "loss": 0.7261, + "step": 915 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013027916712125826, + "loss": 0.7954, + "step": 916 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001301375009973356, + "loss": 0.792, + "step": 917 + }, + { + "epoch": 2.1, + "learning_rate": 0.00012999576831592273, + "loss": 0.8423, + "step": 918 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012985396939003065, + "loss": 0.8529, + "step": 919 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012971210453281674, + "loss": 0.9086, + "step": 920 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012957017405758401, + "loss": 0.7099, + "step": 921 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012942817827778038, + "loss": 0.7515, + "step": 922 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012928611750699783, + "loss": 0.7972, + "step": 923 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001291439920589722, + "loss": 0.6615, + "step": 924 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012900180224758185, + "loss": 0.8229, + "step": 925 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012885954838684743, + "loss": 0.8146, + "step": 926 + }, 
+ { + "epoch": 2.12, + "learning_rate": 0.000128717230790931, + "loss": 0.8941, + "step": 927 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012857484977413545, + "loss": 0.7661, + "step": 928 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012843240565090365, + "loss": 0.7404, + "step": 929 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012828989873581785, + "loss": 0.7971, + "step": 930 + }, + { + "epoch": 2.13, + "learning_rate": 0.000128147329343599, + "loss": 0.6813, + "step": 931 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012800469778910601, + "loss": 0.7704, + "step": 932 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001278620043873351, + "loss": 0.7751, + "step": 933 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012771924945341906, + "loss": 0.841, + "step": 934 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012757643330262657, + "loss": 0.858, + "step": 935 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012743355625036143, + "loss": 0.6657, + "step": 936 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012729061861216213, + "loss": 0.7735, + "step": 937 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012714762070370077, + "loss": 0.8935, + "step": 938 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012700456284078264, + "loss": 0.9684, + "step": 939 + }, + { + "epoch": 2.15, + "learning_rate": 0.0001268614453393454, + "loss": 0.9117, + "step": 940 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012671826851545851, + "loss": 0.7613, + "step": 941 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012657503268532236, + "loss": 0.9567, + "step": 942 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012643173816526764, + "loss": 0.8725, + "step": 943 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012628838527175464, + "loss": 0.8088, + "step": 944 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012614497432137273, + "loss": 0.7655, + "step": 945 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012600150563083927, + "loss": 0.7585, + 
"step": 946 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001258579795169993, + "loss": 0.6351, + "step": 947 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001257143962968246, + "loss": 0.8408, + "step": 948 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012557075628741307, + "loss": 0.7144, + "step": 949 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012542705980598813, + "loss": 0.7022, + "step": 950 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012528330716989769, + "loss": 0.8635, + "step": 951 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001251394986966139, + "loss": 0.8489, + "step": 952 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012499563470373212, + "loss": 0.7563, + "step": 953 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012485171550897037, + "loss": 0.9245, + "step": 954 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012470774143016853, + "loss": 0.9168, + "step": 955 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001245637127852877, + "loss": 0.803, + "step": 956 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012441962989240952, + "loss": 0.722, + "step": 957 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001242754930697354, + "loss": 0.7944, + "step": 958 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012413130263558587, + "loss": 0.7759, + "step": 959 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012398705890839988, + "loss": 0.9407, + "step": 960 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012384276220673402, + "loss": 0.726, + "step": 961 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012369841284926188, + "loss": 0.7817, + "step": 962 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012355401115477345, + "loss": 0.6845, + "step": 963 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012340955744217412, + "loss": 0.7638, + "step": 964 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001232650520304843, + "loss": 0.8104, + "step": 965 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012312049523883852, + "loss": 
0.8676, + "step": 966 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001229758873864848, + "loss": 0.7944, + "step": 967 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012283122879278393, + "loss": 0.8001, + "step": 968 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012268651977720866, + "loss": 0.7943, + "step": 969 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001225417606593433, + "loss": 0.9679, + "step": 970 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012239695175888263, + "loss": 0.773, + "step": 971 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012225209339563145, + "loss": 0.7707, + "step": 972 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012210718588950376, + "loss": 0.6727, + "step": 973 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012196222956052214, + "loss": 0.7641, + "step": 974 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012181722472881697, + "loss": 0.8506, + "step": 975 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012167217171462566, + "loss": 0.8442, + "step": 976 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012152707083829217, + "loss": 0.7853, + "step": 977 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012138192242026614, + "loss": 0.7495, + "step": 978 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001212367267811021, + "loss": 0.739, + "step": 979 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012109148424145898, + "loss": 0.6531, + "step": 980 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012094619512209915, + "loss": 0.7721, + "step": 981 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012080085974388802, + "loss": 0.7346, + "step": 982 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001206554784277931, + "loss": 0.8709, + "step": 983 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012051005149488326, + "loss": 0.8111, + "step": 984 + }, + { + "epoch": 2.26, + "learning_rate": 0.0001203645792663282, + "loss": 0.8296, + "step": 985 + }, + { + "epoch": 2.26, + "learning_rate": 
0.00012021906206339766, + "loss": 0.7569, + "step": 986 + }, + { + "epoch": 2.26, + "learning_rate": 0.00012007350020746068, + "loss": 0.7945, + "step": 987 + }, + { + "epoch": 2.26, + "learning_rate": 0.00011992789401998492, + "loss": 0.7818, + "step": 988 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011978224382253589, + "loss": 0.59, + "step": 989 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011963654993677645, + "loss": 0.828, + "step": 990 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011949081268446571, + "loss": 0.7583, + "step": 991 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011934503238745878, + "loss": 0.7453, + "step": 992 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011919920936770568, + "loss": 0.826, + "step": 993 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011905334394725085, + "loss": 0.7673, + "step": 994 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011890743644823242, + "loss": 0.9637, + "step": 995 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011876148719288128, + "loss": 0.702, + "step": 996 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011861549650352069, + "loss": 0.856, + "step": 997 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011846946470256538, + "loss": 0.725, + "step": 998 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011832339211252084, + "loss": 0.7615, + "step": 999 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011817727905598268, + "loss": 0.7691, + "step": 1000 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011803112585563587, + "loss": 0.8347, + "step": 1001 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011788493283425397, + "loss": 0.908, + "step": 1002 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011773870031469862, + "loss": 0.8724, + "step": 1003 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011759242861991855, + "loss": 0.8801, + "step": 1004 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001174461180729491, + "loss": 0.861, + "step": 1005 + }, + { + "epoch": 2.3, + 
"learning_rate": 0.00011729976899691137, + "loss": 0.8878, + "step": 1006 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011715338171501156, + "loss": 0.7662, + "step": 1007 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011700695655054026, + "loss": 0.7814, + "step": 1008 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011686049382687168, + "loss": 0.8727, + "step": 1009 + }, + { + "epoch": 2.31, + "learning_rate": 0.000116713993867463, + "loss": 0.8036, + "step": 1010 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011656745699585371, + "loss": 0.957, + "step": 1011 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011642088353566469, + "loss": 0.9257, + "step": 1012 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011627427381059772, + "loss": 0.7994, + "step": 1013 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011612762814443459, + "loss": 0.6582, + "step": 1014 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011598094686103653, + "loss": 0.7195, + "step": 1015 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011583423028434344, + "loss": 0.6673, + "step": 1016 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011568747873837307, + "loss": 0.8075, + "step": 1017 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011554069254722051, + "loss": 0.8945, + "step": 1018 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011539387203505727, + "loss": 0.6828, + "step": 1019 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011524701752613074, + "loss": 0.7014, + "step": 1020 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011510012934476338, + "loss": 0.8388, + "step": 1021 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011495320781535186, + "loss": 0.685, + "step": 1022 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011480625326236677, + "loss": 0.7141, + "step": 1023 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011465926601035137, + "loss": 0.8078, + "step": 1024 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011451224638392129, + "loss": 0.7924, + 
"step": 1025 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011436519470776362, + "loss": 0.9223, + "step": 1026 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011421811130663623, + "loss": 0.8251, + "step": 1027 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011407099650536706, + "loss": 0.9127, + "step": 1028 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011392385062885334, + "loss": 0.7634, + "step": 1029 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011377667400206101, + "loss": 0.7472, + "step": 1030 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011362946695002383, + "loss": 0.7838, + "step": 1031 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011348222979784289, + "loss": 0.9502, + "step": 1032 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011333496287068563, + "loss": 0.7066, + "step": 1033 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011318766649378532, + "loss": 0.9988, + "step": 1034 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011304034099244014, + "loss": 0.9448, + "step": 1035 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011289298669201282, + "loss": 0.7764, + "step": 1036 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011274560391792948, + "loss": 0.7351, + "step": 1037 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011259819299567922, + "loss": 0.895, + "step": 1038 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011245075425081328, + "loss": 0.718, + "step": 1039 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011230328800894437, + "loss": 0.7811, + "step": 1040 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001121557945957459, + "loss": 0.7859, + "step": 1041 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011200827433695127, + "loss": 0.7916, + "step": 1042 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011186072755835322, + "loss": 0.8321, + "step": 1043 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011171315458580303, + "loss": 0.7648, + "step": 1044 + }, + { + "epoch": 2.39, + "learning_rate": 
0.00011156555574520981, + "loss": 0.7691, + "step": 1045 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011141793136253986, + "loss": 0.6978, + "step": 1046 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011127028176381578, + "loss": 0.6725, + "step": 1047 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011112260727511596, + "loss": 0.8165, + "step": 1048 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011097490822257377, + "loss": 0.8662, + "step": 1049 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011082718493237669, + "loss": 0.8784, + "step": 1050 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011067943773076586, + "loss": 0.8533, + "step": 1051 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011053166694403521, + "loss": 0.6602, + "step": 1052 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001103838728985307, + "loss": 0.8363, + "step": 1053 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001102360559206497, + "loss": 0.8044, + "step": 1054 + }, + { + "epoch": 2.42, + "learning_rate": 0.00011008821633684019, + "loss": 0.8684, + "step": 1055 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010994035447360018, + "loss": 0.7158, + "step": 1056 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001097924706574767, + "loss": 0.7729, + "step": 1057 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010964456521506545, + "loss": 0.685, + "step": 1058 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010949663847300976, + "loss": 0.8647, + "step": 1059 + }, + { + "epoch": 2.43, + "learning_rate": 0.000109348690758, + "loss": 0.836, + "step": 1060 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010920072239677301, + "loss": 0.8494, + "step": 1061 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010905273371611105, + "loss": 0.9494, + "step": 1062 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010890472504284133, + "loss": 0.7832, + "step": 1063 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010875669670383521, + "loss": 0.7709, + "step": 1064 + }, + { + 
"epoch": 2.44, + "learning_rate": 0.00010860864902600747, + "loss": 0.8175, + "step": 1065 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010846058233631565, + "loss": 0.8179, + "step": 1066 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010831249696175918, + "loss": 0.7686, + "step": 1067 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010816439322937879, + "loss": 0.8491, + "step": 1068 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010801627146625588, + "loss": 0.7961, + "step": 1069 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010786813199951145, + "loss": 0.8408, + "step": 1070 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010771997515630574, + "loss": 0.8916, + "step": 1071 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010757180126383735, + "loss": 0.8035, + "step": 1072 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001074236106493425, + "loss": 0.9132, + "step": 1073 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001072754036400944, + "loss": 0.8029, + "step": 1074 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010712718056340236, + "loss": 0.6981, + "step": 1075 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010697894174661127, + "loss": 0.7829, + "step": 1076 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010683068751710075, + "loss": 0.7699, + "step": 1077 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010668241820228444, + "loss": 0.7342, + "step": 1078 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010653413412960935, + "loss": 0.7729, + "step": 1079 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010638583562655498, + "loss": 0.9097, + "step": 1080 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010623752302063283, + "loss": 0.8692, + "step": 1081 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010608919663938549, + "loss": 0.8861, + "step": 1082 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010594085681038588, + "loss": 0.7454, + "step": 1083 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010579250386123676, + 
"loss": 0.8291, + "step": 1084 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001056441381195698, + "loss": 0.7643, + "step": 1085 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010549575991304492, + "loss": 0.8242, + "step": 1086 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001053473695693496, + "loss": 0.9521, + "step": 1087 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010519896741619803, + "loss": 0.8142, + "step": 1088 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010505055378133067, + "loss": 0.7955, + "step": 1089 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010490212899251309, + "loss": 0.7363, + "step": 1090 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010475369337753569, + "loss": 0.8173, + "step": 1091 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010460524726421275, + "loss": 0.7659, + "step": 1092 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010445679098038157, + "loss": 0.8618, + "step": 1093 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010430832485390217, + "loss": 0.7606, + "step": 1094 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010415984921265609, + "loss": 0.8721, + "step": 1095 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010401136438454599, + "loss": 0.8152, + "step": 1096 + }, + { + "epoch": 2.51, + "learning_rate": 0.0001038628706974948, + "loss": 0.8934, + "step": 1097 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010371436847944503, + "loss": 0.8385, + "step": 1098 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010356585805835797, + "loss": 0.8581, + "step": 1099 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010341733976221313, + "loss": 0.788, + "step": 1100 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010326881391900724, + "loss": 0.7872, + "step": 1101 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010312028085675391, + "loss": 0.819, + "step": 1102 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010297174090348255, + "loss": 0.854, + "step": 1103 + }, + { + "epoch": 2.53, + 
"learning_rate": 0.00010282319438723782, + "loss": 0.7121, + "step": 1104 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010267464163607889, + "loss": 0.8977, + "step": 1105 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010252608297807871, + "loss": 0.8411, + "step": 1106 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010237751874132322, + "loss": 0.834, + "step": 1107 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010222894925391073, + "loss": 0.7582, + "step": 1108 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010208037484395114, + "loss": 0.7773, + "step": 1109 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010193179583956523, + "loss": 0.7294, + "step": 1110 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010178321256888385, + "loss": 0.89, + "step": 1111 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010163462536004742, + "loss": 0.7675, + "step": 1112 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010148603454120487, + "loss": 0.7291, + "step": 1113 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010133744044051328, + "loss": 0.8403, + "step": 1114 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010118884338613688, + "loss": 0.8955, + "step": 1115 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010104024370624644, + "loss": 0.7537, + "step": 1116 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010089164172901851, + "loss": 0.8734, + "step": 1117 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010074303778263474, + "loss": 0.7312, + "step": 1118 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010059443219528117, + "loss": 0.7906, + "step": 1119 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010044582529514739, + "loss": 0.7756, + "step": 1120 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010029721741042586, + "loss": 0.9158, + "step": 1121 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010014860886931139, + "loss": 0.8481, + "step": 1122 + }, + { + "epoch": 2.57, + "learning_rate": 0.0001, + "loss": 0.8187, + "step": 1123 + }, + 
{ + "epoch": 2.58, + "learning_rate": 9.985139113068865e-05, + "loss": 0.8507, + "step": 1124 + }, + { + "epoch": 2.58, + "learning_rate": 9.970278258957415e-05, + "loss": 0.7585, + "step": 1125 + }, + { + "epoch": 2.58, + "learning_rate": 9.955417470485265e-05, + "loss": 0.7163, + "step": 1126 + }, + { + "epoch": 2.58, + "learning_rate": 9.940556780471885e-05, + "loss": 0.8124, + "step": 1127 + }, + { + "epoch": 2.58, + "learning_rate": 9.925696221736525e-05, + "loss": 0.924, + "step": 1128 + }, + { + "epoch": 2.59, + "learning_rate": 9.91083582709815e-05, + "loss": 0.843, + "step": 1129 + }, + { + "epoch": 2.59, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8461, + "step": 1130 + }, + { + "epoch": 2.59, + "learning_rate": 9.881115661386314e-05, + "loss": 0.757, + "step": 1131 + }, + { + "epoch": 2.59, + "learning_rate": 9.866255955948676e-05, + "loss": 0.7779, + "step": 1132 + }, + { + "epoch": 2.6, + "learning_rate": 9.851396545879516e-05, + "loss": 0.8325, + "step": 1133 + }, + { + "epoch": 2.6, + "learning_rate": 9.836537463995262e-05, + "loss": 0.7117, + "step": 1134 + }, + { + "epoch": 2.6, + "learning_rate": 9.821678743111618e-05, + "loss": 0.7209, + "step": 1135 + }, + { + "epoch": 2.6, + "learning_rate": 9.806820416043478e-05, + "loss": 0.6621, + "step": 1136 + }, + { + "epoch": 2.6, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7836, + "step": 1137 + }, + { + "epoch": 2.61, + "learning_rate": 9.777105074608928e-05, + "loss": 0.8576, + "step": 1138 + }, + { + "epoch": 2.61, + "learning_rate": 9.762248125867678e-05, + "loss": 0.6352, + "step": 1139 + }, + { + "epoch": 2.61, + "learning_rate": 9.747391702192132e-05, + "loss": 0.7828, + "step": 1140 + }, + { + "epoch": 2.61, + "learning_rate": 9.732535836392113e-05, + "loss": 0.6583, + "step": 1141 + }, + { + "epoch": 2.62, + "learning_rate": 9.717680561276219e-05, + "loss": 0.9171, + "step": 1142 + }, + { + "epoch": 2.62, + "learning_rate": 9.702825909651748e-05, + "loss": 0.8694, + "step": 
1143 + }, + { + "epoch": 2.62, + "learning_rate": 9.687971914324607e-05, + "loss": 0.9293, + "step": 1144 + }, + { + "epoch": 2.62, + "learning_rate": 9.673118608099276e-05, + "loss": 0.7273, + "step": 1145 + }, + { + "epoch": 2.63, + "learning_rate": 9.658266023778689e-05, + "loss": 0.8386, + "step": 1146 + }, + { + "epoch": 2.63, + "learning_rate": 9.643414194164204e-05, + "loss": 0.727, + "step": 1147 + }, + { + "epoch": 2.63, + "learning_rate": 9.628563152055498e-05, + "loss": 0.9991, + "step": 1148 + }, + { + "epoch": 2.63, + "learning_rate": 9.61371293025052e-05, + "loss": 0.7304, + "step": 1149 + }, + { + "epoch": 2.63, + "learning_rate": 9.598863561545404e-05, + "loss": 0.8146, + "step": 1150 + }, + { + "epoch": 2.64, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8178, + "step": 1151 + }, + { + "epoch": 2.64, + "learning_rate": 9.569167514609786e-05, + "loss": 0.7202, + "step": 1152 + }, + { + "epoch": 2.64, + "learning_rate": 9.554320901961843e-05, + "loss": 0.728, + "step": 1153 + }, + { + "epoch": 2.64, + "learning_rate": 9.539475273578729e-05, + "loss": 0.7842, + "step": 1154 + }, + { + "epoch": 2.65, + "learning_rate": 9.524630662246432e-05, + "loss": 0.7706, + "step": 1155 + }, + { + "epoch": 2.65, + "learning_rate": 9.509787100748692e-05, + "loss": 0.802, + "step": 1156 + }, + { + "epoch": 2.65, + "learning_rate": 9.494944621866937e-05, + "loss": 0.9293, + "step": 1157 + }, + { + "epoch": 2.65, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8051, + "step": 1158 + }, + { + "epoch": 2.66, + "learning_rate": 9.465263043065045e-05, + "loss": 0.7449, + "step": 1159 + }, + { + "epoch": 2.66, + "learning_rate": 9.450424008695509e-05, + "loss": 0.7289, + "step": 1160 + }, + { + "epoch": 2.66, + "learning_rate": 9.43558618804302e-05, + "loss": 0.6778, + "step": 1161 + }, + { + "epoch": 2.66, + "learning_rate": 9.420749613876325e-05, + "loss": 0.7731, + "step": 1162 + }, + { + "epoch": 2.66, + "learning_rate": 9.405914318961414e-05, + "loss": 
0.6934, + "step": 1163 + }, + { + "epoch": 2.67, + "learning_rate": 9.391080336061454e-05, + "loss": 0.9045, + "step": 1164 + }, + { + "epoch": 2.67, + "learning_rate": 9.376247697936719e-05, + "loss": 0.8016, + "step": 1165 + }, + { + "epoch": 2.67, + "learning_rate": 9.361416437344503e-05, + "loss": 0.6214, + "step": 1166 + }, + { + "epoch": 2.67, + "learning_rate": 9.34658658703907e-05, + "loss": 0.6771, + "step": 1167 + }, + { + "epoch": 2.68, + "learning_rate": 9.331758179771561e-05, + "loss": 0.748, + "step": 1168 + }, + { + "epoch": 2.68, + "learning_rate": 9.316931248289926e-05, + "loss": 0.665, + "step": 1169 + }, + { + "epoch": 2.68, + "learning_rate": 9.302105825338876e-05, + "loss": 0.901, + "step": 1170 + }, + { + "epoch": 2.68, + "learning_rate": 9.287281943659767e-05, + "loss": 0.8342, + "step": 1171 + }, + { + "epoch": 2.68, + "learning_rate": 9.272459635990562e-05, + "loss": 0.853, + "step": 1172 + }, + { + "epoch": 2.69, + "learning_rate": 9.257638935065753e-05, + "loss": 0.8093, + "step": 1173 + }, + { + "epoch": 2.69, + "learning_rate": 9.242819873616268e-05, + "loss": 0.8451, + "step": 1174 + }, + { + "epoch": 2.69, + "learning_rate": 9.228002484369429e-05, + "loss": 0.8628, + "step": 1175 + }, + { + "epoch": 2.69, + "learning_rate": 9.213186800048861e-05, + "loss": 0.7858, + "step": 1176 + }, + { + "epoch": 2.7, + "learning_rate": 9.198372853374415e-05, + "loss": 0.9236, + "step": 1177 + }, + { + "epoch": 2.7, + "learning_rate": 9.183560677062119e-05, + "loss": 0.7925, + "step": 1178 + }, + { + "epoch": 2.7, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7105, + "step": 1179 + }, + { + "epoch": 2.7, + "learning_rate": 9.153941766368439e-05, + "loss": 0.7521, + "step": 1180 + }, + { + "epoch": 2.71, + "learning_rate": 9.139135097399254e-05, + "loss": 0.8648, + "step": 1181 + }, + { + "epoch": 2.71, + "learning_rate": 9.124330329616482e-05, + "loss": 0.8409, + "step": 1182 + }, + { + "epoch": 2.71, + "learning_rate": 
9.109527495715872e-05, + "loss": 0.7198, + "step": 1183 + }, + { + "epoch": 2.71, + "learning_rate": 9.094726628388899e-05, + "loss": 0.7365, + "step": 1184 + }, + { + "epoch": 2.71, + "learning_rate": 9.0799277603227e-05, + "loss": 0.7699, + "step": 1185 + }, + { + "epoch": 2.72, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8041, + "step": 1186 + }, + { + "epoch": 2.72, + "learning_rate": 9.050336152699025e-05, + "loss": 0.8308, + "step": 1187 + }, + { + "epoch": 2.72, + "learning_rate": 9.035543478493458e-05, + "loss": 0.8139, + "step": 1188 + }, + { + "epoch": 2.72, + "learning_rate": 9.02075293425233e-05, + "loss": 0.7394, + "step": 1189 + }, + { + "epoch": 2.73, + "learning_rate": 9.005964552639984e-05, + "loss": 0.6738, + "step": 1190 + }, + { + "epoch": 2.73, + "learning_rate": 8.991178366315982e-05, + "loss": 0.9421, + "step": 1191 + }, + { + "epoch": 2.73, + "learning_rate": 8.976394407935034e-05, + "loss": 0.8747, + "step": 1192 + }, + { + "epoch": 2.73, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8282, + "step": 1193 + }, + { + "epoch": 2.74, + "learning_rate": 8.94683330559648e-05, + "loss": 0.765, + "step": 1194 + }, + { + "epoch": 2.74, + "learning_rate": 8.932056226923416e-05, + "loss": 0.8515, + "step": 1195 + }, + { + "epoch": 2.74, + "learning_rate": 8.917281506762335e-05, + "loss": 0.6194, + "step": 1196 + }, + { + "epoch": 2.74, + "learning_rate": 8.902509177742626e-05, + "loss": 0.8852, + "step": 1197 + }, + { + "epoch": 2.74, + "learning_rate": 8.887739272488406e-05, + "loss": 0.7481, + "step": 1198 + }, + { + "epoch": 2.75, + "learning_rate": 8.872971823618424e-05, + "loss": 0.7979, + "step": 1199 + }, + { + "epoch": 2.75, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8332, + "step": 1200 + }, + { + "epoch": 2.75, + "learning_rate": 8.843444425479022e-05, + "loss": 0.6716, + "step": 1201 + }, + { + "epoch": 2.75, + "learning_rate": 8.828684541419696e-05, + "loss": 0.9192, + "step": 1202 + }, + { + "epoch": 2.76, + 
"learning_rate": 8.813927244164679e-05, + "loss": 0.8463, + "step": 1203 + }, + { + "epoch": 2.76, + "learning_rate": 8.799172566304874e-05, + "loss": 0.6598, + "step": 1204 + }, + { + "epoch": 2.76, + "learning_rate": 8.784420540425412e-05, + "loss": 0.7823, + "step": 1205 + }, + { + "epoch": 2.76, + "learning_rate": 8.769671199105565e-05, + "loss": 0.8728, + "step": 1206 + }, + { + "epoch": 2.77, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7665, + "step": 1207 + }, + { + "epoch": 2.77, + "learning_rate": 8.74018070043208e-05, + "loss": 0.8008, + "step": 1208 + }, + { + "epoch": 2.77, + "learning_rate": 8.725439608207056e-05, + "loss": 0.6833, + "step": 1209 + }, + { + "epoch": 2.77, + "learning_rate": 8.710701330798719e-05, + "loss": 0.7801, + "step": 1210 + }, + { + "epoch": 2.77, + "learning_rate": 8.695965900755985e-05, + "loss": 0.6308, + "step": 1211 + }, + { + "epoch": 2.78, + "learning_rate": 8.68123335062147e-05, + "loss": 0.7851, + "step": 1212 + }, + { + "epoch": 2.78, + "learning_rate": 8.666503712931439e-05, + "loss": 0.7592, + "step": 1213 + }, + { + "epoch": 2.78, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8727, + "step": 1214 + }, + { + "epoch": 2.78, + "learning_rate": 8.637053304997618e-05, + "loss": 0.903, + "step": 1215 + }, + { + "epoch": 2.79, + "learning_rate": 8.622332599793906e-05, + "loss": 0.8076, + "step": 1216 + }, + { + "epoch": 2.79, + "learning_rate": 8.607614937114671e-05, + "loss": 0.8975, + "step": 1217 + }, + { + "epoch": 2.79, + "learning_rate": 8.592900349463297e-05, + "loss": 0.8249, + "step": 1218 + }, + { + "epoch": 2.79, + "learning_rate": 8.578188869336377e-05, + "loss": 0.8529, + "step": 1219 + }, + { + "epoch": 2.79, + "learning_rate": 8.563480529223638e-05, + "loss": 0.8351, + "step": 1220 + }, + { + "epoch": 2.8, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8934, + "step": 1221 + }, + { + "epoch": 2.8, + "learning_rate": 8.534073398964866e-05, + "loss": 0.8067, + "step": 1222 + }, + { + 
"epoch": 2.8, + "learning_rate": 8.519374673763326e-05, + "loss": 0.8508, + "step": 1223 + }, + { + "epoch": 2.8, + "learning_rate": 8.504679218464816e-05, + "loss": 0.7419, + "step": 1224 + }, + { + "epoch": 2.81, + "learning_rate": 8.489987065523668e-05, + "loss": 0.7808, + "step": 1225 + }, + { + "epoch": 2.81, + "learning_rate": 8.475298247386927e-05, + "loss": 0.8603, + "step": 1226 + }, + { + "epoch": 2.81, + "learning_rate": 8.460612796494272e-05, + "loss": 0.8818, + "step": 1227 + }, + { + "epoch": 2.81, + "learning_rate": 8.445930745277953e-05, + "loss": 0.779, + "step": 1228 + }, + { + "epoch": 2.82, + "learning_rate": 8.431252126162695e-05, + "loss": 0.766, + "step": 1229 + }, + { + "epoch": 2.82, + "learning_rate": 8.41657697156566e-05, + "loss": 0.8743, + "step": 1230 + }, + { + "epoch": 2.82, + "learning_rate": 8.40190531389635e-05, + "loss": 0.882, + "step": 1231 + }, + { + "epoch": 2.82, + "learning_rate": 8.387237185556545e-05, + "loss": 0.7422, + "step": 1232 + }, + { + "epoch": 2.82, + "learning_rate": 8.372572618940231e-05, + "loss": 0.9271, + "step": 1233 + }, + { + "epoch": 2.83, + "learning_rate": 8.357911646433535e-05, + "loss": 0.8051, + "step": 1234 + }, + { + "epoch": 2.83, + "learning_rate": 8.343254300414628e-05, + "loss": 0.782, + "step": 1235 + }, + { + "epoch": 2.83, + "learning_rate": 8.3286006132537e-05, + "loss": 0.8754, + "step": 1236 + }, + { + "epoch": 2.83, + "learning_rate": 8.313950617312835e-05, + "loss": 0.8249, + "step": 1237 + }, + { + "epoch": 2.84, + "learning_rate": 8.299304344945977e-05, + "loss": 0.8342, + "step": 1238 + }, + { + "epoch": 2.84, + "learning_rate": 8.284661828498847e-05, + "loss": 0.8593, + "step": 1239 + }, + { + "epoch": 2.84, + "learning_rate": 8.270023100308865e-05, + "loss": 0.7507, + "step": 1240 + }, + { + "epoch": 2.84, + "learning_rate": 8.255388192705093e-05, + "loss": 0.8462, + "step": 1241 + }, + { + "epoch": 2.85, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8322, + "step": 1242 
+ }, + { + "epoch": 2.85, + "learning_rate": 8.22612996853014e-05, + "loss": 0.8963, + "step": 1243 + }, + { + "epoch": 2.85, + "learning_rate": 8.211506716574602e-05, + "loss": 0.7419, + "step": 1244 + }, + { + "epoch": 2.85, + "learning_rate": 8.196887414436416e-05, + "loss": 0.8225, + "step": 1245 + }, + { + "epoch": 2.85, + "learning_rate": 8.182272094401735e-05, + "loss": 0.8539, + "step": 1246 + }, + { + "epoch": 2.86, + "learning_rate": 8.167660788747919e-05, + "loss": 0.7852, + "step": 1247 + }, + { + "epoch": 2.86, + "learning_rate": 8.153053529743465e-05, + "loss": 0.9128, + "step": 1248 + }, + { + "epoch": 2.86, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7328, + "step": 1249 + }, + { + "epoch": 2.86, + "learning_rate": 8.123851280711877e-05, + "loss": 0.8816, + "step": 1250 + }, + { + "epoch": 2.87, + "learning_rate": 8.10925635517676e-05, + "loss": 0.7267, + "step": 1251 + }, + { + "epoch": 2.87, + "learning_rate": 8.094665605274913e-05, + "loss": 0.7362, + "step": 1252 + }, + { + "epoch": 2.87, + "learning_rate": 8.080079063229432e-05, + "loss": 0.7475, + "step": 1253 + }, + { + "epoch": 2.87, + "learning_rate": 8.065496761254126e-05, + "loss": 0.7727, + "step": 1254 + }, + { + "epoch": 2.88, + "learning_rate": 8.050918731553431e-05, + "loss": 0.746, + "step": 1255 + }, + { + "epoch": 2.88, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8132, + "step": 1256 + }, + { + "epoch": 2.88, + "learning_rate": 8.021775617746412e-05, + "loss": 0.6752, + "step": 1257 + }, + { + "epoch": 2.88, + "learning_rate": 8.007210598001512e-05, + "loss": 0.7468, + "step": 1258 + }, + { + "epoch": 2.88, + "learning_rate": 7.992649979253934e-05, + "loss": 0.9141, + "step": 1259 + }, + { + "epoch": 2.89, + "learning_rate": 7.978093793660233e-05, + "loss": 0.7706, + "step": 1260 + }, + { + "epoch": 2.89, + "learning_rate": 7.963542073367181e-05, + "loss": 0.8399, + "step": 1261 + }, + { + "epoch": 2.89, + "learning_rate": 7.948994850511677e-05, + "loss": 
0.834, + "step": 1262 + }, + { + "epoch": 2.89, + "learning_rate": 7.934452157220694e-05, + "loss": 0.767, + "step": 1263 + }, + { + "epoch": 2.9, + "learning_rate": 7.9199140256112e-05, + "loss": 0.75, + "step": 1264 + }, + { + "epoch": 2.9, + "learning_rate": 7.905380487790088e-05, + "loss": 0.81, + "step": 1265 + }, + { + "epoch": 2.9, + "learning_rate": 7.890851575854108e-05, + "loss": 0.8931, + "step": 1266 + }, + { + "epoch": 2.9, + "learning_rate": 7.876327321889795e-05, + "loss": 0.8929, + "step": 1267 + }, + { + "epoch": 2.9, + "learning_rate": 7.861807757973387e-05, + "loss": 0.787, + "step": 1268 + }, + { + "epoch": 2.91, + "learning_rate": 7.847292916170784e-05, + "loss": 0.8072, + "step": 1269 + }, + { + "epoch": 2.91, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8121, + "step": 1270 + }, + { + "epoch": 2.91, + "learning_rate": 7.818277527118307e-05, + "loss": 0.7951, + "step": 1271 + }, + { + "epoch": 2.91, + "learning_rate": 7.803777043947789e-05, + "loss": 0.7093, + "step": 1272 + }, + { + "epoch": 2.92, + "learning_rate": 7.789281411049625e-05, + "loss": 0.7827, + "step": 1273 + }, + { + "epoch": 2.92, + "learning_rate": 7.774790660436858e-05, + "loss": 0.7433, + "step": 1274 + }, + { + "epoch": 2.92, + "learning_rate": 7.760304824111741e-05, + "loss": 0.7359, + "step": 1275 + }, + { + "epoch": 2.92, + "learning_rate": 7.745823934065671e-05, + "loss": 0.7157, + "step": 1276 + }, + { + "epoch": 2.93, + "learning_rate": 7.731348022279134e-05, + "loss": 0.961, + "step": 1277 + }, + { + "epoch": 2.93, + "learning_rate": 7.716877120721611e-05, + "loss": 0.7718, + "step": 1278 + }, + { + "epoch": 2.93, + "learning_rate": 7.702411261351523e-05, + "loss": 0.835, + "step": 1279 + }, + { + "epoch": 2.93, + "learning_rate": 7.68795047611615e-05, + "loss": 0.9129, + "step": 1280 + }, + { + "epoch": 2.93, + "learning_rate": 7.673494796951573e-05, + "loss": 0.7635, + "step": 1281 + }, + { + "epoch": 2.94, + "learning_rate": 7.659044255782593e-05, + 
"loss": 0.6873, + "step": 1282 + }, + { + "epoch": 2.94, + "learning_rate": 7.644598884522659e-05, + "loss": 0.6434, + "step": 1283 + }, + { + "epoch": 2.94, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8408, + "step": 1284 + }, + { + "epoch": 2.94, + "learning_rate": 7.615723779326599e-05, + "loss": 0.9042, + "step": 1285 + }, + { + "epoch": 2.95, + "learning_rate": 7.601294109160012e-05, + "loss": 0.7996, + "step": 1286 + }, + { + "epoch": 2.95, + "learning_rate": 7.586869736441413e-05, + "loss": 0.923, + "step": 1287 + }, + { + "epoch": 2.95, + "learning_rate": 7.572450693026462e-05, + "loss": 0.7661, + "step": 1288 + }, + { + "epoch": 2.95, + "learning_rate": 7.55803701075905e-05, + "loss": 0.9105, + "step": 1289 + }, + { + "epoch": 2.96, + "learning_rate": 7.543628721471233e-05, + "loss": 0.8071, + "step": 1290 + }, + { + "epoch": 2.96, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8234, + "step": 1291 + }, + { + "epoch": 2.96, + "learning_rate": 7.514828449102966e-05, + "loss": 0.8131, + "step": 1292 + }, + { + "epoch": 2.96, + "learning_rate": 7.500436529626786e-05, + "loss": 0.8149, + "step": 1293 + }, + { + "epoch": 2.96, + "learning_rate": 7.486050130338612e-05, + "loss": 0.8441, + "step": 1294 + }, + { + "epoch": 2.97, + "learning_rate": 7.471669283010232e-05, + "loss": 0.8269, + "step": 1295 + }, + { + "epoch": 2.97, + "learning_rate": 7.457294019401191e-05, + "loss": 0.632, + "step": 1296 + }, + { + "epoch": 2.97, + "learning_rate": 7.442924371258694e-05, + "loss": 0.8522, + "step": 1297 + }, + { + "epoch": 2.97, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8387, + "step": 1298 + }, + { + "epoch": 2.98, + "learning_rate": 7.414202048300072e-05, + "loss": 0.887, + "step": 1299 + }, + { + "epoch": 2.98, + "learning_rate": 7.399849436916077e-05, + "loss": 0.8273, + "step": 1300 + }, + { + "epoch": 2.98, + "learning_rate": 7.385502567862728e-05, + "loss": 0.7807, + "step": 1301 + }, + { + "epoch": 2.98, + "learning_rate": 
7.371161472824536e-05, + "loss": 0.9077, + "step": 1302 + }, + { + "epoch": 2.99, + "learning_rate": 7.35682618347324e-05, + "loss": 0.8779, + "step": 1303 + }, + { + "epoch": 2.99, + "learning_rate": 7.342496731467767e-05, + "loss": 0.8595, + "step": 1304 + }, + { + "epoch": 2.99, + "learning_rate": 7.328173148454151e-05, + "loss": 0.8391, + "step": 1305 + }, + { + "epoch": 2.99, + "learning_rate": 7.31385546606546e-05, + "loss": 0.7559, + "step": 1306 + }, + { + "epoch": 2.99, + "learning_rate": 7.29954371592174e-05, + "loss": 0.8926, + "step": 1307 + }, + { + "epoch": 3.0, + "learning_rate": 7.285237929629928e-05, + "loss": 0.8443, + "step": 1308 + }, + { + "epoch": 3.0, + "learning_rate": 7.27093813878379e-05, + "loss": 0.7854, + "step": 1309 + } + ], + "max_steps": 2180, + "num_train_epochs": 5, + "total_flos": 376480566673408.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1309/training_args.bin b/checkpoint-1309/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4aa0907a784d65549a9c45257c4d455176479607 --- /dev/null +++ b/checkpoint-1309/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adff180a74f6fc1e6a420417eadde6ef8ff75561e442f481bfe772c93f46e2ae +size 6011 diff --git a/checkpoint-1309/zero_to_fp32.py b/checkpoint-1309/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-1309/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) 
+ + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = 
torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def 
_zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + 
[full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + 
print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in 
param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + 
state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-1746/README.md b/checkpoint-1746/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/checkpoint-1746/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-1746/adapter_config.json b/checkpoint-1746/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a764b53e4dc8b17af932aa1de32ced6a340469f0 --- /dev/null +++ b/checkpoint-1746/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-1746/adapter_model.bin b/checkpoint-1746/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..52a08033ec883b4e9598d4382d324c480e093684 --- /dev/null +++ b/checkpoint-1746/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:906dc5c5bb99eaf950366c88ecc2b749fa63808e3421dfff005a97cfc43d7467 +size 639786637 diff --git a/checkpoint-1746/global_step1746/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..0d93ff473666fc124fc990d242db2ca83d302fba --- /dev/null +++ b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c9d97360081a1c77e0e26a621dfde3c4eaafe0f0fe9f247ac7f38b9a7a9153 +size 1022391865 diff --git a/checkpoint-1746/global_step1746/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7963c98bbe7a42a7244591495ab021fd343dba6a --- /dev/null +++ b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ebfe6bee35d30b30348f4763266b3769ea362ff3e8dab64e5730acb127024a5 +size 1022391865 diff --git a/checkpoint-1746/global_step1746/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e56f53e2383f706ef331d7620aae52bc2222eba --- /dev/null +++ b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db91430a53ed84e3401608c0ceed99ec251203d736c6afe4607e80e4d81d8625 +size 1022391865 diff --git a/checkpoint-1746/global_step1746/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..032e3c400bbbbd5e75e88255479030aa8d6e98ec --- /dev/null +++ b/checkpoint-1746/global_step1746/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:539b6c247d1888ee9fa27880939add813e52ac80223b6b14804a35da0ce127a0 +size 1022391865 diff --git 
a/checkpoint-1746/global_step1746/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1746/global_step1746/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebe0cd4180102718ffdfc32ae93b0c07a1dbbf2a --- /dev/null +++ b/checkpoint-1746/global_step1746/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5cc36425859d1dfede9065f86e70c3e1d156db8a55f5a09575fc95f36610ab +size 3521982567 diff --git a/checkpoint-1746/global_step1746/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1746/global_step1746/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7ecfab088ed0c8afbbae3739a113c4e0f6a4e7d --- /dev/null +++ b/checkpoint-1746/global_step1746/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86ff2820f48b87fd999eec2965eafa69597d2dd41d2ce56b26403b70672670e +size 3521982567 diff --git a/checkpoint-1746/global_step1746/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1746/global_step1746/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f5d517d80d02c14c2b9cf9fbfadc607fb72e17f --- /dev/null +++ b/checkpoint-1746/global_step1746/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d32b7c9466d83edc137c558dc8e0ef7c82d9d477260df0ff1fdaa45e3a161b5 +size 3521982567 diff --git a/checkpoint-1746/global_step1746/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1746/global_step1746/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e1f55af0f1f551a51c4815b0265def7eb4beee0 --- /dev/null +++ b/checkpoint-1746/global_step1746/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:120b9333089113ddf8d34d810ca3692f96850750f47d1e222a06b08d86032133 +size 3521982567 diff --git a/checkpoint-1746/latest b/checkpoint-1746/latest new file mode 100644 index 0000000000000000000000000000000000000000..f5339b0f6163aa1dc5f8815ef5488a731634d25f --- /dev/null +++ b/checkpoint-1746/latest @@ -0,0 +1 @@ +global_step1746 \ No newline at end of file diff --git a/checkpoint-1746/rng_state_0.pth b/checkpoint-1746/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..357edc7520d402df922e0590879124786d43ec17 --- /dev/null +++ b/checkpoint-1746/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e9fa6cf96314126076a9547cf00080882b414a358296a5be199ea573ee239f +size 17655 diff --git a/checkpoint-1746/rng_state_1.pth b/checkpoint-1746/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4fc323abd4f09079a65a950856cc182097896ac7 --- /dev/null +++ b/checkpoint-1746/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb90645d323690b0d53453198a1d0c7028a388f50fc320c6c96a3239192f1c1 +size 17655 diff --git a/checkpoint-1746/rng_state_2.pth b/checkpoint-1746/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b86b6df40ae9ebf26b843091c59c4b5c7fc680e --- /dev/null +++ b/checkpoint-1746/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23ec3f444edfe4d334010a9002aa8822e55c419392d1095f941447b506e0407f +size 17655 diff --git a/checkpoint-1746/rng_state_3.pth b/checkpoint-1746/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b67bf229ed264bc02bc397b3a99f1ed13f25a2a --- /dev/null +++ b/checkpoint-1746/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6db49e99bf02e53a4bd3a2d8426c53e725350c17dc3560a07564661f5921507a +size 17655 diff --git a/checkpoint-1746/special_tokens_map.json 
b/checkpoint-1746/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-1746/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1746/tokenizer.model b/checkpoint-1746/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-1746/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-1746/tokenizer_config.json b/checkpoint-1746/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..740756b4bef305e27d0bb4d2e1a40dd8847797f7 --- /dev/null +++ b/checkpoint-1746/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 2048, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff 
--git a/checkpoint-1746/trainer_state.json b/checkpoint-1746/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6f145f6992128ede7960a9569b0c22559b08f893 --- /dev/null +++ b/checkpoint-1746/trainer_state.json @@ -0,0 +1,10492 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "global_step": 1746, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.946, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.060606060606061e-06, + "loss": 1.908, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1083, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.2121212121212122e-05, + "loss": 2.3218, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.8338, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.1212121212121215e-05, + "loss": 2.1332, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.8593, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.5359, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7252, + "step": 11 + }, + { + "epoch": 0.03, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.4351, + "step": 12 + }, + { + "epoch": 0.03, + "learning_rate": 3.939393939393939e-05, + "loss": 1.2774, + "step": 13 + }, + { + "epoch": 0.03, + "learning_rate": 4.242424242424243e-05, + "loss": 1.5145, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1529, + "step": 15 + }, + { + "epoch": 0.04, + 
"learning_rate": 4.848484848484849e-05, + "loss": 1.0047, + "step": 16 + }, + { + "epoch": 0.04, + "learning_rate": 5.151515151515152e-05, + "loss": 1.3872, + "step": 17 + }, + { + "epoch": 0.04, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1229, + "step": 18 + }, + { + "epoch": 0.04, + "learning_rate": 5.757575757575758e-05, + "loss": 1.3386, + "step": 19 + }, + { + "epoch": 0.05, + "learning_rate": 6.060606060606061e-05, + "loss": 1.2493, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1427, + "step": 21 + }, + { + "epoch": 0.05, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0895, + "step": 22 + }, + { + "epoch": 0.05, + "learning_rate": 6.96969696969697e-05, + "loss": 1.1989, + "step": 23 + }, + { + "epoch": 0.05, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0438, + "step": 24 + }, + { + "epoch": 0.06, + "learning_rate": 7.575757575757576e-05, + "loss": 1.176, + "step": 25 + }, + { + "epoch": 0.06, + "learning_rate": 7.878787878787879e-05, + "loss": 1.1372, + "step": 26 + }, + { + "epoch": 0.06, + "learning_rate": 8.181818181818183e-05, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.06, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9371, + "step": 28 + }, + { + "epoch": 0.07, + "learning_rate": 8.787878787878789e-05, + "loss": 1.2299, + "step": 29 + }, + { + "epoch": 0.07, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9441, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0011, + "step": 31 + }, + { + "epoch": 0.07, + "learning_rate": 9.696969696969698e-05, + "loss": 1.1704, + "step": 32 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 1.1193, + "step": 33 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010303030303030303, + "loss": 1.1559, + "step": 34 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010606060606060606, + "loss": 0.8677, + "step": 35 + }, + { + "epoch": 0.08, + "learning_rate": 
0.00010909090909090909, + "loss": 1.0865, + "step": 36 + }, + { + "epoch": 0.08, + "learning_rate": 0.00011212121212121212, + "loss": 1.0922, + "step": 37 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011515151515151516, + "loss": 0.9434, + "step": 38 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001181818181818182, + "loss": 0.9144, + "step": 39 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012121212121212122, + "loss": 0.9546, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012424242424242425, + "loss": 1.0654, + "step": 41 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012727272727272728, + "loss": 0.8077, + "step": 42 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001303030303030303, + "loss": 1.0758, + "step": 43 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013333333333333334, + "loss": 1.1512, + "step": 44 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013636363636363637, + "loss": 0.84, + "step": 45 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001393939393939394, + "loss": 1.0567, + "step": 46 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014242424242424243, + "loss": 1.0165, + "step": 47 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014545454545454546, + "loss": 0.8678, + "step": 48 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014848484848484849, + "loss": 1.055, + "step": 49 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015151515151515152, + "loss": 1.0669, + "step": 50 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015454545454545454, + "loss": 0.9915, + "step": 51 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015757575757575757, + "loss": 0.993, + "step": 52 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001606060606060606, + "loss": 1.1085, + "step": 53 + }, + { + "epoch": 0.12, + "learning_rate": 0.00016363636363636366, + "loss": 0.9391, + "step": 54 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001666666666666667, + "loss": 0.975, + "step": 55 + }, + { + "epoch": 0.13, + "learning_rate": 
0.00016969696969696972, + "loss": 1.0697, + "step": 56 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017272727272727275, + "loss": 0.9462, + "step": 57 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017575757575757578, + "loss": 1.1209, + "step": 58 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001787878787878788, + "loss": 1.0648, + "step": 59 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018181818181818183, + "loss": 0.9964, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018484848484848484, + "loss": 0.8451, + "step": 61 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001878787878787879, + "loss": 0.8437, + "step": 62 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019090909090909092, + "loss": 1.1271, + "step": 63 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019393939393939395, + "loss": 1.161, + "step": 64 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019696969696969698, + "loss": 1.0032, + "step": 65 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999988957695886, + "loss": 0.9543, + "step": 67 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999955830807923, + "loss": 1.0274, + "step": 68 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999900619409279, + "loss": 0.9334, + "step": 69 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999982332362188, + "loss": 1.0398, + "step": 70 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999723943616433, + "loss": 0.9049, + "step": 71 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999602479612417, + "loss": 0.7452, + "step": 72 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999458931878073, + "loss": 0.8762, + "step": 73 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999293300730427, + "loss": 1.0941, + "step": 74 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999105586535268, + "loss": 0.7713, + "step": 75 + }, + { + "epoch": 0.17, + "learning_rate": 
0.00019998895789707154, + "loss": 0.9233, + "step": 76 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998663910709416, + "loss": 0.8634, + "step": 77 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998409950054146, + "loss": 0.9697, + "step": 78 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998133908302209, + "loss": 1.0816, + "step": 79 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001999783578606323, + "loss": 0.9659, + "step": 80 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997515583995603, + "loss": 0.9644, + "step": 81 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997173302806478, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996808943251773, + "loss": 1.0016, + "step": 83 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999642250613616, + "loss": 0.8951, + "step": 84 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996013992313073, + "loss": 1.0157, + "step": 85 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995583402684694, + "loss": 0.9414, + "step": 86 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995130738201966, + "loss": 0.8097, + "step": 87 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019994655999864582, + "loss": 0.8606, + "step": 88 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001999415918872098, + "loss": 1.0427, + "step": 89 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993640305868352, + "loss": 0.9578, + "step": 90 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993099352452623, + "loss": 1.1097, + "step": 91 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019992536329668478, + "loss": 0.8119, + "step": 92 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019991951238759325, + "loss": 0.9915, + "step": 93 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001999134408101731, + "loss": 0.838, + "step": 94 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990714857783326, + "loss": 0.8935, + "step": 95 + }, + { + "epoch": 0.22, + "learning_rate": 
0.00019990063570446984, + "loss": 0.7914, + "step": 96 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019989390220446622, + "loss": 0.8724, + "step": 97 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019988694809269314, + "loss": 1.0374, + "step": 98 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987977338450845, + "loss": 0.9028, + "step": 99 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987237809575723, + "loss": 0.9986, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019986476224277165, + "loss": 1.113, + "step": 101 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985692584237108, + "loss": 0.8395, + "step": 102 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019984886891186184, + "loss": 1.0134, + "step": 103 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001998405914690374, + "loss": 0.8845, + "step": 104 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019983209353217812, + "loss": 0.7507, + "step": 105 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019982337512005138, + "loss": 0.9073, + "step": 106 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019981443625191148, + "loss": 0.9973, + "step": 107 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019980527694749952, + "loss": 1.0733, + "step": 108 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019979589722704346, + "loss": 0.9148, + "step": 109 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019978629711125812, + "loss": 0.8385, + "step": 110 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019977647662134488, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019976643577899195, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019975617460637416, + "loss": 0.8754, + "step": 113 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001997456931261529, + "loss": 0.8886, + "step": 114 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019973499136147606, + "loss": 1.0058, + "step": 115 + }, + { + "epoch": 0.27, + 
"learning_rate": 0.00019972406933597812, + "loss": 0.9276, + "step": 116 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019971292707377991, + "loss": 0.9922, + "step": 117 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019970156459948873, + "loss": 0.9507, + "step": 118 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996899819381981, + "loss": 0.9619, + "step": 119 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019967817911548794, + "loss": 0.8163, + "step": 120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019966615615742424, + "loss": 1.0647, + "step": 121 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996539130905593, + "loss": 0.9348, + "step": 122 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019964144994193142, + "loss": 1.0523, + "step": 123 + }, + { + "epoch": 0.28, + "learning_rate": 0.000199628766739065, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019961586350997033, + "loss": 1.0227, + "step": 125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001996027402831438, + "loss": 1.006, + "step": 126 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958939708756746, + "loss": 0.9082, + "step": 127 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957583395270923, + "loss": 0.8756, + "step": 128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995620509085228, + "loss": 0.8311, + "step": 129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954804798544745, + "loss": 1.0332, + "step": 130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019953382521440815, + "loss": 0.9427, + "step": 131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019951938262681527, + "loss": 0.838, + "step": 132 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995047202545647, + "loss": 0.8509, + "step": 133 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019948983813003774, + "loss": 0.8944, + "step": 134 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019947473628610099, + "loss": 0.9569, + "step": 135 + }, + { + "epoch": 
0.31, + "learning_rate": 0.00019945941475610623, + "loss": 0.7805, + "step": 136 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019944387357389052, + "loss": 0.9337, + "step": 137 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994281127737759, + "loss": 0.8712, + "step": 138 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994121323905695, + "loss": 0.9264, + "step": 139 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001993959324595634, + "loss": 0.9323, + "step": 140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019937951301653444, + "loss": 0.8331, + "step": 141 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993628740977444, + "loss": 0.902, + "step": 142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993460157399396, + "loss": 0.8676, + "step": 143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019932893798035116, + "loss": 0.8525, + "step": 144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019931164085669456, + "loss": 0.8571, + "step": 145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019929412440716985, + "loss": 1.0006, + "step": 146 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019927638867046142, + "loss": 0.9849, + "step": 147 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019925843368573794, + "loss": 0.9064, + "step": 148 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992402594926523, + "loss": 0.9716, + "step": 149 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992218661313415, + "loss": 0.7553, + "step": 150 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019920325364242654, + "loss": 0.7921, + "step": 151 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019918442206701245, + "loss": 0.7994, + "step": 152 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001991653714466879, + "loss": 0.8296, + "step": 153 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019914610182352548, + "loss": 0.8116, + "step": 154 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019912661324008148, + "loss": 0.9844, + "step": 155 + }, + { 
+ "epoch": 0.36, + "learning_rate": 0.00019910690573939557, + "loss": 0.865, + "step": 156 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019908697936499103, + "loss": 0.959, + "step": 157 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019906683416087448, + "loss": 0.7727, + "step": 158 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019904647017153582, + "loss": 0.707, + "step": 159 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019902588744194813, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019900508601756756, + "loss": 0.9146, + "step": 161 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989840659443332, + "loss": 0.9571, + "step": 162 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989628272686671, + "loss": 0.8537, + "step": 163 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019894137003747403, + "loss": 0.828, + "step": 164 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019891969429814145, + "loss": 0.8055, + "step": 165 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988978000985394, + "loss": 0.8432, + "step": 166 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988756874870203, + "loss": 0.8101, + "step": 167 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019885335651241903, + "loss": 0.9072, + "step": 168 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001988308072240527, + "loss": 0.7862, + "step": 169 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019880803967172047, + "loss": 0.8303, + "step": 170 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019878505390570362, + "loss": 0.9489, + "step": 171 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001987618499767653, + "loss": 1.0125, + "step": 172 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001987384279361505, + "loss": 0.809, + "step": 173 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019871478783558587, + "loss": 0.9488, + "step": 174 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986909297272796, + "loss": 0.9664, + "step": 175 + }, 
+ { + "epoch": 0.4, + "learning_rate": 0.0001986668536639215, + "loss": 0.9657, + "step": 176 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001986425596986825, + "loss": 0.8123, + "step": 177 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019861804788521493, + "loss": 0.9482, + "step": 178 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019859331827765212, + "loss": 0.879, + "step": 179 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019856837093060848, + "loss": 0.896, + "step": 180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019854320589917927, + "loss": 1.0729, + "step": 181 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019851782323894042, + "loss": 0.9844, + "step": 182 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001984922230059486, + "loss": 0.9131, + "step": 183 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019846640525674082, + "loss": 0.9417, + "step": 184 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019844037004833473, + "loss": 0.9633, + "step": 185 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001984141174382279, + "loss": 0.968, + "step": 186 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019838764748439827, + "loss": 0.8447, + "step": 187 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019836096024530373, + "loss": 0.8638, + "step": 188 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019833405577988195, + "loss": 0.9346, + "step": 189 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001983069341475504, + "loss": 0.8969, + "step": 190 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019827959540820613, + "loss": 0.8499, + "step": 191 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019825203962222572, + "loss": 0.8041, + "step": 192 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019822426685046497, + "loss": 0.9216, + "step": 193 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019819627715425903, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198168070595422, + "loss": 0.8969, + "step": 
195 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198139647236247, + "loss": 0.7949, + "step": 196 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019811100713950587, + "loss": 0.8996, + "step": 197 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019808215036844917, + "loss": 0.9118, + "step": 198 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001980530769868059, + "loss": 0.7355, + "step": 199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019802378705878354, + "loss": 0.8344, + "step": 200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019799428064906774, + "loss": 0.9639, + "step": 201 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001979645578228222, + "loss": 0.852, + "step": 202 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001979346186456887, + "loss": 0.8493, + "step": 203 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019790446318378665, + "loss": 0.851, + "step": 204 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019787409150371328, + "loss": 0.7161, + "step": 205 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019784350367254322, + "loss": 0.9846, + "step": 206 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001978126997578285, + "loss": 0.7883, + "step": 207 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019778167982759833, + "loss": 0.8691, + "step": 208 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019775044395035907, + "loss": 0.928, + "step": 209 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001977189921950939, + "loss": 0.8244, + "step": 210 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001976873246312628, + "loss": 1.0413, + "step": 211 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976554413288023, + "loss": 0.8261, + "step": 212 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976233423581255, + "loss": 0.823, + "step": 213 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019759102779012166, + "loss": 0.9386, + "step": 214 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019755849769615628, + "loss": 0.8156, + 
"step": 215 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019752575214807076, + "loss": 0.8556, + "step": 216 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019749279121818235, + "loss": 0.7769, + "step": 217 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019745961497928406, + "loss": 1.0772, + "step": 218 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019742622350464418, + "loss": 0.8147, + "step": 219 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001973926168680066, + "loss": 0.9529, + "step": 220 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019735879514359018, + "loss": 0.8688, + "step": 221 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019732475840608888, + "loss": 0.9647, + "step": 222 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019729050673067156, + "loss": 0.837, + "step": 223 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019725604019298163, + "loss": 0.9211, + "step": 224 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019722135886913715, + "loss": 0.9434, + "step": 225 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001971864628357304, + "loss": 0.6506, + "step": 226 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019715135216982798, + "loss": 0.8052, + "step": 227 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019711602694897037, + "loss": 0.7852, + "step": 228 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019708048725117192, + "loss": 0.9283, + "step": 229 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001970447331549207, + "loss": 0.9081, + "step": 230 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019700876473917824, + "loss": 0.9036, + "step": 231 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019697258208337934, + "loss": 0.716, + "step": 232 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019693618526743197, + "loss": 0.8192, + "step": 233 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968995743717171, + "loss": 0.9773, + "step": 234 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019686274947708848, + "loss": 
0.8698, + "step": 235 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968257106648724, + "loss": 0.9062, + "step": 236 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019678845801686764, + "loss": 0.8984, + "step": 237 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019675099161534521, + "loss": 0.8087, + "step": 238 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019671331154304822, + "loss": 0.8272, + "step": 239 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019667541788319162, + "loss": 0.784, + "step": 240 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019663731071946206, + "loss": 0.8777, + "step": 241 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019659899013601772, + "loss": 0.8534, + "step": 242 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019656045621748808, + "loss": 0.9645, + "step": 243 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019652170904897387, + "loss": 0.9692, + "step": 244 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019648274871604662, + "loss": 0.838, + "step": 245 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019644357530474872, + "loss": 0.7445, + "step": 246 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001964041889015931, + "loss": 0.9065, + "step": 247 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019636458959356316, + "loss": 0.7806, + "step": 248 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019632477746811232, + "loss": 0.7971, + "step": 249 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019628475261316417, + "loss": 0.8409, + "step": 250 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019624451511711198, + "loss": 0.7432, + "step": 251 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019620406506881875, + "loss": 0.9096, + "step": 252 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019616340255761676, + "loss": 0.8004, + "step": 253 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019612252767330763, + "loss": 0.7978, + "step": 254 + }, + { + "epoch": 0.58, + "learning_rate": 
0.0001960814405061619, + "loss": 0.9535, + "step": 255 + }, + { + "epoch": 0.59, + "learning_rate": 0.000196040141146919, + "loss": 0.9945, + "step": 256 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001959986296867869, + "loss": 0.9703, + "step": 257 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019595690621744208, + "loss": 0.9639, + "step": 258 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019591497083102914, + "loss": 0.9312, + "step": 259 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019587282362016083, + "loss": 0.7709, + "step": 260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001958304646779175, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019578789409784727, + "loss": 0.8081, + "step": 262 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019574511197396563, + "loss": 0.8476, + "step": 263 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019570211840075517, + "loss": 0.9658, + "step": 264 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019565891347316552, + "loss": 0.7778, + "step": 265 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001956154972866131, + "loss": 0.9926, + "step": 266 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001955718699369808, + "loss": 0.957, + "step": 267 + }, + { + "epoch": 0.61, + "learning_rate": 0.000195528031520618, + "loss": 0.9396, + "step": 268 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019548398213434007, + "loss": 0.9049, + "step": 269 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019543972187542833, + "loss": 0.9683, + "step": 270 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019539525084162992, + "loss": 0.8555, + "step": 271 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019535056913115725, + "loss": 0.8489, + "step": 272 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001953056768426882, + "loss": 0.8728, + "step": 273 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019526057407536564, + "loss": 0.9443, + "step": 274 + }, + { + "epoch": 0.63, + 
"learning_rate": 0.00019521526092879725, + "loss": 0.8161, + "step": 275 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019516973750305532, + "loss": 0.8936, + "step": 276 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019512400389867657, + "loss": 0.8315, + "step": 277 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019507806021666188, + "loss": 0.9298, + "step": 278 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019503190655847604, + "loss": 0.8235, + "step": 279 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019498554302604766, + "loss": 0.9245, + "step": 280 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001949389697217687, + "loss": 0.8302, + "step": 281 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019489218674849455, + "loss": 0.8488, + "step": 282 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019484519420954354, + "loss": 0.8177, + "step": 283 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019479799220869682, + "loss": 1.0039, + "step": 284 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019475058085019825, + "loss": 0.7685, + "step": 285 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019470296023875387, + "loss": 0.9174, + "step": 286 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194655130479532, + "loss": 1.0997, + "step": 287 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019460709167816274, + "loss": 0.9759, + "step": 288 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001945588439407379, + "loss": 0.9397, + "step": 289 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019451038737381077, + "loss": 1.0367, + "step": 290 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019446172208439574, + "loss": 0.8298, + "step": 291 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001944128481799682, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019436376576846423, + "loss": 1.1234, + "step": 293 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431447495828045, + "loss": 0.9103, + "step": 294 + }, + { + 
"epoch": 0.68, + "learning_rate": 0.0001942649758582737, + "loss": 0.7841, + "step": 295 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019421526857776072, + "loss": 0.8817, + "step": 296 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416535322651818, + "loss": 1.0682, + "step": 297 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019411522991478214, + "loss": 0.9201, + "step": 298 + }, + { + "epoch": 0.68, + "learning_rate": 0.000194064898753248, + "loss": 4.1834, + "step": 299 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019401435985307012, + "loss": 1.0391, + "step": 300 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019396361332586166, + "loss": 2.5015, + "step": 301 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001939126592836944, + "loss": 0.7927, + "step": 302 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001938614978390983, + "loss": 2.2345, + "step": 303 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019381012910506146, + "loss": 0.9311, + "step": 304 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019375855319502962, + "loss": 0.9713, + "step": 305 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019370677022290624, + "loss": 0.8967, + "step": 306 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019365478030305196, + "loss": 3.095, + "step": 307 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001936025835502845, + "loss": 1.1008, + "step": 308 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001935501800798783, + "loss": 1.5409, + "step": 309 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019349757000756444, + "loss": 1.02, + "step": 310 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019344475344953012, + "loss": 1.0101, + "step": 311 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001933917305224187, + "loss": 0.7686, + "step": 312 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001933385013433292, + "loss": 1.1061, + "step": 313 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932850660298162, + "loss": 0.8083, + "step": 314 + }, + 
{ + "epoch": 0.72, + "learning_rate": 0.0001932314246998895, + "loss": 1.1942, + "step": 315 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019317757747201384, + "loss": 0.8551, + "step": 316 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019312352446510878, + "loss": 0.9049, + "step": 317 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019306926579854821, + "loss": 0.7072, + "step": 318 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019301480159216028, + "loss": 0.8552, + "step": 319 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019296013196622706, + "loss": 0.8414, + "step": 320 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001929052570414843, + "loss": 0.9198, + "step": 321 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019285017693912107, + "loss": 2.1953, + "step": 322 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019279489178077969, + "loss": 0.851, + "step": 323 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019273940168855518, + "loss": 1.0239, + "step": 324 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019268370678499533, + "loss": 1.5125, + "step": 325 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019262780719310008, + "loss": 0.9171, + "step": 326 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019257170303632148, + "loss": 0.9794, + "step": 327 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019251539443856344, + "loss": 0.9023, + "step": 328 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019245888152418124, + "loss": 1.058, + "step": 329 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019240216441798142, + "loss": 0.9411, + "step": 330 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001923452432452215, + "loss": 1.197, + "step": 331 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922881181316097, + "loss": 0.9253, + "step": 332 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922307892033046, + "loss": 1.156, + "step": 333 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019217325658691482, + "loss": 0.9424, + "step": 
334 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019211552040949891, + "loss": 1.1147, + "step": 335 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019205758079856498, + "loss": 0.8528, + "step": 336 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001919994378820704, + "loss": 0.8105, + "step": 337 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019194109178842153, + "loss": 0.9279, + "step": 338 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019188254264647337, + "loss": 0.9231, + "step": 339 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019182379058552948, + "loss": 1.0425, + "step": 340 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019176483573534142, + "loss": 0.8794, + "step": 341 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019170567822610873, + "loss": 0.9873, + "step": 342 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001916463181884784, + "loss": 0.8146, + "step": 343 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019158675575354478, + "loss": 1.027, + "step": 344 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019152699105284913, + "loss": 0.8093, + "step": 345 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001914670242183795, + "loss": 0.951, + "step": 346 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019140685538257028, + "loss": 0.9268, + "step": 347 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019134648467830198, + "loss": 1.0205, + "step": 348 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019128591223890092, + "loss": 0.9043, + "step": 349 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019122513819813902, + "loss": 0.7387, + "step": 350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001911641626902333, + "loss": 0.9422, + "step": 351 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019110298584984578, + "loss": 0.9015, + "step": 352 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001910416078120832, + "loss": 0.7522, + "step": 353 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019098002871249646, + "loss": 0.9722, 
+ "step": 354 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001909182486870806, + "loss": 0.8358, + "step": 355 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019085626787227443, + "loss": 0.9859, + "step": 356 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019079408640496013, + "loss": 0.7796, + "step": 357 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019073170442246302, + "loss": 0.8617, + "step": 358 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906691220625513, + "loss": 0.7727, + "step": 359 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906063394634356, + "loss": 0.8786, + "step": 360 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001905433567637689, + "loss": 0.9117, + "step": 361 + }, + { + "epoch": 0.83, + "learning_rate": 0.000190480174102646, + "loss": 0.9182, + "step": 362 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001904167916196033, + "loss": 0.9706, + "step": 363 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001903532094546186, + "loss": 0.8036, + "step": 364 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001902894277481105, + "loss": 0.902, + "step": 365 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019022544664093854, + "loss": 0.9231, + "step": 366 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019016126627440237, + "loss": 0.9751, + "step": 367 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001900968867902419, + "loss": 0.8373, + "step": 368 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001900323083306367, + "loss": 0.8695, + "step": 369 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001899675310382057, + "loss": 0.8654, + "step": 370 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018990255505600706, + "loss": 0.98, + "step": 371 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018983738052753767, + "loss": 0.7454, + "step": 372 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018977200759673295, + "loss": 0.829, + "step": 373 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018970643640796642, + "loss": 
0.8262, + "step": 374 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001896406671060495, + "loss": 1.0659, + "step": 375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018957469983623112, + "loss": 0.8551, + "step": 376 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018950853474419742, + "loss": 0.7991, + "step": 377 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001894421719760714, + "loss": 0.8662, + "step": 378 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018937561167841263, + "loss": 0.8817, + "step": 379 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018930885399821693, + "loss": 1.0894, + "step": 380 + }, + { + "epoch": 0.87, + "learning_rate": 0.000189241899082916, + "loss": 0.8225, + "step": 381 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018917474708037718, + "loss": 0.9065, + "step": 382 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018910739813890302, + "loss": 0.8779, + "step": 383 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018903985240723104, + "loss": 0.7909, + "step": 384 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018897211003453328, + "loss": 0.7649, + "step": 385 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018890417117041619, + "loss": 0.9788, + "step": 386 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018883603596492004, + "loss": 0.938, + "step": 387 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018876770456851877, + "loss": 0.9032, + "step": 388 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018869917713211964, + "loss": 0.9059, + "step": 389 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018863045380706274, + "loss": 0.8896, + "step": 390 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001885615347451209, + "loss": 0.7614, + "step": 391 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884924200984991, + "loss": 0.978, + "step": 392 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884231100198344, + "loss": 0.9406, + "step": 393 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018835360466219533, + 
"loss": 0.7555, + "step": 394 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001882839041790818, + "loss": 0.9049, + "step": 395 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018821400872442458, + "loss": 0.7041, + "step": 396 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018814391845258505, + "loss": 0.8995, + "step": 397 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001880736335183548, + "loss": 0.7461, + "step": 398 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018800315407695539, + "loss": 0.9954, + "step": 399 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018793248028403788, + "loss": 0.9035, + "step": 400 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001878616122956826, + "loss": 0.9083, + "step": 401 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018779055026839868, + "loss": 0.7286, + "step": 402 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001877192943591239, + "loss": 0.8001, + "step": 403 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018764784472522403, + "loss": 0.8795, + "step": 404 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001875762015244929, + "loss": 0.8912, + "step": 405 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018750436491515163, + "loss": 0.8848, + "step": 406 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018743233505584862, + "loss": 0.8512, + "step": 407 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018736011210565898, + "loss": 0.8537, + "step": 408 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018728769622408423, + "loss": 0.8777, + "step": 409 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018721508757105202, + "loss": 0.7849, + "step": 410 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018714228630691576, + "loss": 0.9669, + "step": 411 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001870692925924541, + "loss": 0.9299, + "step": 412 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018699610658887088, + "loss": 1.0188, + "step": 413 + }, + { + "epoch": 0.95, + "learning_rate": 
0.00018692272845779448, + "loss": 0.8388, + "step": 414 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018684915836127765, + "loss": 0.7904, + "step": 415 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018677539646179707, + "loss": 0.9689, + "step": 416 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018670144292225297, + "loss": 0.7339, + "step": 417 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018662729790596888, + "loss": 0.7894, + "step": 418 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018655296157669117, + "loss": 0.7163, + "step": 419 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018647843409858869, + "loss": 0.8642, + "step": 420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018640371563625246, + "loss": 0.9281, + "step": 421 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018632880635469526, + "loss": 0.834, + "step": 422 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018625370641935129, + "loss": 0.7316, + "step": 423 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018617841599607586, + "loss": 0.8504, + "step": 424 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018610293525114492, + "loss": 0.8731, + "step": 425 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018602726435125474, + "loss": 0.8803, + "step": 426 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001859514034635215, + "loss": 0.8417, + "step": 427 + }, + { + "epoch": 0.98, + "learning_rate": 0.000185875352755481, + "loss": 0.8947, + "step": 428 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018579911239508827, + "loss": 0.8368, + "step": 429 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018572268255071718, + "loss": 0.8231, + "step": 430 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018564606339116, + "loss": 0.8576, + "step": 431 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001855692550856272, + "loss": 0.8753, + "step": 432 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018549225780374685, + "loss": 0.7778, + "step": 433 + }, + { + "epoch": 0.99, + 
"learning_rate": 0.00018541507171556445, + "loss": 0.7516, + "step": 434 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001853376969915425, + "loss": 0.7466, + "step": 435 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018526013380255999, + "loss": 0.917, + "step": 436 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018518238231991218, + "loss": 0.9042, + "step": 437 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018510444271531022, + "loss": 0.8587, + "step": 438 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018502631516088066, + "loss": 0.9001, + "step": 439 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001849479998291651, + "loss": 0.7977, + "step": 440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018486949689311993, + "loss": 0.8711, + "step": 441 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018479080652611583, + "loss": 0.7192, + "step": 442 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001847119289019373, + "loss": 0.9608, + "step": 443 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018463286419478255, + "loss": 0.7097, + "step": 444 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001845536125792629, + "loss": 0.7354, + "step": 445 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001844741742304024, + "loss": 0.8711, + "step": 446 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018439454932363755, + "loss": 0.8832, + "step": 447 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018431473803481684, + "loss": 0.932, + "step": 448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018423474054020034, + "loss": 0.8394, + "step": 449 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018415455701645942, + "loss": 0.7698, + "step": 450 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018407418764067627, + "loss": 0.8856, + "step": 451 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018399363259034347, + "loss": 0.8529, + "step": 452 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018391289204336368, + "loss": 0.9898, + "step": 453 + }, + { + 
"epoch": 1.04, + "learning_rate": 0.00018383196617804926, + "loss": 0.8312, + "step": 454 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018375085517312182, + "loss": 0.8234, + "step": 455 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018366955920771184, + "loss": 0.7871, + "step": 456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018358807846135825, + "loss": 0.9814, + "step": 457 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018350641311400812, + "loss": 0.8183, + "step": 458 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001834245633460161, + "loss": 0.8961, + "step": 459 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018334252933814427, + "loss": 0.9166, + "step": 460 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018326031127156148, + "loss": 1.0031, + "step": 461 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018317790932784317, + "loss": 0.8171, + "step": 462 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001830953236889707, + "loss": 0.83, + "step": 463 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018301255453733134, + "loss": 0.8134, + "step": 464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001829296020557174, + "loss": 0.8561, + "step": 465 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001828464664273263, + "loss": 0.8669, + "step": 466 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001827631478357597, + "loss": 1.003, + "step": 467 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018267964646502357, + "loss": 0.8715, + "step": 468 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018259596249952731, + "loss": 0.7434, + "step": 469 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018251209612408373, + "loss": 0.9163, + "step": 470 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018242804752390844, + "loss": 1.0639, + "step": 471 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018234381688461942, + "loss": 0.8266, + "step": 472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018225940439223684, + "loss": 0.7582, + "step": 
473 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001821748102331823, + "loss": 0.8547, + "step": 474 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001820900345942787, + "loss": 0.7908, + "step": 475 + }, + { + "epoch": 1.09, + "learning_rate": 0.00018200507766274977, + "loss": 0.6203, + "step": 476 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001819199396262195, + "loss": 0.806, + "step": 477 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001818346206727119, + "loss": 0.8016, + "step": 478 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001817491209906506, + "loss": 0.8548, + "step": 479 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018166344076885827, + "loss": 0.9194, + "step": 480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018157758019655634, + "loss": 0.8704, + "step": 481 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018149153946336446, + "loss": 0.8373, + "step": 482 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001814053187593003, + "loss": 0.8229, + "step": 483 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018131891827477884, + "loss": 0.8289, + "step": 484 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018123233820061218, + "loss": 0.7753, + "step": 485 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018114557872800905, + "loss": 1.029, + "step": 486 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001810586400485743, + "loss": 0.6198, + "step": 487 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001809715223543087, + "loss": 0.8418, + "step": 488 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018088422583760813, + "loss": 0.7421, + "step": 489 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001807967506912636, + "loss": 0.8032, + "step": 490 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018070909710846052, + "loss": 0.7956, + "step": 491 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018062126528277844, + "loss": 0.9013, + "step": 492 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018053325540819045, + "loss": 0.9582, + 
"step": 493 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018044506767906295, + "loss": 0.6845, + "step": 494 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018035670229015507, + "loss": 0.8731, + "step": 495 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001802681594366183, + "loss": 0.8369, + "step": 496 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018017943931399603, + "loss": 0.6557, + "step": 497 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018009054211822324, + "loss": 0.7997, + "step": 498 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001800014680456259, + "loss": 0.8348, + "step": 499 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001799122172929206, + "loss": 0.9043, + "step": 500 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017982279005721407, + "loss": 0.8499, + "step": 501 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017973318653600293, + "loss": 0.8595, + "step": 502 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017964340692717303, + "loss": 0.9468, + "step": 503 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001795534514289991, + "loss": 0.9848, + "step": 504 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017946332024014434, + "loss": 0.7326, + "step": 505 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017937301355965996, + "loss": 0.8479, + "step": 506 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017928253158698473, + "loss": 0.8669, + "step": 507 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017919187452194454, + "loss": 0.8163, + "step": 508 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017910104256475194, + "loss": 0.926, + "step": 509 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017901003591600575, + "loss": 0.7956, + "step": 510 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017891885477669064, + "loss": 0.9002, + "step": 511 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017882749934817652, + "loss": 0.787, + "step": 512 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017873596983221832, + 
"loss": 0.7519, + "step": 513 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001786442664309554, + "loss": 0.8067, + "step": 514 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017855238934691108, + "loss": 0.8824, + "step": 515 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001784603387829923, + "loss": 0.8014, + "step": 516 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017836811494248919, + "loss": 0.6672, + "step": 517 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017827571802907444, + "loss": 0.8516, + "step": 518 + }, + { + "epoch": 1.19, + "learning_rate": 0.000178183148246803, + "loss": 0.8476, + "step": 519 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017809040580011164, + "loss": 0.8493, + "step": 520 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001779974908938184, + "loss": 0.7288, + "step": 521 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017790440373312223, + "loss": 0.7443, + "step": 522 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017781114452360245, + "loss": 0.8767, + "step": 523 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017771771347121842, + "loss": 0.8025, + "step": 524 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001776241107823089, + "loss": 0.8842, + "step": 525 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017753033666359177, + "loss": 0.9648, + "step": 526 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017743639132216353, + "loss": 0.7872, + "step": 527 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001773422749654988, + "loss": 0.9122, + "step": 528 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017724798780144983, + "loss": 0.7688, + "step": 529 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001771535300382461, + "loss": 0.8938, + "step": 530 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017705890188449394, + "loss": 0.7152, + "step": 531 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001769641035491759, + "loss": 0.7077, + "step": 532 + }, + { + "epoch": 1.22, + "learning_rate": 
0.00017686913524165036, + "loss": 0.8872, + "step": 533 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017677399717165116, + "loss": 0.8775, + "step": 534 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017667868954928694, + "loss": 0.8508, + "step": 535 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017658321258504092, + "loss": 0.8589, + "step": 536 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017648756648977018, + "loss": 0.6499, + "step": 537 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017639175147470538, + "loss": 0.8927, + "step": 538 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017629576775145026, + "loss": 0.8702, + "step": 539 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017619961553198108, + "loss": 0.7958, + "step": 540 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017610329502864625, + "loss": 0.8582, + "step": 541 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017600680645416583, + "loss": 0.7905, + "step": 542 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001759101500216311, + "loss": 0.7574, + "step": 543 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017581332594450392, + "loss": 0.861, + "step": 544 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017571633443661658, + "loss": 0.7682, + "step": 545 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017561917571217093, + "loss": 0.7547, + "step": 546 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017552184998573825, + "loss": 0.7852, + "step": 547 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001754243574722586, + "loss": 0.7635, + "step": 548 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017532669838704035, + "loss": 0.8714, + "step": 549 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017522887294575977, + "loss": 0.7839, + "step": 550 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017513088136446054, + "loss": 0.8551, + "step": 551 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503272385955318, + "loss": 0.7367, + "step": 552 + }, + { + "epoch": 1.27, + 
"learning_rate": 0.00017493440064781475, + "loss": 0.9257, + "step": 553 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017483591194638817, + "loss": 0.8246, + "step": 554 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017473725797278192, + "loss": 0.8319, + "step": 555 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463843894486937, + "loss": 0.8304, + "step": 556 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017453945508088853, + "loss": 0.6536, + "step": 557 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017444030659944138, + "loss": 0.7606, + "step": 558 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017434099371949345, + "loss": 0.7084, + "step": 559 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017424151666037329, + "loss": 0.8891, + "step": 560 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017414187564177217, + "loss": 0.6199, + "step": 561 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017404207088374333, + "loss": 0.8676, + "step": 562 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001739421026067017, + "loss": 0.8477, + "step": 563 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017384197103142328, + "loss": 0.9234, + "step": 564 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001737416763790447, + "loss": 0.9103, + "step": 565 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017364121887106286, + "loss": 0.7859, + "step": 566 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017354059872933415, + "loss": 0.8623, + "step": 567 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017343981617607424, + "loss": 0.6266, + "step": 568 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017333887143385743, + "loss": 0.8105, + "step": 569 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017323776472561627, + "loss": 0.7752, + "step": 570 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001731364962746409, + "loss": 0.7873, + "step": 571 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001730350663045788, + "loss": 0.8425, + "step": 572 + }, + { + 
"epoch": 1.31, + "learning_rate": 0.00017293347503943406, + "loss": 0.777, + "step": 573 + }, + { + "epoch": 1.32, + "learning_rate": 0.000172831722703567, + "loss": 0.7348, + "step": 574 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017272980952169365, + "loss": 0.7797, + "step": 575 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001726277357188853, + "loss": 0.8328, + "step": 576 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017252550152056795, + "loss": 0.7109, + "step": 577 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001724231071525218, + "loss": 0.7905, + "step": 578 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017232055284088085, + "loss": 0.7541, + "step": 579 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001722178388121322, + "loss": 0.8954, + "step": 580 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017211496529311582, + "loss": 0.8362, + "step": 581 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017201193251102382, + "loss": 0.8436, + "step": 582 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017190874069340014, + "loss": 0.7594, + "step": 583 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001718053900681397, + "loss": 0.9342, + "step": 584 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017170188086348848, + "loss": 0.8934, + "step": 585 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017159821330804236, + "loss": 0.831, + "step": 586 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001714943876307472, + "loss": 0.8053, + "step": 587 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017139040406089786, + "loss": 0.81, + "step": 588 + }, + { + "epoch": 1.35, + "learning_rate": 0.000171286262828138, + "loss": 0.8245, + "step": 589 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017118196416245947, + "loss": 0.8232, + "step": 590 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017107750829420176, + "loss": 0.8244, + "step": 591 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001709728954540516, + "loss": 0.7863, + "step": 592 + }, 
+ { + "epoch": 1.36, + "learning_rate": 0.00017086812587304234, + "loss": 0.8274, + "step": 593 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017076319978255345, + "loss": 0.6595, + "step": 594 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001706581174143101, + "loss": 0.8582, + "step": 595 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017055287900038263, + "loss": 0.6873, + "step": 596 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017044748477318593, + "loss": 0.8673, + "step": 597 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017034193496547902, + "loss": 0.8055, + "step": 598 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017023622981036455, + "loss": 0.8232, + "step": 599 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001701303695412881, + "loss": 0.8745, + "step": 600 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017002435439203808, + "loss": 0.8034, + "step": 601 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016991818459674468, + "loss": 0.9006, + "step": 602 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001698118603898798, + "loss": 0.7828, + "step": 603 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016970538200625622, + "loss": 0.8413, + "step": 604 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016959874968102735, + "loss": 0.8669, + "step": 605 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016949196364968646, + "loss": 0.9277, + "step": 606 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016938502414806634, + "loss": 0.9256, + "step": 607 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016927793141233868, + "loss": 0.8613, + "step": 608 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016917068567901358, + "loss": 0.9439, + "step": 609 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016906328718493906, + "loss": 0.8606, + "step": 610 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016895573616730044, + "loss": 0.7483, + "step": 611 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016884803286362, + "loss": 0.8359, + 
"step": 612 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001687401775117562, + "loss": 0.7764, + "step": 613 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016863217034990342, + "loss": 0.9857, + "step": 614 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001685240116165912, + "loss": 0.8706, + "step": 615 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001684157015506839, + "loss": 0.867, + "step": 616 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016830724039138003, + "loss": 0.7974, + "step": 617 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016819862837821181, + "loss": 0.7835, + "step": 618 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016808986575104465, + "loss": 0.7987, + "step": 619 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001679809527500765, + "loss": 0.7383, + "step": 620 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001678718896158375, + "loss": 0.9224, + "step": 621 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016776267658918928, + "loss": 0.8959, + "step": 622 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016765331391132456, + "loss": 0.6702, + "step": 623 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001675438018237665, + "loss": 0.6911, + "step": 624 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016743414056836825, + "loss": 0.9364, + "step": 625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016732433038731242, + "loss": 0.7902, + "step": 626 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016721437152311054, + "loss": 0.8473, + "step": 627 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016710426421860235, + "loss": 0.8765, + "step": 628 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016699400871695555, + "loss": 0.7705, + "step": 629 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016688360526166514, + "loss": 0.8653, + "step": 630 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001667730540965528, + "loss": 0.9137, + "step": 631 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016666235546576648, + 
"loss": 0.9772, + "step": 632 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001665515096137797, + "loss": 0.6433, + "step": 633 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001664405167853912, + "loss": 0.8096, + "step": 634 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016632937722572434, + "loss": 0.7298, + "step": 635 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016621809118022647, + "loss": 0.6841, + "step": 636 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016610665889466838, + "loss": 0.9471, + "step": 637 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016599508061514404, + "loss": 0.8396, + "step": 638 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016588335658806962, + "loss": 0.8769, + "step": 639 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016577148706018328, + "loss": 0.8328, + "step": 640 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001656594722785445, + "loss": 0.8932, + "step": 641 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001655473124905335, + "loss": 0.8203, + "step": 642 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016543500794385084, + "loss": 0.8514, + "step": 643 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016532255888651666, + "loss": 0.7396, + "step": 644 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016520996556687028, + "loss": 0.9178, + "step": 645 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001650972282335697, + "loss": 0.6308, + "step": 646 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016498434713559088, + "loss": 0.9018, + "step": 647 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016487132252222727, + "loss": 0.8658, + "step": 648 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016475815464308933, + "loss": 0.8228, + "step": 649 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001646448437481039, + "loss": 0.8944, + "step": 650 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001645313900875136, + "loss": 0.8617, + "step": 651 + }, + { + "epoch": 1.49, + "learning_rate": 
0.00016441779391187646, + "loss": 0.9726, + "step": 652 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016430405547206516, + "loss": 0.693, + "step": 653 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016419017501926656, + "loss": 0.8272, + "step": 654 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016407615280498124, + "loss": 0.8523, + "step": 655 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016396198908102272, + "loss": 0.7444, + "step": 656 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016384768409951714, + "loss": 0.8366, + "step": 657 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001637332381129026, + "loss": 0.7441, + "step": 658 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016361865137392854, + "loss": 0.6694, + "step": 659 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001635039241356553, + "loss": 0.8103, + "step": 660 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001633890566514535, + "loss": 0.9135, + "step": 661 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016327404917500346, + "loss": 0.7327, + "step": 662 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016315890196029467, + "loss": 0.8425, + "step": 663 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016304361526162534, + "loss": 0.8812, + "step": 664 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016292818933360151, + "loss": 0.777, + "step": 665 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001628126244311369, + "loss": 0.8864, + "step": 666 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016269692080945198, + "loss": 0.9333, + "step": 667 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016258107872407375, + "loss": 0.906, + "step": 668 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016246509843083492, + "loss": 0.7346, + "step": 669 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016234898018587337, + "loss": 0.8555, + "step": 670 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016223272424563173, + "loss": 0.8449, + "step": 671 + }, + { + "epoch": 1.54, + 
"learning_rate": 0.00016211633086685664, + "loss": 0.8559, + "step": 672 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016199980030659838, + "loss": 0.7468, + "step": 673 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016188313282221008, + "loss": 0.7986, + "step": 674 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001617663286713474, + "loss": 0.7757, + "step": 675 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016164938811196757, + "loss": 0.8789, + "step": 676 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016153231140232936, + "loss": 0.5499, + "step": 677 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016141509880099206, + "loss": 0.9319, + "step": 678 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016129775056681513, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001611802669589575, + "loss": 0.8506, + "step": 680 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016106264823687716, + "loss": 0.7242, + "step": 681 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016094489466033043, + "loss": 0.6808, + "step": 682 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016082700648937146, + "loss": 0.8017, + "step": 683 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016070898398435167, + "loss": 0.9109, + "step": 684 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016059082740591915, + "loss": 0.7277, + "step": 685 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016047253701501808, + "loss": 0.8601, + "step": 686 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016035411307288813, + "loss": 0.9118, + "step": 687 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001602355558410639, + "loss": 0.8049, + "step": 688 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016011686558137448, + "loss": 0.8174, + "step": 689 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015999804255594258, + "loss": 0.8481, + "step": 690 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001598790870271843, + "loss": 0.7052, + "step": 691 + }, + { + 
"epoch": 1.59, + "learning_rate": 0.00015975999925780813, + "loss": 0.8208, + "step": 692 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015964077951081485, + "loss": 0.7257, + "step": 693 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015952142804949652, + "loss": 0.858, + "step": 694 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015940194513743624, + "loss": 0.9242, + "step": 695 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001592823310385073, + "loss": 0.7924, + "step": 696 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015916258601687274, + "loss": 0.8788, + "step": 697 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001590427103369848, + "loss": 0.7946, + "step": 698 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015892270426358414, + "loss": 0.8318, + "step": 699 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015880256806169953, + "loss": 0.8983, + "step": 700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015868230199664711, + "loss": 0.8889, + "step": 701 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015856190633402968, + "loss": 0.9692, + "step": 702 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001584413813397364, + "loss": 0.7787, + "step": 703 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015832072727994193, + "loss": 0.6455, + "step": 704 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015819994442110616, + "loss": 1.0006, + "step": 705 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015807903302997317, + "loss": 0.7384, + "step": 706 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015795799337357114, + "loss": 0.8517, + "step": 707 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015783682571921133, + "loss": 0.8446, + "step": 708 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015771553033448775, + "loss": 0.8227, + "step": 709 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015759410748727662, + "loss": 0.8374, + "step": 710 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001574725574457354, + "loss": 0.7274, + "step": 711 
+ }, + { + "epoch": 1.63, + "learning_rate": 0.00015735088047830268, + "loss": 0.8728, + "step": 712 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015722907685369723, + "loss": 1.0569, + "step": 713 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015710714684091762, + "loss": 0.9775, + "step": 714 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001569850907092415, + "loss": 0.6832, + "step": 715 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015686290872822504, + "loss": 0.7358, + "step": 716 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015674060116770236, + "loss": 0.9015, + "step": 717 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015661816829778494, + "loss": 0.8516, + "step": 718 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015649561038886094, + "loss": 0.8911, + "step": 719 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015637292771159472, + "loss": 0.7098, + "step": 720 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015625012053692615, + "loss": 0.955, + "step": 721 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001561271891360701, + "loss": 0.6421, + "step": 722 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001560041337805157, + "loss": 0.8807, + "step": 723 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015588095474202595, + "loss": 0.722, + "step": 724 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015575765229263686, + "loss": 0.8055, + "step": 725 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015563422670465712, + "loss": 0.7822, + "step": 726 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015551067825066728, + "loss": 0.8311, + "step": 727 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015538700720351924, + "loss": 0.8519, + "step": 728 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015526321383633568, + "loss": 0.7506, + "step": 729 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001551392984225094, + "loss": 0.8056, + "step": 730 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015501526123570277, + "loss": 0.6968, 
+ "step": 731 + }, + { + "epoch": 1.68, + "learning_rate": 0.000154891102549847, + "loss": 0.829, + "step": 732 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001547668226391417, + "loss": 0.6682, + "step": 733 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015464242177805422, + "loss": 0.8295, + "step": 734 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015451790024131895, + "loss": 0.6911, + "step": 735 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015439325830393687, + "loss": 0.6785, + "step": 736 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015426849624117472, + "loss": 0.81, + "step": 737 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015414361432856475, + "loss": 0.9955, + "step": 738 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015401861284190368, + "loss": 0.8433, + "step": 739 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015389349205725242, + "loss": 0.618, + "step": 740 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015376825225093537, + "loss": 0.7747, + "step": 741 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015364289369953967, + "loss": 0.7673, + "step": 742 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001535174166799148, + "loss": 0.8066, + "step": 743 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015339182146917183, + "loss": 0.8392, + "step": 744 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001532661083446829, + "loss": 0.7949, + "step": 745 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015314027758408044, + "loss": 0.8698, + "step": 746 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015301432946525684, + "loss": 0.7715, + "step": 747 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015288826426636354, + "loss": 0.7583, + "step": 748 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015276208226581064, + "loss": 0.8544, + "step": 749 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015263578374226605, + "loss": 0.8272, + "step": 750 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001525093689746552, + "loss": 
0.857, + "step": 751 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015238283824216015, + "loss": 0.9208, + "step": 752 + }, + { + "epoch": 1.73, + "learning_rate": 0.000152256191824219, + "loss": 0.8626, + "step": 753 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015212943000052545, + "loss": 0.9418, + "step": 754 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015200255305102803, + "loss": 0.8087, + "step": 755 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015187556125592945, + "loss": 0.7913, + "step": 756 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015174845489568622, + "loss": 0.8973, + "step": 757 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015162123425100762, + "loss": 0.701, + "step": 758 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015149389960285558, + "loss": 0.898, + "step": 759 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015136645123244366, + "loss": 0.8809, + "step": 760 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015123888942123652, + "loss": 0.7334, + "step": 761 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001511112144509495, + "loss": 0.8506, + "step": 762 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015098342660354775, + "loss": 0.8469, + "step": 763 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001508555261612457, + "loss": 1.0353, + "step": 764 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001507275134065065, + "loss": 0.6269, + "step": 765 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015059938862204127, + "loss": 0.7825, + "step": 766 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001504711520908086, + "loss": 0.8388, + "step": 767 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015034280409601385, + "loss": 0.7383, + "step": 768 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015021434492110852, + "loss": 0.8029, + "step": 769 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015008577484978966, + "loss": 0.6527, + "step": 770 + }, + { + "epoch": 1.77, + "learning_rate": 
0.00014995709416599926, + "loss": 0.9434, + "step": 771 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014982830315392358, + "loss": 0.753, + "step": 772 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014969940209799248, + "loss": 0.8143, + "step": 773 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014957039128287892, + "loss": 0.8939, + "step": 774 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001494412709934982, + "loss": 0.9265, + "step": 775 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014931204151500747, + "loss": 0.8261, + "step": 776 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014918270313280495, + "loss": 0.8555, + "step": 777 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014905325613252937, + "loss": 0.8191, + "step": 778 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014892370080005936, + "loss": 0.9159, + "step": 779 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014879403742151283, + "loss": 0.7936, + "step": 780 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014866426628324625, + "loss": 0.8782, + "step": 781 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014853438767185412, + "loss": 0.6078, + "step": 782 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001484044018741682, + "loss": 0.7182, + "step": 783 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014827430917725712, + "loss": 0.7528, + "step": 784 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014814410986842543, + "loss": 0.902, + "step": 785 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014801380423521324, + "loss": 0.8765, + "step": 786 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014788339256539544, + "loss": 0.6332, + "step": 787 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014775287514698105, + "loss": 0.7258, + "step": 788 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014762225226821273, + "loss": 0.7754, + "step": 789 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014749152421756595, + "loss": 0.7039, + "step": 790 + }, + { + "epoch": 1.81, + 
"learning_rate": 0.0001473606912837485, + "loss": 0.8563, + "step": 791 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014722975375569978, + "loss": 0.8956, + "step": 792 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014709871192259026, + "loss": 0.8724, + "step": 793 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001469675660738206, + "loss": 0.8885, + "step": 794 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014683631649902132, + "loss": 0.7637, + "step": 795 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014670496348805195, + "loss": 0.7596, + "step": 796 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014657350733100047, + "loss": 0.8221, + "step": 797 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014644194831818266, + "loss": 0.8475, + "step": 798 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014631028674014142, + "loss": 0.7966, + "step": 799 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014617852288764625, + "loss": 0.9186, + "step": 800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014604665705169237, + "loss": 0.9027, + "step": 801 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001459146895235004, + "loss": 0.9357, + "step": 802 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014578262059451537, + "loss": 0.9202, + "step": 803 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014565045055640638, + "loss": 0.9226, + "step": 804 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001455181797010658, + "loss": 0.8416, + "step": 805 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001453858083206086, + "loss": 0.8192, + "step": 806 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001452533367073718, + "loss": 0.8309, + "step": 807 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014512076515391375, + "loss": 0.7646, + "step": 808 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014498809395301356, + "loss": 0.9335, + "step": 809 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014485532339767037, + "loss": 0.9696, + "step": 810 + }, + { + 
"epoch": 1.86, + "learning_rate": 0.00014472245378110277, + "loss": 0.7, + "step": 811 + }, + { + "epoch": 1.86, + "learning_rate": 0.000144589485396748, + "loss": 0.8206, + "step": 812 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001444564185382617, + "loss": 0.7417, + "step": 813 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014432325349951667, + "loss": 0.6384, + "step": 814 + }, + { + "epoch": 1.87, + "learning_rate": 0.00014418999057460276, + "loss": 0.7801, + "step": 815 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001440566300578259, + "loss": 0.8459, + "step": 816 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001439231722437075, + "loss": 0.8863, + "step": 817 + }, + { + "epoch": 1.87, + "learning_rate": 0.000143789617426984, + "loss": 0.8502, + "step": 818 + }, + { + "epoch": 1.88, + "learning_rate": 0.000143655965902606, + "loss": 0.8522, + "step": 819 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014352221796573757, + "loss": 0.8612, + "step": 820 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014338837391175582, + "loss": 0.8065, + "step": 821 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001432544340362501, + "loss": 0.8777, + "step": 822 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014312039863502145, + "loss": 0.7731, + "step": 823 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014298626800408166, + "loss": 0.8791, + "step": 824 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014285204243965306, + "loss": 0.9095, + "step": 825 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014271772223816757, + "loss": 0.8846, + "step": 826 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014258330769626606, + "loss": 0.701, + "step": 827 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014244879911079779, + "loss": 0.7598, + "step": 828 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014231419677881966, + "loss": 1.0411, + "step": 829 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014217950099759569, + "loss": 0.6915, + "step": 830 + }, + 
{ + "epoch": 1.9, + "learning_rate": 0.00014204471206459628, + "loss": 0.8048, + "step": 831 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001419098302774974, + "loss": 0.7688, + "step": 832 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014177485593418028, + "loss": 0.7863, + "step": 833 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001416397893327304, + "loss": 0.7627, + "step": 834 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014150463077143712, + "loss": 0.7423, + "step": 835 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014136938054879283, + "loss": 0.7236, + "step": 836 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014123403896349227, + "loss": 0.8978, + "step": 837 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014109860631443213, + "loss": 0.9403, + "step": 838 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014096308290071003, + "loss": 0.7267, + "step": 839 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014082746902162414, + "loss": 0.7905, + "step": 840 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014069176497667242, + "loss": 0.8848, + "step": 841 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014055597106555192, + "loss": 0.9057, + "step": 842 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014042008758815818, + "loss": 0.7363, + "step": 843 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014028411484458454, + "loss": 0.8193, + "step": 844 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014014805313512145, + "loss": 0.7387, + "step": 845 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014001190276025593, + "loss": 0.8871, + "step": 846 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001398756640206707, + "loss": 0.7342, + "step": 847 + }, + { + "epoch": 1.94, + "learning_rate": 0.00013973933721724363, + "loss": 0.8557, + "step": 848 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001396029226510472, + "loss": 0.8778, + "step": 849 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013946642062334766, + "loss": 0.7844, + 
"step": 850 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013932983143560433, + "loss": 0.7941, + "step": 851 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013919315538946905, + "loss": 0.7505, + "step": 852 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001390563927867856, + "loss": 0.8371, + "step": 853 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013891954392958878, + "loss": 0.8128, + "step": 854 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001387826091201039, + "loss": 0.7127, + "step": 855 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013864558866074622, + "loss": 0.8165, + "step": 856 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013850848285411994, + "loss": 0.7103, + "step": 857 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013837129200301794, + "loss": 0.8373, + "step": 858 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013823401641042084, + "loss": 0.6908, + "step": 859 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013809665637949637, + "loss": 0.7358, + "step": 860 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013795921221359877, + "loss": 0.7545, + "step": 861 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013782168421626816, + "loss": 0.7681, + "step": 862 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013768407269122967, + "loss": 1.026, + "step": 863 + }, + { + "epoch": 1.98, + "learning_rate": 0.000137546377942393, + "loss": 0.761, + "step": 864 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001374086002738516, + "loss": 0.8442, + "step": 865 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013727073998988202, + "loss": 0.7959, + "step": 866 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013713279739494333, + "loss": 0.8061, + "step": 867 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013699477279367636, + "loss": 0.7434, + "step": 868 + }, + { + "epoch": 1.99, + "learning_rate": 0.000136856666490903, + "loss": 0.7159, + "step": 869 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013671847879162562, + 
"loss": 0.867, + "step": 870 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013658021000102636, + "loss": 0.9237, + "step": 871 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001364418604244664, + "loss": 0.8545, + "step": 872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013630343036748535, + "loss": 0.893, + "step": 873 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013616492013580062, + "loss": 0.9858, + "step": 874 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001360263300353066, + "loss": 0.6643, + "step": 875 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001358876603720741, + "loss": 0.8081, + "step": 876 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013574891145234962, + "loss": 0.7287, + "step": 877 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013561008358255468, + "loss": 0.8078, + "step": 878 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001354711770692853, + "loss": 0.6738, + "step": 879 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013533219221931102, + "loss": 0.7508, + "step": 880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001351931293395744, + "loss": 0.8724, + "step": 881 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001350539887371904, + "loss": 0.9317, + "step": 882 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013491477071944557, + "loss": 0.7664, + "step": 883 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013477547559379748, + "loss": 0.8065, + "step": 884 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013463610366787392, + "loss": 0.738, + "step": 885 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013449665524947234, + "loss": 0.7554, + "step": 886 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013435713064655912, + "loss": 0.7769, + "step": 887 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013421753016726887, + "loss": 0.6507, + "step": 888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0001340778541199038, + "loss": 0.7293, + "step": 889 + }, + { + "epoch": 2.04, + "learning_rate": 
0.00013393810281293292, + "loss": 0.8305, + "step": 890 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013379827655499163, + "loss": 0.7553, + "step": 891 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013365837565488064, + "loss": 0.7724, + "step": 892 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013351840042156565, + "loss": 0.7061, + "step": 893 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013337835116417648, + "loss": 0.7078, + "step": 894 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013323822819200643, + "loss": 0.8201, + "step": 895 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013309803181451156, + "loss": 0.746, + "step": 896 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013295776234131015, + "loss": 0.8276, + "step": 897 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001328174200821817, + "loss": 0.7922, + "step": 898 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001326770053470668, + "loss": 0.7577, + "step": 899 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013253651844606572, + "loss": 0.8217, + "step": 900 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013239595968943832, + "loss": 0.7883, + "step": 901 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013225532938760317, + "loss": 0.9568, + "step": 902 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013211462785113666, + "loss": 0.7348, + "step": 903 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013197385539077275, + "loss": 0.7558, + "step": 904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013183301231740183, + "loss": 0.7066, + "step": 905 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001316920989420703, + "loss": 0.7663, + "step": 906 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013155111557597985, + "loss": 0.79, + "step": 907 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013141006253048672, + "loss": 0.8237, + "step": 908 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001312689401171011, + "loss": 0.687, + "step": 909 + }, + { + "epoch": 2.08, + 
"learning_rate": 0.00013112774864748621, + "loss": 0.8254, + "step": 910 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001309864884334579, + "loss": 0.7641, + "step": 911 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001308451597869839, + "loss": 0.7845, + "step": 912 + }, + { + "epoch": 2.09, + "learning_rate": 0.00013070376302018287, + "loss": 0.8661, + "step": 913 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001305622984453241, + "loss": 0.9001, + "step": 914 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013042076637482654, + "loss": 0.7261, + "step": 915 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013027916712125826, + "loss": 0.7954, + "step": 916 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001301375009973356, + "loss": 0.792, + "step": 917 + }, + { + "epoch": 2.1, + "learning_rate": 0.00012999576831592273, + "loss": 0.8423, + "step": 918 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012985396939003065, + "loss": 0.8529, + "step": 919 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012971210453281674, + "loss": 0.9086, + "step": 920 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012957017405758401, + "loss": 0.7099, + "step": 921 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012942817827778038, + "loss": 0.7515, + "step": 922 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012928611750699783, + "loss": 0.7972, + "step": 923 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001291439920589722, + "loss": 0.6615, + "step": 924 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012900180224758185, + "loss": 0.8229, + "step": 925 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012885954838684743, + "loss": 0.8146, + "step": 926 + }, + { + "epoch": 2.12, + "learning_rate": 0.000128717230790931, + "loss": 0.8941, + "step": 927 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012857484977413545, + "loss": 0.7661, + "step": 928 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012843240565090365, + "loss": 0.7404, + "step": 929 + }, + { + "epoch": 
2.13, + "learning_rate": 0.00012828989873581785, + "loss": 0.7971, + "step": 930 + }, + { + "epoch": 2.13, + "learning_rate": 0.000128147329343599, + "loss": 0.6813, + "step": 931 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012800469778910601, + "loss": 0.7704, + "step": 932 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001278620043873351, + "loss": 0.7751, + "step": 933 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012771924945341906, + "loss": 0.841, + "step": 934 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012757643330262657, + "loss": 0.858, + "step": 935 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012743355625036143, + "loss": 0.6657, + "step": 936 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012729061861216213, + "loss": 0.7735, + "step": 937 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012714762070370077, + "loss": 0.8935, + "step": 938 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012700456284078264, + "loss": 0.9684, + "step": 939 + }, + { + "epoch": 2.15, + "learning_rate": 0.0001268614453393454, + "loss": 0.9117, + "step": 940 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012671826851545851, + "loss": 0.7613, + "step": 941 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012657503268532236, + "loss": 0.9567, + "step": 942 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012643173816526764, + "loss": 0.8725, + "step": 943 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012628838527175464, + "loss": 0.8088, + "step": 944 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012614497432137273, + "loss": 0.7655, + "step": 945 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012600150563083927, + "loss": 0.7585, + "step": 946 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001258579795169993, + "loss": 0.6351, + "step": 947 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001257143962968246, + "loss": 0.8408, + "step": 948 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012557075628741307, + "loss": 0.7144, + "step": 949 + }, + 
{ + "epoch": 2.18, + "learning_rate": 0.00012542705980598813, + "loss": 0.7022, + "step": 950 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012528330716989769, + "loss": 0.8635, + "step": 951 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001251394986966139, + "loss": 0.8489, + "step": 952 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012499563470373212, + "loss": 0.7563, + "step": 953 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012485171550897037, + "loss": 0.9245, + "step": 954 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012470774143016853, + "loss": 0.9168, + "step": 955 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001245637127852877, + "loss": 0.803, + "step": 956 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012441962989240952, + "loss": 0.722, + "step": 957 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001242754930697354, + "loss": 0.7944, + "step": 958 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012413130263558587, + "loss": 0.7759, + "step": 959 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012398705890839988, + "loss": 0.9407, + "step": 960 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012384276220673402, + "loss": 0.726, + "step": 961 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012369841284926188, + "loss": 0.7817, + "step": 962 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012355401115477345, + "loss": 0.6845, + "step": 963 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012340955744217412, + "loss": 0.7638, + "step": 964 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001232650520304843, + "loss": 0.8104, + "step": 965 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012312049523883852, + "loss": 0.8676, + "step": 966 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001229758873864848, + "loss": 0.7944, + "step": 967 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012283122879278393, + "loss": 0.8001, + "step": 968 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012268651977720866, + "loss": 0.7943, + "step": 
969 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001225417606593433, + "loss": 0.9679, + "step": 970 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012239695175888263, + "loss": 0.773, + "step": 971 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012225209339563145, + "loss": 0.7707, + "step": 972 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012210718588950376, + "loss": 0.6727, + "step": 973 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012196222956052214, + "loss": 0.7641, + "step": 974 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012181722472881697, + "loss": 0.8506, + "step": 975 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012167217171462566, + "loss": 0.8442, + "step": 976 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012152707083829217, + "loss": 0.7853, + "step": 977 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012138192242026614, + "loss": 0.7495, + "step": 978 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001212367267811021, + "loss": 0.739, + "step": 979 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012109148424145898, + "loss": 0.6531, + "step": 980 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012094619512209915, + "loss": 0.7721, + "step": 981 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012080085974388802, + "loss": 0.7346, + "step": 982 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001206554784277931, + "loss": 0.8709, + "step": 983 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012051005149488326, + "loss": 0.8111, + "step": 984 + }, + { + "epoch": 2.26, + "learning_rate": 0.0001203645792663282, + "loss": 0.8296, + "step": 985 + }, + { + "epoch": 2.26, + "learning_rate": 0.00012021906206339766, + "loss": 0.7569, + "step": 986 + }, + { + "epoch": 2.26, + "learning_rate": 0.00012007350020746068, + "loss": 0.7945, + "step": 987 + }, + { + "epoch": 2.26, + "learning_rate": 0.00011992789401998492, + "loss": 0.7818, + "step": 988 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011978224382253589, + "loss": 
0.59, + "step": 989 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011963654993677645, + "loss": 0.828, + "step": 990 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011949081268446571, + "loss": 0.7583, + "step": 991 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011934503238745878, + "loss": 0.7453, + "step": 992 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011919920936770568, + "loss": 0.826, + "step": 993 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011905334394725085, + "loss": 0.7673, + "step": 994 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011890743644823242, + "loss": 0.9637, + "step": 995 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011876148719288128, + "loss": 0.702, + "step": 996 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011861549650352069, + "loss": 0.856, + "step": 997 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011846946470256538, + "loss": 0.725, + "step": 998 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011832339211252084, + "loss": 0.7615, + "step": 999 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011817727905598268, + "loss": 0.7691, + "step": 1000 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011803112585563587, + "loss": 0.8347, + "step": 1001 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011788493283425397, + "loss": 0.908, + "step": 1002 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011773870031469862, + "loss": 0.8724, + "step": 1003 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011759242861991855, + "loss": 0.8801, + "step": 1004 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001174461180729491, + "loss": 0.861, + "step": 1005 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011729976899691137, + "loss": 0.8878, + "step": 1006 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011715338171501156, + "loss": 0.7662, + "step": 1007 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011700695655054026, + "loss": 0.7814, + "step": 1008 + }, + { + "epoch": 2.31, + "learning_rate": 
0.00011686049382687168, + "loss": 0.8727, + "step": 1009 + }, + { + "epoch": 2.31, + "learning_rate": 0.000116713993867463, + "loss": 0.8036, + "step": 1010 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011656745699585371, + "loss": 0.957, + "step": 1011 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011642088353566469, + "loss": 0.9257, + "step": 1012 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011627427381059772, + "loss": 0.7994, + "step": 1013 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011612762814443459, + "loss": 0.6582, + "step": 1014 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011598094686103653, + "loss": 0.7195, + "step": 1015 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011583423028434344, + "loss": 0.6673, + "step": 1016 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011568747873837307, + "loss": 0.8075, + "step": 1017 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011554069254722051, + "loss": 0.8945, + "step": 1018 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011539387203505727, + "loss": 0.6828, + "step": 1019 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011524701752613074, + "loss": 0.7014, + "step": 1020 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011510012934476338, + "loss": 0.8388, + "step": 1021 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011495320781535186, + "loss": 0.685, + "step": 1022 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011480625326236677, + "loss": 0.7141, + "step": 1023 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011465926601035137, + "loss": 0.8078, + "step": 1024 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011451224638392129, + "loss": 0.7924, + "step": 1025 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011436519470776362, + "loss": 0.9223, + "step": 1026 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011421811130663623, + "loss": 0.8251, + "step": 1027 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011407099650536706, + "loss": 0.9127, + "step": 1028 + }, + { 
+ "epoch": 2.36, + "learning_rate": 0.00011392385062885334, + "loss": 0.7634, + "step": 1029 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011377667400206101, + "loss": 0.7472, + "step": 1030 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011362946695002383, + "loss": 0.7838, + "step": 1031 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011348222979784289, + "loss": 0.9502, + "step": 1032 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011333496287068563, + "loss": 0.7066, + "step": 1033 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011318766649378532, + "loss": 0.9988, + "step": 1034 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011304034099244014, + "loss": 0.9448, + "step": 1035 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011289298669201282, + "loss": 0.7764, + "step": 1036 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011274560391792948, + "loss": 0.7351, + "step": 1037 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011259819299567922, + "loss": 0.895, + "step": 1038 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011245075425081328, + "loss": 0.718, + "step": 1039 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011230328800894437, + "loss": 0.7811, + "step": 1040 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001121557945957459, + "loss": 0.7859, + "step": 1041 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011200827433695127, + "loss": 0.7916, + "step": 1042 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011186072755835322, + "loss": 0.8321, + "step": 1043 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011171315458580303, + "loss": 0.7648, + "step": 1044 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011156555574520981, + "loss": 0.7691, + "step": 1045 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011141793136253986, + "loss": 0.6978, + "step": 1046 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011127028176381578, + "loss": 0.6725, + "step": 1047 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011112260727511596, + 
"loss": 0.8165, + "step": 1048 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011097490822257377, + "loss": 0.8662, + "step": 1049 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011082718493237669, + "loss": 0.8784, + "step": 1050 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011067943773076586, + "loss": 0.8533, + "step": 1051 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011053166694403521, + "loss": 0.6602, + "step": 1052 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001103838728985307, + "loss": 0.8363, + "step": 1053 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001102360559206497, + "loss": 0.8044, + "step": 1054 + }, + { + "epoch": 2.42, + "learning_rate": 0.00011008821633684019, + "loss": 0.8684, + "step": 1055 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010994035447360018, + "loss": 0.7158, + "step": 1056 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001097924706574767, + "loss": 0.7729, + "step": 1057 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010964456521506545, + "loss": 0.685, + "step": 1058 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010949663847300976, + "loss": 0.8647, + "step": 1059 + }, + { + "epoch": 2.43, + "learning_rate": 0.000109348690758, + "loss": 0.836, + "step": 1060 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010920072239677301, + "loss": 0.8494, + "step": 1061 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010905273371611105, + "loss": 0.9494, + "step": 1062 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010890472504284133, + "loss": 0.7832, + "step": 1063 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010875669670383521, + "loss": 0.7709, + "step": 1064 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010860864902600747, + "loss": 0.8175, + "step": 1065 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010846058233631565, + "loss": 0.8179, + "step": 1066 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010831249696175918, + "loss": 0.7686, + "step": 1067 + }, + { + "epoch": 2.45, + 
"learning_rate": 0.00010816439322937879, + "loss": 0.8491, + "step": 1068 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010801627146625588, + "loss": 0.7961, + "step": 1069 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010786813199951145, + "loss": 0.8408, + "step": 1070 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010771997515630574, + "loss": 0.8916, + "step": 1071 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010757180126383735, + "loss": 0.8035, + "step": 1072 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001074236106493425, + "loss": 0.9132, + "step": 1073 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001072754036400944, + "loss": 0.8029, + "step": 1074 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010712718056340236, + "loss": 0.6981, + "step": 1075 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010697894174661127, + "loss": 0.7829, + "step": 1076 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010683068751710075, + "loss": 0.7699, + "step": 1077 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010668241820228444, + "loss": 0.7342, + "step": 1078 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010653413412960935, + "loss": 0.7729, + "step": 1079 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010638583562655498, + "loss": 0.9097, + "step": 1080 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010623752302063283, + "loss": 0.8692, + "step": 1081 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010608919663938549, + "loss": 0.8861, + "step": 1082 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010594085681038588, + "loss": 0.7454, + "step": 1083 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010579250386123676, + "loss": 0.8291, + "step": 1084 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001056441381195698, + "loss": 0.7643, + "step": 1085 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010549575991304492, + "loss": 0.8242, + "step": 1086 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001053473695693496, + "loss": 0.9521, + 
"step": 1087 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010519896741619803, + "loss": 0.8142, + "step": 1088 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010505055378133067, + "loss": 0.7955, + "step": 1089 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010490212899251309, + "loss": 0.7363, + "step": 1090 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010475369337753569, + "loss": 0.8173, + "step": 1091 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010460524726421275, + "loss": 0.7659, + "step": 1092 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010445679098038157, + "loss": 0.8618, + "step": 1093 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010430832485390217, + "loss": 0.7606, + "step": 1094 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010415984921265609, + "loss": 0.8721, + "step": 1095 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010401136438454599, + "loss": 0.8152, + "step": 1096 + }, + { + "epoch": 2.51, + "learning_rate": 0.0001038628706974948, + "loss": 0.8934, + "step": 1097 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010371436847944503, + "loss": 0.8385, + "step": 1098 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010356585805835797, + "loss": 0.8581, + "step": 1099 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010341733976221313, + "loss": 0.788, + "step": 1100 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010326881391900724, + "loss": 0.7872, + "step": 1101 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010312028085675391, + "loss": 0.819, + "step": 1102 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010297174090348255, + "loss": 0.854, + "step": 1103 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010282319438723782, + "loss": 0.7121, + "step": 1104 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010267464163607889, + "loss": 0.8977, + "step": 1105 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010252608297807871, + "loss": 0.8411, + "step": 1106 + }, + { + "epoch": 2.54, + "learning_rate": 
0.00010237751874132322, + "loss": 0.834, + "step": 1107 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010222894925391073, + "loss": 0.7582, + "step": 1108 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010208037484395114, + "loss": 0.7773, + "step": 1109 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010193179583956523, + "loss": 0.7294, + "step": 1110 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010178321256888385, + "loss": 0.89, + "step": 1111 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010163462536004742, + "loss": 0.7675, + "step": 1112 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010148603454120487, + "loss": 0.7291, + "step": 1113 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010133744044051328, + "loss": 0.8403, + "step": 1114 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010118884338613688, + "loss": 0.8955, + "step": 1115 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010104024370624644, + "loss": 0.7537, + "step": 1116 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010089164172901851, + "loss": 0.8734, + "step": 1117 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010074303778263474, + "loss": 0.7312, + "step": 1118 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010059443219528117, + "loss": 0.7906, + "step": 1119 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010044582529514739, + "loss": 0.7756, + "step": 1120 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010029721741042586, + "loss": 0.9158, + "step": 1121 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010014860886931139, + "loss": 0.8481, + "step": 1122 + }, + { + "epoch": 2.57, + "learning_rate": 0.0001, + "loss": 0.8187, + "step": 1123 + }, + { + "epoch": 2.58, + "learning_rate": 9.985139113068865e-05, + "loss": 0.8507, + "step": 1124 + }, + { + "epoch": 2.58, + "learning_rate": 9.970278258957415e-05, + "loss": 0.7585, + "step": 1125 + }, + { + "epoch": 2.58, + "learning_rate": 9.955417470485265e-05, + "loss": 0.7163, + "step": 1126 + }, + { + "epoch": 2.58, 
+ "learning_rate": 9.940556780471885e-05, + "loss": 0.8124, + "step": 1127 + }, + { + "epoch": 2.58, + "learning_rate": 9.925696221736525e-05, + "loss": 0.924, + "step": 1128 + }, + { + "epoch": 2.59, + "learning_rate": 9.91083582709815e-05, + "loss": 0.843, + "step": 1129 + }, + { + "epoch": 2.59, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8461, + "step": 1130 + }, + { + "epoch": 2.59, + "learning_rate": 9.881115661386314e-05, + "loss": 0.757, + "step": 1131 + }, + { + "epoch": 2.59, + "learning_rate": 9.866255955948676e-05, + "loss": 0.7779, + "step": 1132 + }, + { + "epoch": 2.6, + "learning_rate": 9.851396545879516e-05, + "loss": 0.8325, + "step": 1133 + }, + { + "epoch": 2.6, + "learning_rate": 9.836537463995262e-05, + "loss": 0.7117, + "step": 1134 + }, + { + "epoch": 2.6, + "learning_rate": 9.821678743111618e-05, + "loss": 0.7209, + "step": 1135 + }, + { + "epoch": 2.6, + "learning_rate": 9.806820416043478e-05, + "loss": 0.6621, + "step": 1136 + }, + { + "epoch": 2.6, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7836, + "step": 1137 + }, + { + "epoch": 2.61, + "learning_rate": 9.777105074608928e-05, + "loss": 0.8576, + "step": 1138 + }, + { + "epoch": 2.61, + "learning_rate": 9.762248125867678e-05, + "loss": 0.6352, + "step": 1139 + }, + { + "epoch": 2.61, + "learning_rate": 9.747391702192132e-05, + "loss": 0.7828, + "step": 1140 + }, + { + "epoch": 2.61, + "learning_rate": 9.732535836392113e-05, + "loss": 0.6583, + "step": 1141 + }, + { + "epoch": 2.62, + "learning_rate": 9.717680561276219e-05, + "loss": 0.9171, + "step": 1142 + }, + { + "epoch": 2.62, + "learning_rate": 9.702825909651748e-05, + "loss": 0.8694, + "step": 1143 + }, + { + "epoch": 2.62, + "learning_rate": 9.687971914324607e-05, + "loss": 0.9293, + "step": 1144 + }, + { + "epoch": 2.62, + "learning_rate": 9.673118608099276e-05, + "loss": 0.7273, + "step": 1145 + }, + { + "epoch": 2.63, + "learning_rate": 9.658266023778689e-05, + "loss": 0.8386, + "step": 1146 + }, + { + 
"epoch": 2.63, + "learning_rate": 9.643414194164204e-05, + "loss": 0.727, + "step": 1147 + }, + { + "epoch": 2.63, + "learning_rate": 9.628563152055498e-05, + "loss": 0.9991, + "step": 1148 + }, + { + "epoch": 2.63, + "learning_rate": 9.61371293025052e-05, + "loss": 0.7304, + "step": 1149 + }, + { + "epoch": 2.63, + "learning_rate": 9.598863561545404e-05, + "loss": 0.8146, + "step": 1150 + }, + { + "epoch": 2.64, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8178, + "step": 1151 + }, + { + "epoch": 2.64, + "learning_rate": 9.569167514609786e-05, + "loss": 0.7202, + "step": 1152 + }, + { + "epoch": 2.64, + "learning_rate": 9.554320901961843e-05, + "loss": 0.728, + "step": 1153 + }, + { + "epoch": 2.64, + "learning_rate": 9.539475273578729e-05, + "loss": 0.7842, + "step": 1154 + }, + { + "epoch": 2.65, + "learning_rate": 9.524630662246432e-05, + "loss": 0.7706, + "step": 1155 + }, + { + "epoch": 2.65, + "learning_rate": 9.509787100748692e-05, + "loss": 0.802, + "step": 1156 + }, + { + "epoch": 2.65, + "learning_rate": 9.494944621866937e-05, + "loss": 0.9293, + "step": 1157 + }, + { + "epoch": 2.65, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8051, + "step": 1158 + }, + { + "epoch": 2.66, + "learning_rate": 9.465263043065045e-05, + "loss": 0.7449, + "step": 1159 + }, + { + "epoch": 2.66, + "learning_rate": 9.450424008695509e-05, + "loss": 0.7289, + "step": 1160 + }, + { + "epoch": 2.66, + "learning_rate": 9.43558618804302e-05, + "loss": 0.6778, + "step": 1161 + }, + { + "epoch": 2.66, + "learning_rate": 9.420749613876325e-05, + "loss": 0.7731, + "step": 1162 + }, + { + "epoch": 2.66, + "learning_rate": 9.405914318961414e-05, + "loss": 0.6934, + "step": 1163 + }, + { + "epoch": 2.67, + "learning_rate": 9.391080336061454e-05, + "loss": 0.9045, + "step": 1164 + }, + { + "epoch": 2.67, + "learning_rate": 9.376247697936719e-05, + "loss": 0.8016, + "step": 1165 + }, + { + "epoch": 2.67, + "learning_rate": 9.361416437344503e-05, + "loss": 0.6214, + "step": 
1166 + }, + { + "epoch": 2.67, + "learning_rate": 9.34658658703907e-05, + "loss": 0.6771, + "step": 1167 + }, + { + "epoch": 2.68, + "learning_rate": 9.331758179771561e-05, + "loss": 0.748, + "step": 1168 + }, + { + "epoch": 2.68, + "learning_rate": 9.316931248289926e-05, + "loss": 0.665, + "step": 1169 + }, + { + "epoch": 2.68, + "learning_rate": 9.302105825338876e-05, + "loss": 0.901, + "step": 1170 + }, + { + "epoch": 2.68, + "learning_rate": 9.287281943659767e-05, + "loss": 0.8342, + "step": 1171 + }, + { + "epoch": 2.68, + "learning_rate": 9.272459635990562e-05, + "loss": 0.853, + "step": 1172 + }, + { + "epoch": 2.69, + "learning_rate": 9.257638935065753e-05, + "loss": 0.8093, + "step": 1173 + }, + { + "epoch": 2.69, + "learning_rate": 9.242819873616268e-05, + "loss": 0.8451, + "step": 1174 + }, + { + "epoch": 2.69, + "learning_rate": 9.228002484369429e-05, + "loss": 0.8628, + "step": 1175 + }, + { + "epoch": 2.69, + "learning_rate": 9.213186800048861e-05, + "loss": 0.7858, + "step": 1176 + }, + { + "epoch": 2.7, + "learning_rate": 9.198372853374415e-05, + "loss": 0.9236, + "step": 1177 + }, + { + "epoch": 2.7, + "learning_rate": 9.183560677062119e-05, + "loss": 0.7925, + "step": 1178 + }, + { + "epoch": 2.7, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7105, + "step": 1179 + }, + { + "epoch": 2.7, + "learning_rate": 9.153941766368439e-05, + "loss": 0.7521, + "step": 1180 + }, + { + "epoch": 2.71, + "learning_rate": 9.139135097399254e-05, + "loss": 0.8648, + "step": 1181 + }, + { + "epoch": 2.71, + "learning_rate": 9.124330329616482e-05, + "loss": 0.8409, + "step": 1182 + }, + { + "epoch": 2.71, + "learning_rate": 9.109527495715872e-05, + "loss": 0.7198, + "step": 1183 + }, + { + "epoch": 2.71, + "learning_rate": 9.094726628388899e-05, + "loss": 0.7365, + "step": 1184 + }, + { + "epoch": 2.71, + "learning_rate": 9.0799277603227e-05, + "loss": 0.7699, + "step": 1185 + }, + { + "epoch": 2.72, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8041, 
+ "step": 1186 + }, + { + "epoch": 2.72, + "learning_rate": 9.050336152699025e-05, + "loss": 0.8308, + "step": 1187 + }, + { + "epoch": 2.72, + "learning_rate": 9.035543478493458e-05, + "loss": 0.8139, + "step": 1188 + }, + { + "epoch": 2.72, + "learning_rate": 9.02075293425233e-05, + "loss": 0.7394, + "step": 1189 + }, + { + "epoch": 2.73, + "learning_rate": 9.005964552639984e-05, + "loss": 0.6738, + "step": 1190 + }, + { + "epoch": 2.73, + "learning_rate": 8.991178366315982e-05, + "loss": 0.9421, + "step": 1191 + }, + { + "epoch": 2.73, + "learning_rate": 8.976394407935034e-05, + "loss": 0.8747, + "step": 1192 + }, + { + "epoch": 2.73, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8282, + "step": 1193 + }, + { + "epoch": 2.74, + "learning_rate": 8.94683330559648e-05, + "loss": 0.765, + "step": 1194 + }, + { + "epoch": 2.74, + "learning_rate": 8.932056226923416e-05, + "loss": 0.8515, + "step": 1195 + }, + { + "epoch": 2.74, + "learning_rate": 8.917281506762335e-05, + "loss": 0.6194, + "step": 1196 + }, + { + "epoch": 2.74, + "learning_rate": 8.902509177742626e-05, + "loss": 0.8852, + "step": 1197 + }, + { + "epoch": 2.74, + "learning_rate": 8.887739272488406e-05, + "loss": 0.7481, + "step": 1198 + }, + { + "epoch": 2.75, + "learning_rate": 8.872971823618424e-05, + "loss": 0.7979, + "step": 1199 + }, + { + "epoch": 2.75, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8332, + "step": 1200 + }, + { + "epoch": 2.75, + "learning_rate": 8.843444425479022e-05, + "loss": 0.6716, + "step": 1201 + }, + { + "epoch": 2.75, + "learning_rate": 8.828684541419696e-05, + "loss": 0.9192, + "step": 1202 + }, + { + "epoch": 2.76, + "learning_rate": 8.813927244164679e-05, + "loss": 0.8463, + "step": 1203 + }, + { + "epoch": 2.76, + "learning_rate": 8.799172566304874e-05, + "loss": 0.6598, + "step": 1204 + }, + { + "epoch": 2.76, + "learning_rate": 8.784420540425412e-05, + "loss": 0.7823, + "step": 1205 + }, + { + "epoch": 2.76, + "learning_rate": 8.769671199105565e-05, 
+ "loss": 0.8728, + "step": 1206 + }, + { + "epoch": 2.77, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7665, + "step": 1207 + }, + { + "epoch": 2.77, + "learning_rate": 8.74018070043208e-05, + "loss": 0.8008, + "step": 1208 + }, + { + "epoch": 2.77, + "learning_rate": 8.725439608207056e-05, + "loss": 0.6833, + "step": 1209 + }, + { + "epoch": 2.77, + "learning_rate": 8.710701330798719e-05, + "loss": 0.7801, + "step": 1210 + }, + { + "epoch": 2.77, + "learning_rate": 8.695965900755985e-05, + "loss": 0.6308, + "step": 1211 + }, + { + "epoch": 2.78, + "learning_rate": 8.68123335062147e-05, + "loss": 0.7851, + "step": 1212 + }, + { + "epoch": 2.78, + "learning_rate": 8.666503712931439e-05, + "loss": 0.7592, + "step": 1213 + }, + { + "epoch": 2.78, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8727, + "step": 1214 + }, + { + "epoch": 2.78, + "learning_rate": 8.637053304997618e-05, + "loss": 0.903, + "step": 1215 + }, + { + "epoch": 2.79, + "learning_rate": 8.622332599793906e-05, + "loss": 0.8076, + "step": 1216 + }, + { + "epoch": 2.79, + "learning_rate": 8.607614937114671e-05, + "loss": 0.8975, + "step": 1217 + }, + { + "epoch": 2.79, + "learning_rate": 8.592900349463297e-05, + "loss": 0.8249, + "step": 1218 + }, + { + "epoch": 2.79, + "learning_rate": 8.578188869336377e-05, + "loss": 0.8529, + "step": 1219 + }, + { + "epoch": 2.79, + "learning_rate": 8.563480529223638e-05, + "loss": 0.8351, + "step": 1220 + }, + { + "epoch": 2.8, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8934, + "step": 1221 + }, + { + "epoch": 2.8, + "learning_rate": 8.534073398964866e-05, + "loss": 0.8067, + "step": 1222 + }, + { + "epoch": 2.8, + "learning_rate": 8.519374673763326e-05, + "loss": 0.8508, + "step": 1223 + }, + { + "epoch": 2.8, + "learning_rate": 8.504679218464816e-05, + "loss": 0.7419, + "step": 1224 + }, + { + "epoch": 2.81, + "learning_rate": 8.489987065523668e-05, + "loss": 0.7808, + "step": 1225 + }, + { + "epoch": 2.81, + "learning_rate": 
8.475298247386927e-05, + "loss": 0.8603, + "step": 1226 + }, + { + "epoch": 2.81, + "learning_rate": 8.460612796494272e-05, + "loss": 0.8818, + "step": 1227 + }, + { + "epoch": 2.81, + "learning_rate": 8.445930745277953e-05, + "loss": 0.779, + "step": 1228 + }, + { + "epoch": 2.82, + "learning_rate": 8.431252126162695e-05, + "loss": 0.766, + "step": 1229 + }, + { + "epoch": 2.82, + "learning_rate": 8.41657697156566e-05, + "loss": 0.8743, + "step": 1230 + }, + { + "epoch": 2.82, + "learning_rate": 8.40190531389635e-05, + "loss": 0.882, + "step": 1231 + }, + { + "epoch": 2.82, + "learning_rate": 8.387237185556545e-05, + "loss": 0.7422, + "step": 1232 + }, + { + "epoch": 2.82, + "learning_rate": 8.372572618940231e-05, + "loss": 0.9271, + "step": 1233 + }, + { + "epoch": 2.83, + "learning_rate": 8.357911646433535e-05, + "loss": 0.8051, + "step": 1234 + }, + { + "epoch": 2.83, + "learning_rate": 8.343254300414628e-05, + "loss": 0.782, + "step": 1235 + }, + { + "epoch": 2.83, + "learning_rate": 8.3286006132537e-05, + "loss": 0.8754, + "step": 1236 + }, + { + "epoch": 2.83, + "learning_rate": 8.313950617312835e-05, + "loss": 0.8249, + "step": 1237 + }, + { + "epoch": 2.84, + "learning_rate": 8.299304344945977e-05, + "loss": 0.8342, + "step": 1238 + }, + { + "epoch": 2.84, + "learning_rate": 8.284661828498847e-05, + "loss": 0.8593, + "step": 1239 + }, + { + "epoch": 2.84, + "learning_rate": 8.270023100308865e-05, + "loss": 0.7507, + "step": 1240 + }, + { + "epoch": 2.84, + "learning_rate": 8.255388192705093e-05, + "loss": 0.8462, + "step": 1241 + }, + { + "epoch": 2.85, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8322, + "step": 1242 + }, + { + "epoch": 2.85, + "learning_rate": 8.22612996853014e-05, + "loss": 0.8963, + "step": 1243 + }, + { + "epoch": 2.85, + "learning_rate": 8.211506716574602e-05, + "loss": 0.7419, + "step": 1244 + }, + { + "epoch": 2.85, + "learning_rate": 8.196887414436416e-05, + "loss": 0.8225, + "step": 1245 + }, + { + "epoch": 2.85, + 
"learning_rate": 8.182272094401735e-05, + "loss": 0.8539, + "step": 1246 + }, + { + "epoch": 2.86, + "learning_rate": 8.167660788747919e-05, + "loss": 0.7852, + "step": 1247 + }, + { + "epoch": 2.86, + "learning_rate": 8.153053529743465e-05, + "loss": 0.9128, + "step": 1248 + }, + { + "epoch": 2.86, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7328, + "step": 1249 + }, + { + "epoch": 2.86, + "learning_rate": 8.123851280711877e-05, + "loss": 0.8816, + "step": 1250 + }, + { + "epoch": 2.87, + "learning_rate": 8.10925635517676e-05, + "loss": 0.7267, + "step": 1251 + }, + { + "epoch": 2.87, + "learning_rate": 8.094665605274913e-05, + "loss": 0.7362, + "step": 1252 + }, + { + "epoch": 2.87, + "learning_rate": 8.080079063229432e-05, + "loss": 0.7475, + "step": 1253 + }, + { + "epoch": 2.87, + "learning_rate": 8.065496761254126e-05, + "loss": 0.7727, + "step": 1254 + }, + { + "epoch": 2.88, + "learning_rate": 8.050918731553431e-05, + "loss": 0.746, + "step": 1255 + }, + { + "epoch": 2.88, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8132, + "step": 1256 + }, + { + "epoch": 2.88, + "learning_rate": 8.021775617746412e-05, + "loss": 0.6752, + "step": 1257 + }, + { + "epoch": 2.88, + "learning_rate": 8.007210598001512e-05, + "loss": 0.7468, + "step": 1258 + }, + { + "epoch": 2.88, + "learning_rate": 7.992649979253934e-05, + "loss": 0.9141, + "step": 1259 + }, + { + "epoch": 2.89, + "learning_rate": 7.978093793660233e-05, + "loss": 0.7706, + "step": 1260 + }, + { + "epoch": 2.89, + "learning_rate": 7.963542073367181e-05, + "loss": 0.8399, + "step": 1261 + }, + { + "epoch": 2.89, + "learning_rate": 7.948994850511677e-05, + "loss": 0.834, + "step": 1262 + }, + { + "epoch": 2.89, + "learning_rate": 7.934452157220694e-05, + "loss": 0.767, + "step": 1263 + }, + { + "epoch": 2.9, + "learning_rate": 7.9199140256112e-05, + "loss": 0.75, + "step": 1264 + }, + { + "epoch": 2.9, + "learning_rate": 7.905380487790088e-05, + "loss": 0.81, + "step": 1265 + }, + { + "epoch": 
2.9, + "learning_rate": 7.890851575854108e-05, + "loss": 0.8931, + "step": 1266 + }, + { + "epoch": 2.9, + "learning_rate": 7.876327321889795e-05, + "loss": 0.8929, + "step": 1267 + }, + { + "epoch": 2.9, + "learning_rate": 7.861807757973387e-05, + "loss": 0.787, + "step": 1268 + }, + { + "epoch": 2.91, + "learning_rate": 7.847292916170784e-05, + "loss": 0.8072, + "step": 1269 + }, + { + "epoch": 2.91, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8121, + "step": 1270 + }, + { + "epoch": 2.91, + "learning_rate": 7.818277527118307e-05, + "loss": 0.7951, + "step": 1271 + }, + { + "epoch": 2.91, + "learning_rate": 7.803777043947789e-05, + "loss": 0.7093, + "step": 1272 + }, + { + "epoch": 2.92, + "learning_rate": 7.789281411049625e-05, + "loss": 0.7827, + "step": 1273 + }, + { + "epoch": 2.92, + "learning_rate": 7.774790660436858e-05, + "loss": 0.7433, + "step": 1274 + }, + { + "epoch": 2.92, + "learning_rate": 7.760304824111741e-05, + "loss": 0.7359, + "step": 1275 + }, + { + "epoch": 2.92, + "learning_rate": 7.745823934065671e-05, + "loss": 0.7157, + "step": 1276 + }, + { + "epoch": 2.93, + "learning_rate": 7.731348022279134e-05, + "loss": 0.961, + "step": 1277 + }, + { + "epoch": 2.93, + "learning_rate": 7.716877120721611e-05, + "loss": 0.7718, + "step": 1278 + }, + { + "epoch": 2.93, + "learning_rate": 7.702411261351523e-05, + "loss": 0.835, + "step": 1279 + }, + { + "epoch": 2.93, + "learning_rate": 7.68795047611615e-05, + "loss": 0.9129, + "step": 1280 + }, + { + "epoch": 2.93, + "learning_rate": 7.673494796951573e-05, + "loss": 0.7635, + "step": 1281 + }, + { + "epoch": 2.94, + "learning_rate": 7.659044255782593e-05, + "loss": 0.6873, + "step": 1282 + }, + { + "epoch": 2.94, + "learning_rate": 7.644598884522659e-05, + "loss": 0.6434, + "step": 1283 + }, + { + "epoch": 2.94, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8408, + "step": 1284 + }, + { + "epoch": 2.94, + "learning_rate": 7.615723779326599e-05, + "loss": 0.9042, + "step": 1285 + }, + 
{ + "epoch": 2.95, + "learning_rate": 7.601294109160012e-05, + "loss": 0.7996, + "step": 1286 + }, + { + "epoch": 2.95, + "learning_rate": 7.586869736441413e-05, + "loss": 0.923, + "step": 1287 + }, + { + "epoch": 2.95, + "learning_rate": 7.572450693026462e-05, + "loss": 0.7661, + "step": 1288 + }, + { + "epoch": 2.95, + "learning_rate": 7.55803701075905e-05, + "loss": 0.9105, + "step": 1289 + }, + { + "epoch": 2.96, + "learning_rate": 7.543628721471233e-05, + "loss": 0.8071, + "step": 1290 + }, + { + "epoch": 2.96, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8234, + "step": 1291 + }, + { + "epoch": 2.96, + "learning_rate": 7.514828449102966e-05, + "loss": 0.8131, + "step": 1292 + }, + { + "epoch": 2.96, + "learning_rate": 7.500436529626786e-05, + "loss": 0.8149, + "step": 1293 + }, + { + "epoch": 2.96, + "learning_rate": 7.486050130338612e-05, + "loss": 0.8441, + "step": 1294 + }, + { + "epoch": 2.97, + "learning_rate": 7.471669283010232e-05, + "loss": 0.8269, + "step": 1295 + }, + { + "epoch": 2.97, + "learning_rate": 7.457294019401191e-05, + "loss": 0.632, + "step": 1296 + }, + { + "epoch": 2.97, + "learning_rate": 7.442924371258694e-05, + "loss": 0.8522, + "step": 1297 + }, + { + "epoch": 2.97, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8387, + "step": 1298 + }, + { + "epoch": 2.98, + "learning_rate": 7.414202048300072e-05, + "loss": 0.887, + "step": 1299 + }, + { + "epoch": 2.98, + "learning_rate": 7.399849436916077e-05, + "loss": 0.8273, + "step": 1300 + }, + { + "epoch": 2.98, + "learning_rate": 7.385502567862728e-05, + "loss": 0.7807, + "step": 1301 + }, + { + "epoch": 2.98, + "learning_rate": 7.371161472824536e-05, + "loss": 0.9077, + "step": 1302 + }, + { + "epoch": 2.99, + "learning_rate": 7.35682618347324e-05, + "loss": 0.8779, + "step": 1303 + }, + { + "epoch": 2.99, + "learning_rate": 7.342496731467767e-05, + "loss": 0.8595, + "step": 1304 + }, + { + "epoch": 2.99, + "learning_rate": 7.328173148454151e-05, + "loss": 0.8391, + 
"step": 1305 + }, + { + "epoch": 2.99, + "learning_rate": 7.31385546606546e-05, + "loss": 0.7559, + "step": 1306 + }, + { + "epoch": 2.99, + "learning_rate": 7.29954371592174e-05, + "loss": 0.8926, + "step": 1307 + }, + { + "epoch": 3.0, + "learning_rate": 7.285237929629928e-05, + "loss": 0.8443, + "step": 1308 + }, + { + "epoch": 3.0, + "learning_rate": 7.27093813878379e-05, + "loss": 0.7854, + "step": 1309 + }, + { + "epoch": 3.0, + "learning_rate": 7.256644374963857e-05, + "loss": 0.9361, + "step": 1310 + }, + { + "epoch": 3.0, + "learning_rate": 7.242356669737344e-05, + "loss": 0.7515, + "step": 1311 + }, + { + "epoch": 3.01, + "learning_rate": 7.228075054658096e-05, + "loss": 0.5228, + "step": 1312 + }, + { + "epoch": 3.01, + "learning_rate": 7.213799561266489e-05, + "loss": 0.8614, + "step": 1313 + }, + { + "epoch": 3.01, + "learning_rate": 7.199530221089398e-05, + "loss": 0.6461, + "step": 1314 + }, + { + "epoch": 3.01, + "learning_rate": 7.185267065640104e-05, + "loss": 0.6926, + "step": 1315 + }, + { + "epoch": 3.01, + "learning_rate": 7.171010126418218e-05, + "loss": 0.8601, + "step": 1316 + }, + { + "epoch": 3.02, + "learning_rate": 7.156759434909639e-05, + "loss": 0.784, + "step": 1317 + }, + { + "epoch": 3.02, + "learning_rate": 7.142515022586456e-05, + "loss": 1.0793, + "step": 1318 + }, + { + "epoch": 3.02, + "learning_rate": 7.1282769209069e-05, + "loss": 0.71, + "step": 1319 + }, + { + "epoch": 3.02, + "learning_rate": 7.114045161315261e-05, + "loss": 0.7129, + "step": 1320 + }, + { + "epoch": 3.03, + "learning_rate": 7.099819775241819e-05, + "loss": 0.6223, + "step": 1321 + }, + { + "epoch": 3.03, + "learning_rate": 7.085600794102783e-05, + "loss": 0.643, + "step": 1322 + }, + { + "epoch": 3.03, + "learning_rate": 7.071388249300218e-05, + "loss": 0.7678, + "step": 1323 + }, + { + "epoch": 3.03, + "learning_rate": 7.057182172221967e-05, + "loss": 0.6995, + "step": 1324 + }, + { + "epoch": 3.04, + "learning_rate": 7.042982594241601e-05, + "loss": 
0.6812, + "step": 1325 + }, + { + "epoch": 3.04, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7234, + "step": 1326 + }, + { + "epoch": 3.04, + "learning_rate": 7.014603060996938e-05, + "loss": 0.8338, + "step": 1327 + }, + { + "epoch": 3.04, + "learning_rate": 7.00042316840773e-05, + "loss": 0.9738, + "step": 1328 + }, + { + "epoch": 3.04, + "learning_rate": 6.98624990026644e-05, + "loss": 0.6211, + "step": 1329 + }, + { + "epoch": 3.05, + "learning_rate": 6.972083287874177e-05, + "loss": 0.7343, + "step": 1330 + }, + { + "epoch": 3.05, + "learning_rate": 6.957923362517348e-05, + "loss": 0.7291, + "step": 1331 + }, + { + "epoch": 3.05, + "learning_rate": 6.943770155467593e-05, + "loss": 0.7687, + "step": 1332 + }, + { + "epoch": 3.05, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7509, + "step": 1333 + }, + { + "epoch": 3.06, + "learning_rate": 6.915484021301613e-05, + "loss": 0.769, + "step": 1334 + }, + { + "epoch": 3.06, + "learning_rate": 6.90135115665421e-05, + "loss": 0.7605, + "step": 1335 + }, + { + "epoch": 3.06, + "learning_rate": 6.887225135251381e-05, + "loss": 0.7519, + "step": 1336 + }, + { + "epoch": 3.06, + "learning_rate": 6.873105988289892e-05, + "loss": 0.7648, + "step": 1337 + }, + { + "epoch": 3.07, + "learning_rate": 6.858993746951328e-05, + "loss": 0.8969, + "step": 1338 + }, + { + "epoch": 3.07, + "learning_rate": 6.844888442402018e-05, + "loss": 0.7229, + "step": 1339 + }, + { + "epoch": 3.07, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6294, + "step": 1340 + }, + { + "epoch": 3.07, + "learning_rate": 6.816698768259824e-05, + "loss": 0.7872, + "step": 1341 + }, + { + "epoch": 3.07, + "learning_rate": 6.802614460922728e-05, + "loss": 0.7555, + "step": 1342 + }, + { + "epoch": 3.08, + "learning_rate": 6.788537214886335e-05, + "loss": 0.7431, + "step": 1343 + }, + { + "epoch": 3.08, + "learning_rate": 6.774467061239687e-05, + "loss": 0.7502, + "step": 1344 + }, + { + "epoch": 3.08, + "learning_rate": 
6.760404031056169e-05, + "loss": 0.9202, + "step": 1345 + }, + { + "epoch": 3.08, + "learning_rate": 6.74634815539343e-05, + "loss": 0.8221, + "step": 1346 + }, + { + "epoch": 3.09, + "learning_rate": 6.732299465293322e-05, + "loss": 0.8935, + "step": 1347 + }, + { + "epoch": 3.09, + "learning_rate": 6.718257991781828e-05, + "loss": 0.6869, + "step": 1348 + }, + { + "epoch": 3.09, + "learning_rate": 6.704223765868991e-05, + "loss": 0.6931, + "step": 1349 + }, + { + "epoch": 3.09, + "learning_rate": 6.690196818548846e-05, + "loss": 0.7308, + "step": 1350 + }, + { + "epoch": 3.1, + "learning_rate": 6.67617718079936e-05, + "loss": 0.779, + "step": 1351 + }, + { + "epoch": 3.1, + "learning_rate": 6.662164883582354e-05, + "loss": 0.7807, + "step": 1352 + }, + { + "epoch": 3.1, + "learning_rate": 6.648159957843438e-05, + "loss": 0.7942, + "step": 1353 + }, + { + "epoch": 3.1, + "learning_rate": 6.63416243451194e-05, + "loss": 0.842, + "step": 1354 + }, + { + "epoch": 3.1, + "learning_rate": 6.62017234450084e-05, + "loss": 0.9713, + "step": 1355 + }, + { + "epoch": 3.11, + "learning_rate": 6.60618971870671e-05, + "loss": 0.5946, + "step": 1356 + }, + { + "epoch": 3.11, + "learning_rate": 6.592214588009625e-05, + "loss": 0.656, + "step": 1357 + }, + { + "epoch": 3.11, + "learning_rate": 6.578246983273118e-05, + "loss": 0.7192, + "step": 1358 + }, + { + "epoch": 3.11, + "learning_rate": 6.564286935344089e-05, + "loss": 0.7485, + "step": 1359 + }, + { + "epoch": 3.12, + "learning_rate": 6.550334475052767e-05, + "loss": 0.8379, + "step": 1360 + }, + { + "epoch": 3.12, + "learning_rate": 6.536389633212609e-05, + "loss": 0.9204, + "step": 1361 + }, + { + "epoch": 3.12, + "learning_rate": 6.522452440620254e-05, + "loss": 0.7924, + "step": 1362 + }, + { + "epoch": 3.12, + "learning_rate": 6.508522928055445e-05, + "loss": 0.7988, + "step": 1363 + }, + { + "epoch": 3.12, + "learning_rate": 6.494601126280963e-05, + "loss": 0.7678, + "step": 1364 + }, + { + "epoch": 3.13, + 
"learning_rate": 6.480687066042561e-05, + "loss": 0.7079, + "step": 1365 + }, + { + "epoch": 3.13, + "learning_rate": 6.466780778068903e-05, + "loss": 0.7104, + "step": 1366 + }, + { + "epoch": 3.13, + "learning_rate": 6.452882293071468e-05, + "loss": 0.7226, + "step": 1367 + }, + { + "epoch": 3.13, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8358, + "step": 1368 + }, + { + "epoch": 3.14, + "learning_rate": 6.42510885476504e-05, + "loss": 0.6752, + "step": 1369 + }, + { + "epoch": 3.14, + "learning_rate": 6.411233962792593e-05, + "loss": 0.7962, + "step": 1370 + }, + { + "epoch": 3.14, + "learning_rate": 6.397366996469343e-05, + "loss": 0.8052, + "step": 1371 + }, + { + "epoch": 3.14, + "learning_rate": 6.383507986419939e-05, + "loss": 0.9013, + "step": 1372 + }, + { + "epoch": 3.15, + "learning_rate": 6.369656963251467e-05, + "loss": 0.798, + "step": 1373 + }, + { + "epoch": 3.15, + "learning_rate": 6.355813957553364e-05, + "loss": 0.7121, + "step": 1374 + }, + { + "epoch": 3.15, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7275, + "step": 1375 + }, + { + "epoch": 3.15, + "learning_rate": 6.328152120837439e-05, + "loss": 0.7393, + "step": 1376 + }, + { + "epoch": 3.15, + "learning_rate": 6.314333350909701e-05, + "loss": 0.9145, + "step": 1377 + }, + { + "epoch": 3.16, + "learning_rate": 6.300522720632367e-05, + "loss": 0.8225, + "step": 1378 + }, + { + "epoch": 3.16, + "learning_rate": 6.286720260505668e-05, + "loss": 0.842, + "step": 1379 + }, + { + "epoch": 3.16, + "learning_rate": 6.2729260010118e-05, + "loss": 0.9227, + "step": 1380 + }, + { + "epoch": 3.16, + "learning_rate": 6.259139972614845e-05, + "loss": 0.8438, + "step": 1381 + }, + { + "epoch": 3.17, + "learning_rate": 6.245362205760704e-05, + "loss": 0.9213, + "step": 1382 + }, + { + "epoch": 3.17, + "learning_rate": 6.231592730877035e-05, + "loss": 0.7469, + "step": 1383 + }, + { + "epoch": 3.17, + "learning_rate": 6.217831578373185e-05, + "loss": 0.7289, + "step": 1384 + }, + { + 
"epoch": 3.17, + "learning_rate": 6.204078778640121e-05, + "loss": 0.8306, + "step": 1385 + }, + { + "epoch": 3.18, + "learning_rate": 6.190334362050365e-05, + "loss": 0.7807, + "step": 1386 + }, + { + "epoch": 3.18, + "learning_rate": 6.176598358957919e-05, + "loss": 0.7564, + "step": 1387 + }, + { + "epoch": 3.18, + "learning_rate": 6.162870799698209e-05, + "loss": 0.8306, + "step": 1388 + }, + { + "epoch": 3.18, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7317, + "step": 1389 + }, + { + "epoch": 3.18, + "learning_rate": 6.135441133925382e-05, + "loss": 0.8923, + "step": 1390 + }, + { + "epoch": 3.19, + "learning_rate": 6.121739087989613e-05, + "loss": 0.7723, + "step": 1391 + }, + { + "epoch": 3.19, + "learning_rate": 6.108045607041125e-05, + "loss": 0.796, + "step": 1392 + }, + { + "epoch": 3.19, + "learning_rate": 6.0943607213214425e-05, + "loss": 0.7907, + "step": 1393 + }, + { + "epoch": 3.19, + "learning_rate": 6.0806844610530956e-05, + "loss": 0.7709, + "step": 1394 + }, + { + "epoch": 3.2, + "learning_rate": 6.0670168564395705e-05, + "loss": 0.8841, + "step": 1395 + }, + { + "epoch": 3.2, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6325, + "step": 1396 + }, + { + "epoch": 3.2, + "learning_rate": 6.039707734895279e-05, + "loss": 0.8047, + "step": 1397 + }, + { + "epoch": 3.2, + "learning_rate": 6.0260662782756374e-05, + "loss": 0.7933, + "step": 1398 + }, + { + "epoch": 3.21, + "learning_rate": 6.012433597932936e-05, + "loss": 0.8016, + "step": 1399 + }, + { + "epoch": 3.21, + "learning_rate": 5.998809723974407e-05, + "loss": 0.8992, + "step": 1400 + }, + { + "epoch": 3.21, + "learning_rate": 5.985194686487854e-05, + "loss": 0.7384, + "step": 1401 + }, + { + "epoch": 3.21, + "learning_rate": 5.971588515541546e-05, + "loss": 0.7214, + "step": 1402 + }, + { + "epoch": 3.21, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7394, + "step": 1403 + }, + { + "epoch": 3.22, + "learning_rate": 5.94440289344481e-05, + "loss": 0.6268, + 
"step": 1404 + }, + { + "epoch": 3.22, + "learning_rate": 5.9308235023327604e-05, + "loss": 0.8049, + "step": 1405 + }, + { + "epoch": 3.22, + "learning_rate": 5.9172530978375894e-05, + "loss": 0.8396, + "step": 1406 + }, + { + "epoch": 3.22, + "learning_rate": 5.9036917099290026e-05, + "loss": 0.7694, + "step": 1407 + }, + { + "epoch": 3.23, + "learning_rate": 5.890139368556791e-05, + "loss": 0.7289, + "step": 1408 + }, + { + "epoch": 3.23, + "learning_rate": 5.8765961036507736e-05, + "loss": 0.7949, + "step": 1409 + }, + { + "epoch": 3.23, + "learning_rate": 5.863061945120719e-05, + "loss": 0.9371, + "step": 1410 + }, + { + "epoch": 3.23, + "learning_rate": 5.8495369228562894e-05, + "loss": 0.7323, + "step": 1411 + }, + { + "epoch": 3.23, + "learning_rate": 5.836021066726962e-05, + "loss": 0.8331, + "step": 1412 + }, + { + "epoch": 3.24, + "learning_rate": 5.8225144065819745e-05, + "loss": 0.768, + "step": 1413 + }, + { + "epoch": 3.24, + "learning_rate": 5.809016972250263e-05, + "loss": 0.7804, + "step": 1414 + }, + { + "epoch": 3.24, + "learning_rate": 5.795528793540379e-05, + "loss": 0.771, + "step": 1415 + }, + { + "epoch": 3.24, + "learning_rate": 5.782049900240432e-05, + "loss": 0.7431, + "step": 1416 + }, + { + "epoch": 3.25, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8618, + "step": 1417 + }, + { + "epoch": 3.25, + "learning_rate": 5.755120088920225e-05, + "loss": 0.7639, + "step": 1418 + }, + { + "epoch": 3.25, + "learning_rate": 5.7416692303733946e-05, + "loss": 0.8375, + "step": 1419 + }, + { + "epoch": 3.25, + "learning_rate": 5.728227776183244e-05, + "loss": 0.7409, + "step": 1420 + }, + { + "epoch": 3.26, + "learning_rate": 5.714795756034695e-05, + "loss": 0.7529, + "step": 1421 + }, + { + "epoch": 3.26, + "learning_rate": 5.701373199591835e-05, + "loss": 0.8878, + "step": 1422 + }, + { + "epoch": 3.26, + "learning_rate": 5.687960136497861e-05, + "loss": 0.6923, + "step": 1423 + }, + { + "epoch": 3.26, + "learning_rate": 
5.6745565963749925e-05, + "loss": 0.8628, + "step": 1424 + }, + { + "epoch": 3.26, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.6949, + "step": 1425 + }, + { + "epoch": 3.27, + "learning_rate": 5.6477782034262436e-05, + "loss": 0.7278, + "step": 1426 + }, + { + "epoch": 3.27, + "learning_rate": 5.634403409739402e-05, + "loss": 0.8781, + "step": 1427 + }, + { + "epoch": 3.27, + "learning_rate": 5.621038257301601e-05, + "loss": 0.7329, + "step": 1428 + }, + { + "epoch": 3.27, + "learning_rate": 5.6076827756292495e-05, + "loss": 0.7195, + "step": 1429 + }, + { + "epoch": 3.28, + "learning_rate": 5.594336994217415e-05, + "loss": 0.7283, + "step": 1430 + }, + { + "epoch": 3.28, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8064, + "step": 1431 + }, + { + "epoch": 3.28, + "learning_rate": 5.5676746500483336e-05, + "loss": 0.8488, + "step": 1432 + }, + { + "epoch": 3.28, + "learning_rate": 5.55435814617383e-05, + "loss": 0.8925, + "step": 1433 + }, + { + "epoch": 3.29, + "learning_rate": 5.5410514603251985e-05, + "loss": 0.7677, + "step": 1434 + }, + { + "epoch": 3.29, + "learning_rate": 5.5277546218897294e-05, + "loss": 0.8037, + "step": 1435 + }, + { + "epoch": 3.29, + "learning_rate": 5.514467660232965e-05, + "loss": 0.8046, + "step": 1436 + }, + { + "epoch": 3.29, + "learning_rate": 5.5011906046986473e-05, + "loss": 0.7885, + "step": 1437 + }, + { + "epoch": 3.29, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8264, + "step": 1438 + }, + { + "epoch": 3.3, + "learning_rate": 5.4746663292628234e-05, + "loss": 0.7551, + "step": 1439 + }, + { + "epoch": 3.3, + "learning_rate": 5.4614191679391444e-05, + "loss": 0.8766, + "step": 1440 + }, + { + "epoch": 3.3, + "learning_rate": 5.448182029893423e-05, + "loss": 0.8992, + "step": 1441 + }, + { + "epoch": 3.3, + "learning_rate": 5.434954944359365e-05, + "loss": 0.6505, + "step": 1442 + }, + { + "epoch": 3.31, + "learning_rate": 5.4217379405484636e-05, + "loss": 0.8743, + "step": 1443 + }, + { + 
"epoch": 3.31, + "learning_rate": 5.408531047649964e-05, + "loss": 0.6965, + "step": 1444 + }, + { + "epoch": 3.31, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7663, + "step": 1445 + }, + { + "epoch": 3.31, + "learning_rate": 5.382147711235377e-05, + "loss": 0.86, + "step": 1446 + }, + { + "epoch": 3.32, + "learning_rate": 5.3689713259858586e-05, + "loss": 0.7524, + "step": 1447 + }, + { + "epoch": 3.32, + "learning_rate": 5.355805168181738e-05, + "loss": 0.9115, + "step": 1448 + }, + { + "epoch": 3.32, + "learning_rate": 5.342649266899955e-05, + "loss": 0.8342, + "step": 1449 + }, + { + "epoch": 3.32, + "learning_rate": 5.329503651194805e-05, + "loss": 0.8447, + "step": 1450 + }, + { + "epoch": 3.32, + "learning_rate": 5.316368350097869e-05, + "loss": 0.7877, + "step": 1451 + }, + { + "epoch": 3.33, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7965, + "step": 1452 + }, + { + "epoch": 3.33, + "learning_rate": 5.290128807740976e-05, + "loss": 0.7844, + "step": 1453 + }, + { + "epoch": 3.33, + "learning_rate": 5.2770246244300224e-05, + "loss": 0.7405, + "step": 1454 + }, + { + "epoch": 3.33, + "learning_rate": 5.263930871625151e-05, + "loss": 0.7782, + "step": 1455 + }, + { + "epoch": 3.34, + "learning_rate": 5.2508475782434093e-05, + "loss": 0.7789, + "step": 1456 + }, + { + "epoch": 3.34, + "learning_rate": 5.237774773178734e-05, + "loss": 0.8943, + "step": 1457 + }, + { + "epoch": 3.34, + "learning_rate": 5.224712485301898e-05, + "loss": 0.7712, + "step": 1458 + }, + { + "epoch": 3.34, + "learning_rate": 5.211660743460458e-05, + "loss": 0.8608, + "step": 1459 + }, + { + "epoch": 3.34, + "learning_rate": 5.198619576478678e-05, + "loss": 0.7212, + "step": 1460 + }, + { + "epoch": 3.35, + "learning_rate": 5.1855890131574614e-05, + "loss": 0.7588, + "step": 1461 + }, + { + "epoch": 3.35, + "learning_rate": 5.17256908227429e-05, + "loss": 0.8001, + "step": 1462 + }, + { + "epoch": 3.35, + "learning_rate": 5.159559812583181e-05, + "loss": 0.8327, + 
"step": 1463 + }, + { + "epoch": 3.35, + "learning_rate": 5.146561232814593e-05, + "loss": 0.8874, + "step": 1464 + }, + { + "epoch": 3.36, + "learning_rate": 5.133573371675375e-05, + "loss": 0.6802, + "step": 1465 + }, + { + "epoch": 3.36, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7581, + "step": 1466 + }, + { + "epoch": 3.36, + "learning_rate": 5.1076299199940645e-05, + "loss": 0.8714, + "step": 1467 + }, + { + "epoch": 3.36, + "learning_rate": 5.094674386747067e-05, + "loss": 0.6667, + "step": 1468 + }, + { + "epoch": 3.37, + "learning_rate": 5.081729686719508e-05, + "loss": 0.8107, + "step": 1469 + }, + { + "epoch": 3.37, + "learning_rate": 5.068795848499257e-05, + "loss": 0.8891, + "step": 1470 + }, + { + "epoch": 3.37, + "learning_rate": 5.0558729006501846e-05, + "loss": 0.7259, + "step": 1471 + }, + { + "epoch": 3.37, + "learning_rate": 5.042960871712112e-05, + "loss": 0.8035, + "step": 1472 + }, + { + "epoch": 3.37, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7042, + "step": 1473 + }, + { + "epoch": 3.38, + "learning_rate": 5.0171696846076446e-05, + "loss": 0.7852, + "step": 1474 + }, + { + "epoch": 3.38, + "learning_rate": 5.004290583400075e-05, + "loss": 0.8489, + "step": 1475 + }, + { + "epoch": 3.38, + "learning_rate": 4.9914225150210335e-05, + "loss": 0.7696, + "step": 1476 + }, + { + "epoch": 3.38, + "learning_rate": 4.97856550788915e-05, + "loss": 0.7, + "step": 1477 + }, + { + "epoch": 3.39, + "learning_rate": 4.9657195903986185e-05, + "loss": 0.8373, + "step": 1478 + }, + { + "epoch": 3.39, + "learning_rate": 4.952884790919141e-05, + "loss": 0.8822, + "step": 1479 + }, + { + "epoch": 3.39, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7292, + "step": 1480 + }, + { + "epoch": 3.39, + "learning_rate": 4.927248659349355e-05, + "loss": 0.8165, + "step": 1481 + }, + { + "epoch": 3.4, + "learning_rate": 4.914447383875432e-05, + "loss": 0.7782, + "step": 1482 + }, + { + "epoch": 3.4, + "learning_rate": 4.901657339645226e-05, 
+ "loss": 0.8172, + "step": 1483 + }, + { + "epoch": 3.4, + "learning_rate": 4.888878554905051e-05, + "loss": 0.8072, + "step": 1484 + }, + { + "epoch": 3.4, + "learning_rate": 4.876111057876347e-05, + "loss": 0.7715, + "step": 1485 + }, + { + "epoch": 3.4, + "learning_rate": 4.863354876755637e-05, + "loss": 0.7384, + "step": 1486 + }, + { + "epoch": 3.41, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7881, + "step": 1487 + }, + { + "epoch": 3.41, + "learning_rate": 4.837876574899237e-05, + "loss": 0.7962, + "step": 1488 + }, + { + "epoch": 3.41, + "learning_rate": 4.8251545104313836e-05, + "loss": 0.5635, + "step": 1489 + }, + { + "epoch": 3.41, + "learning_rate": 4.812443874407059e-05, + "loss": 0.7454, + "step": 1490 + }, + { + "epoch": 3.42, + "learning_rate": 4.7997446948972015e-05, + "loss": 0.8505, + "step": 1491 + }, + { + "epoch": 3.42, + "learning_rate": 4.787056999947455e-05, + "loss": 0.6157, + "step": 1492 + }, + { + "epoch": 3.42, + "learning_rate": 4.774380817578101e-05, + "loss": 0.7731, + "step": 1493 + }, + { + "epoch": 3.42, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8062, + "step": 1494 + }, + { + "epoch": 3.42, + "learning_rate": 4.74906310253448e-05, + "loss": 0.7027, + "step": 1495 + }, + { + "epoch": 3.43, + "learning_rate": 4.736421625773396e-05, + "loss": 0.7, + "step": 1496 + }, + { + "epoch": 3.43, + "learning_rate": 4.723791773418942e-05, + "loss": 0.7822, + "step": 1497 + }, + { + "epoch": 3.43, + "learning_rate": 4.7111735733636466e-05, + "loss": 0.6308, + "step": 1498 + }, + { + "epoch": 3.43, + "learning_rate": 4.698567053474315e-05, + "loss": 0.6722, + "step": 1499 + }, + { + "epoch": 3.44, + "learning_rate": 4.685972241591956e-05, + "loss": 0.749, + "step": 1500 + }, + { + "epoch": 3.44, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7784, + "step": 1501 + }, + { + "epoch": 3.44, + "learning_rate": 4.6608178530828174e-05, + "loss": 0.7971, + "step": 1502 + }, + { + "epoch": 3.44, + "learning_rate": 
4.648258332008523e-05, + "loss": 0.8398, + "step": 1503 + }, + { + "epoch": 3.45, + "learning_rate": 4.6357106300460374e-05, + "loss": 0.6559, + "step": 1504 + }, + { + "epoch": 3.45, + "learning_rate": 4.6231747749064644e-05, + "loss": 0.7837, + "step": 1505 + }, + { + "epoch": 3.45, + "learning_rate": 4.610650794274759e-05, + "loss": 0.8072, + "step": 1506 + }, + { + "epoch": 3.45, + "learning_rate": 4.598138715809633e-05, + "loss": 0.7441, + "step": 1507 + }, + { + "epoch": 3.45, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8233, + "step": 1508 + }, + { + "epoch": 3.46, + "learning_rate": 4.573150375882527e-05, + "loss": 0.8868, + "step": 1509 + }, + { + "epoch": 3.46, + "learning_rate": 4.560674169606317e-05, + "loss": 0.7059, + "step": 1510 + }, + { + "epoch": 3.46, + "learning_rate": 4.548209975868108e-05, + "loss": 0.8349, + "step": 1511 + }, + { + "epoch": 3.46, + "learning_rate": 4.5357578221945794e-05, + "loss": 0.817, + "step": 1512 + }, + { + "epoch": 3.47, + "learning_rate": 4.523317736085831e-05, + "loss": 0.7375, + "step": 1513 + }, + { + "epoch": 3.47, + "learning_rate": 4.5108897450153054e-05, + "loss": 0.8338, + "step": 1514 + }, + { + "epoch": 3.47, + "learning_rate": 4.498473876429726e-05, + "loss": 0.9212, + "step": 1515 + }, + { + "epoch": 3.47, + "learning_rate": 4.4860701577490595e-05, + "loss": 0.7182, + "step": 1516 + }, + { + "epoch": 3.48, + "learning_rate": 4.473678616366433e-05, + "loss": 0.8677, + "step": 1517 + }, + { + "epoch": 3.48, + "learning_rate": 4.461299279648077e-05, + "loss": 0.7868, + "step": 1518 + }, + { + "epoch": 3.48, + "learning_rate": 4.4489321749332744e-05, + "loss": 0.7078, + "step": 1519 + }, + { + "epoch": 3.48, + "learning_rate": 4.436577329534291e-05, + "loss": 0.6872, + "step": 1520 + }, + { + "epoch": 3.48, + "learning_rate": 4.424234770736314e-05, + "loss": 0.7523, + "step": 1521 + }, + { + "epoch": 3.49, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7107, + "step": 1522 + }, + { + "epoch": 
3.49, + "learning_rate": 4.3995866219484326e-05, + "loss": 0.8932, + "step": 1523 + }, + { + "epoch": 3.49, + "learning_rate": 4.387281086392994e-05, + "loss": 0.7811, + "step": 1524 + }, + { + "epoch": 3.49, + "learning_rate": 4.374987946307385e-05, + "loss": 0.8946, + "step": 1525 + }, + { + "epoch": 3.5, + "learning_rate": 4.362707228840531e-05, + "loss": 0.8496, + "step": 1526 + }, + { + "epoch": 3.5, + "learning_rate": 4.350438961113911e-05, + "loss": 0.6998, + "step": 1527 + }, + { + "epoch": 3.5, + "learning_rate": 4.3381831702215084e-05, + "loss": 0.6792, + "step": 1528 + }, + { + "epoch": 3.5, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7644, + "step": 1529 + }, + { + "epoch": 3.51, + "learning_rate": 4.3137091271775e-05, + "loss": 0.6055, + "step": 1530 + }, + { + "epoch": 3.51, + "learning_rate": 4.301490929075852e-05, + "loss": 0.7126, + "step": 1531 + }, + { + "epoch": 3.51, + "learning_rate": 4.289285315908237e-05, + "loss": 0.7635, + "step": 1532 + }, + { + "epoch": 3.51, + "learning_rate": 4.277092314630278e-05, + "loss": 0.9089, + "step": 1533 + }, + { + "epoch": 3.51, + "learning_rate": 4.264911952169735e-05, + "loss": 0.7267, + "step": 1534 + }, + { + "epoch": 3.52, + "learning_rate": 4.2527442554264605e-05, + "loss": 0.6774, + "step": 1535 + }, + { + "epoch": 3.52, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8402, + "step": 1536 + }, + { + "epoch": 3.52, + "learning_rate": 4.228446966551226e-05, + "loss": 0.8603, + "step": 1537 + }, + { + "epoch": 3.52, + "learning_rate": 4.2163174280788697e-05, + "loss": 0.6459, + "step": 1538 + }, + { + "epoch": 3.53, + "learning_rate": 4.2042006626428906e-05, + "loss": 0.7192, + "step": 1539 + }, + { + "epoch": 3.53, + "learning_rate": 4.192096697002686e-05, + "loss": 0.8621, + "step": 1540 + }, + { + "epoch": 3.53, + "learning_rate": 4.1800055578893883e-05, + "loss": 0.8194, + "step": 1541 + }, + { + "epoch": 3.53, + "learning_rate": 4.167927272005805e-05, + "loss": 0.8702, + "step": 1542 
+ }, + { + "epoch": 3.53, + "learning_rate": 4.155861866026364e-05, + "loss": 0.8677, + "step": 1543 + }, + { + "epoch": 3.54, + "learning_rate": 4.143809366597037e-05, + "loss": 0.7971, + "step": 1544 + }, + { + "epoch": 3.54, + "learning_rate": 4.131769800335292e-05, + "loss": 0.7896, + "step": 1545 + }, + { + "epoch": 3.54, + "learning_rate": 4.119743193830048e-05, + "loss": 0.889, + "step": 1546 + }, + { + "epoch": 3.54, + "learning_rate": 4.10772957364159e-05, + "loss": 0.7497, + "step": 1547 + }, + { + "epoch": 3.55, + "learning_rate": 4.0957289663015255e-05, + "loss": 0.9096, + "step": 1548 + }, + { + "epoch": 3.55, + "learning_rate": 4.083741398312727e-05, + "loss": 0.8658, + "step": 1549 + }, + { + "epoch": 3.55, + "learning_rate": 4.071766896149273e-05, + "loss": 0.5634, + "step": 1550 + }, + { + "epoch": 3.55, + "learning_rate": 4.059805486256376e-05, + "loss": 0.6693, + "step": 1551 + }, + { + "epoch": 3.56, + "learning_rate": 4.0478571950503486e-05, + "loss": 0.7128, + "step": 1552 + }, + { + "epoch": 3.56, + "learning_rate": 4.035922048918519e-05, + "loss": 0.7838, + "step": 1553 + }, + { + "epoch": 3.56, + "learning_rate": 4.024000074219187e-05, + "loss": 0.9549, + "step": 1554 + }, + { + "epoch": 3.56, + "learning_rate": 4.012091297281574e-05, + "loss": 0.6245, + "step": 1555 + }, + { + "epoch": 3.56, + "learning_rate": 4.0001957444057426e-05, + "loss": 0.7671, + "step": 1556 + }, + { + "epoch": 3.57, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6645, + "step": 1557 + }, + { + "epoch": 3.57, + "learning_rate": 3.976444415893608e-05, + "loss": 0.8291, + "step": 1558 + }, + { + "epoch": 3.57, + "learning_rate": 3.96458869271119e-05, + "loss": 0.8715, + "step": 1559 + }, + { + "epoch": 3.57, + "learning_rate": 3.952746298498195e-05, + "loss": 0.8423, + "step": 1560 + }, + { + "epoch": 3.58, + "learning_rate": 3.940917259408085e-05, + "loss": 0.8303, + "step": 1561 + }, + { + "epoch": 3.58, + "learning_rate": 3.929101601564834e-05, + "loss": 
0.7876, + "step": 1562 + }, + { + "epoch": 3.58, + "learning_rate": 3.9172993510628574e-05, + "loss": 0.7409, + "step": 1563 + }, + { + "epoch": 3.58, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8988, + "step": 1564 + }, + { + "epoch": 3.59, + "learning_rate": 3.8937351763122845e-05, + "loss": 1.0367, + "step": 1565 + }, + { + "epoch": 3.59, + "learning_rate": 3.8819733041042515e-05, + "loss": 0.682, + "step": 1566 + }, + { + "epoch": 3.59, + "learning_rate": 3.870224943318491e-05, + "loss": 0.815, + "step": 1567 + }, + { + "epoch": 3.59, + "learning_rate": 3.858490119900794e-05, + "loss": 0.6516, + "step": 1568 + }, + { + "epoch": 3.59, + "learning_rate": 3.846768859767066e-05, + "loss": 0.7371, + "step": 1569 + }, + { + "epoch": 3.6, + "learning_rate": 3.8350611888032474e-05, + "loss": 0.7401, + "step": 1570 + }, + { + "epoch": 3.6, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7305, + "step": 1571 + }, + { + "epoch": 3.6, + "learning_rate": 3.8116867177789936e-05, + "loss": 0.7422, + "step": 1572 + }, + { + "epoch": 3.6, + "learning_rate": 3.8000199693401675e-05, + "loss": 0.7621, + "step": 1573 + }, + { + "epoch": 3.61, + "learning_rate": 3.788366913314339e-05, + "loss": 0.935, + "step": 1574 + }, + { + "epoch": 3.61, + "learning_rate": 3.776727575436829e-05, + "loss": 0.7587, + "step": 1575 + }, + { + "epoch": 3.61, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.9029, + "step": 1576 + }, + { + "epoch": 3.61, + "learning_rate": 3.753490156916511e-05, + "loss": 0.8324, + "step": 1577 + }, + { + "epoch": 3.62, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7316, + "step": 1578 + }, + { + "epoch": 3.62, + "learning_rate": 3.730307919054803e-05, + "loss": 0.684, + "step": 1579 + }, + { + "epoch": 3.62, + "learning_rate": 3.718737556886316e-05, + "loss": 0.7547, + "step": 1580 + }, + { + "epoch": 3.62, + "learning_rate": 3.7071810666398496e-05, + "loss": 0.8581, + "step": 1581 + }, + { + "epoch": 3.62, + "learning_rate": 
3.695638473837466e-05, + "loss": 0.7707, + "step": 1582 + }, + { + "epoch": 3.63, + "learning_rate": 3.684109803970531e-05, + "loss": 0.755, + "step": 1583 + }, + { + "epoch": 3.63, + "learning_rate": 3.6725950824996535e-05, + "loss": 0.8436, + "step": 1584 + }, + { + "epoch": 3.63, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7491, + "step": 1585 + }, + { + "epoch": 3.63, + "learning_rate": 3.649607586434474e-05, + "loss": 0.6946, + "step": 1586 + }, + { + "epoch": 3.64, + "learning_rate": 3.6381348626071475e-05, + "loss": 0.7697, + "step": 1587 + }, + { + "epoch": 3.64, + "learning_rate": 3.626676188709743e-05, + "loss": 0.8108, + "step": 1588 + }, + { + "epoch": 3.64, + "learning_rate": 3.6152315900482905e-05, + "loss": 0.7676, + "step": 1589 + }, + { + "epoch": 3.64, + "learning_rate": 3.603801091897731e-05, + "loss": 0.8506, + "step": 1590 + }, + { + "epoch": 3.64, + "learning_rate": 3.592384719501878e-05, + "loss": 0.7521, + "step": 1591 + }, + { + "epoch": 3.65, + "learning_rate": 3.580982498073344e-05, + "loss": 0.8371, + "step": 1592 + }, + { + "epoch": 3.65, + "learning_rate": 3.5695944527934865e-05, + "loss": 0.816, + "step": 1593 + }, + { + "epoch": 3.65, + "learning_rate": 3.5582206088123535e-05, + "loss": 0.7097, + "step": 1594 + }, + { + "epoch": 3.65, + "learning_rate": 3.546860991248641e-05, + "loss": 0.7147, + "step": 1595 + }, + { + "epoch": 3.66, + "learning_rate": 3.5355156251896136e-05, + "loss": 0.7807, + "step": 1596 + }, + { + "epoch": 3.66, + "learning_rate": 3.524184535691068e-05, + "loss": 0.8517, + "step": 1597 + }, + { + "epoch": 3.66, + "learning_rate": 3.5128677477772734e-05, + "loss": 0.8549, + "step": 1598 + }, + { + "epoch": 3.66, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7514, + "step": 1599 + }, + { + "epoch": 3.67, + "learning_rate": 3.490277176643033e-05, + "loss": 0.8055, + "step": 1600 + }, + { + "epoch": 3.67, + "learning_rate": 3.4790034433129725e-05, + "loss": 0.5494, + "step": 1601 + }, + { + 
"epoch": 3.67, + "learning_rate": 3.467744111348338e-05, + "loss": 0.9018, + "step": 1602 + }, + { + "epoch": 3.67, + "learning_rate": 3.4564992056149214e-05, + "loss": 0.7319, + "step": 1603 + }, + { + "epoch": 3.67, + "learning_rate": 3.445268750946651e-05, + "loss": 0.8997, + "step": 1604 + }, + { + "epoch": 3.68, + "learning_rate": 3.434052772145554e-05, + "loss": 0.7977, + "step": 1605 + }, + { + "epoch": 3.68, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7205, + "step": 1606 + }, + { + "epoch": 3.68, + "learning_rate": 3.411664341193041e-05, + "loss": 0.848, + "step": 1607 + }, + { + "epoch": 3.68, + "learning_rate": 3.400491938485596e-05, + "loss": 0.7864, + "step": 1608 + }, + { + "epoch": 3.69, + "learning_rate": 3.389334110533161e-05, + "loss": 0.7184, + "step": 1609 + }, + { + "epoch": 3.69, + "learning_rate": 3.378190881977359e-05, + "loss": 0.8362, + "step": 1610 + }, + { + "epoch": 3.69, + "learning_rate": 3.367062277427567e-05, + "loss": 0.6743, + "step": 1611 + }, + { + "epoch": 3.69, + "learning_rate": 3.3559483214608824e-05, + "loss": 0.7561, + "step": 1612 + }, + { + "epoch": 3.7, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7342, + "step": 1613 + }, + { + "epoch": 3.7, + "learning_rate": 3.333764453423357e-05, + "loss": 0.7918, + "step": 1614 + }, + { + "epoch": 3.7, + "learning_rate": 3.322694590344719e-05, + "loss": 0.75, + "step": 1615 + }, + { + "epoch": 3.7, + "learning_rate": 3.3116394738334866e-05, + "loss": 0.7874, + "step": 1616 + }, + { + "epoch": 3.7, + "learning_rate": 3.300599128304443e-05, + "loss": 0.7555, + "step": 1617 + }, + { + "epoch": 3.71, + "learning_rate": 3.2895735781397685e-05, + "loss": 0.8434, + "step": 1618 + }, + { + "epoch": 3.71, + "learning_rate": 3.278562847688951e-05, + "loss": 0.8756, + "step": 1619 + }, + { + "epoch": 3.71, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8765, + "step": 1620 + }, + { + "epoch": 3.71, + "learning_rate": 3.256585943163176e-05, + "loss": 0.8501, + 
"step": 1621 + }, + { + "epoch": 3.72, + "learning_rate": 3.2456198176233543e-05, + "loss": 1.0232, + "step": 1622 + }, + { + "epoch": 3.72, + "learning_rate": 3.234668608867547e-05, + "loss": 0.7117, + "step": 1623 + }, + { + "epoch": 3.72, + "learning_rate": 3.2237323410810715e-05, + "loss": 0.9795, + "step": 1624 + }, + { + "epoch": 3.72, + "learning_rate": 3.212811038416251e-05, + "loss": 0.887, + "step": 1625 + }, + { + "epoch": 3.73, + "learning_rate": 3.201904724992352e-05, + "loss": 0.7008, + "step": 1626 + }, + { + "epoch": 3.73, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7542, + "step": 1627 + }, + { + "epoch": 3.73, + "learning_rate": 3.18013716217882e-05, + "loss": 0.871, + "step": 1628 + }, + { + "epoch": 3.73, + "learning_rate": 3.1692759608620004e-05, + "loss": 0.7761, + "step": 1629 + }, + { + "epoch": 3.73, + "learning_rate": 3.158429844931611e-05, + "loss": 0.842, + "step": 1630 + }, + { + "epoch": 3.74, + "learning_rate": 3.1475988383408774e-05, + "loss": 0.8322, + "step": 1631 + }, + { + "epoch": 3.74, + "learning_rate": 3.136782965009658e-05, + "loss": 0.7911, + "step": 1632 + }, + { + "epoch": 3.74, + "learning_rate": 3.1259822488243806e-05, + "loss": 0.8911, + "step": 1633 + }, + { + "epoch": 3.74, + "learning_rate": 3.115196713638e-05, + "loss": 0.9232, + "step": 1634 + }, + { + "epoch": 3.75, + "learning_rate": 3.104426383269957e-05, + "loss": 0.8265, + "step": 1635 + }, + { + "epoch": 3.75, + "learning_rate": 3.093671281506099e-05, + "loss": 0.7861, + "step": 1636 + }, + { + "epoch": 3.75, + "learning_rate": 3.0829314320986433e-05, + "loss": 0.6548, + "step": 1637 + }, + { + "epoch": 3.75, + "learning_rate": 3.072206858766134e-05, + "loss": 0.7974, + "step": 1638 + }, + { + "epoch": 3.75, + "learning_rate": 3.061497585193369e-05, + "loss": 0.849, + "step": 1639 + }, + { + "epoch": 3.76, + "learning_rate": 3.050803635031355e-05, + "loss": 0.7438, + "step": 1640 + }, + { + "epoch": 3.76, + "learning_rate": 3.040125031897264e-05, + 
"loss": 0.838, + "step": 1641 + }, + { + "epoch": 3.76, + "learning_rate": 3.029461799374378e-05, + "loss": 0.8879, + "step": 1642 + }, + { + "epoch": 3.76, + "learning_rate": 3.0188139610120248e-05, + "loss": 0.7747, + "step": 1643 + }, + { + "epoch": 3.77, + "learning_rate": 3.0081815403255332e-05, + "loss": 0.7179, + "step": 1644 + }, + { + "epoch": 3.77, + "learning_rate": 2.9975645607961955e-05, + "loss": 0.7618, + "step": 1645 + }, + { + "epoch": 3.77, + "learning_rate": 2.9869630458711927e-05, + "loss": 0.6977, + "step": 1646 + }, + { + "epoch": 3.77, + "learning_rate": 2.9763770189635497e-05, + "loss": 0.8052, + "step": 1647 + }, + { + "epoch": 3.78, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.728, + "step": 1648 + }, + { + "epoch": 3.78, + "learning_rate": 2.955251522681408e-05, + "loss": 0.8593, + "step": 1649 + }, + { + "epoch": 3.78, + "learning_rate": 2.944712099961736e-05, + "loss": 0.8347, + "step": 1650 + }, + { + "epoch": 3.78, + "learning_rate": 2.9341882585689905e-05, + "loss": 0.733, + "step": 1651 + }, + { + "epoch": 3.78, + "learning_rate": 2.9236800217446593e-05, + "loss": 0.6998, + "step": 1652 + }, + { + "epoch": 3.79, + "learning_rate": 2.9131874126957727e-05, + "loss": 0.8923, + "step": 1653 + }, + { + "epoch": 3.79, + "learning_rate": 2.9027104545948414e-05, + "loss": 0.5376, + "step": 1654 + }, + { + "epoch": 3.79, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7465, + "step": 1655 + }, + { + "epoch": 3.79, + "learning_rate": 2.8818035837540537e-05, + "loss": 0.7833, + "step": 1656 + }, + { + "epoch": 3.8, + "learning_rate": 2.8713737171861986e-05, + "loss": 0.7611, + "step": 1657 + }, + { + "epoch": 3.8, + "learning_rate": 2.8609595939102153e-05, + "loss": 0.7226, + "step": 1658 + }, + { + "epoch": 3.8, + "learning_rate": 2.8505612369252832e-05, + "loss": 0.8847, + "step": 1659 + }, + { + "epoch": 3.8, + "learning_rate": 2.840178669195763e-05, + "loss": 0.7511, + "step": 1660 + }, + { + "epoch": 3.81, + 
"learning_rate": 2.8298119136511558e-05, + "loss": 0.6833, + "step": 1661 + }, + { + "epoch": 3.81, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7595, + "step": 1662 + }, + { + "epoch": 3.81, + "learning_rate": 2.8091259306599904e-05, + "loss": 0.7486, + "step": 1663 + }, + { + "epoch": 3.81, + "learning_rate": 2.7988067488976156e-05, + "loss": 0.8106, + "step": 1664 + }, + { + "epoch": 3.81, + "learning_rate": 2.7885034706884185e-05, + "loss": 0.8012, + "step": 1665 + }, + { + "epoch": 3.82, + "learning_rate": 2.7782161187867818e-05, + "loss": 0.7598, + "step": 1666 + }, + { + "epoch": 3.82, + "learning_rate": 2.7679447159119164e-05, + "loss": 0.6638, + "step": 1667 + }, + { + "epoch": 3.82, + "learning_rate": 2.7576892847478207e-05, + "loss": 0.6576, + "step": 1668 + }, + { + "epoch": 3.82, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8174, + "step": 1669 + }, + { + "epoch": 3.83, + "learning_rate": 2.737226428111471e-05, + "loss": 0.868, + "step": 1670 + }, + { + "epoch": 3.83, + "learning_rate": 2.7270190478306378e-05, + "loss": 0.6411, + "step": 1671 + }, + { + "epoch": 3.83, + "learning_rate": 2.7168277296433053e-05, + "loss": 0.7872, + "step": 1672 + }, + { + "epoch": 3.83, + "learning_rate": 2.7066524960565965e-05, + "loss": 0.7556, + "step": 1673 + }, + { + "epoch": 3.84, + "learning_rate": 2.6964933695421192e-05, + "loss": 0.8606, + "step": 1674 + }, + { + "epoch": 3.84, + "learning_rate": 2.6863503725359107e-05, + "loss": 0.7776, + "step": 1675 + }, + { + "epoch": 3.84, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7095, + "step": 1676 + }, + { + "epoch": 3.84, + "learning_rate": 2.666112856614259e-05, + "loss": 0.8587, + "step": 1677 + }, + { + "epoch": 3.84, + "learning_rate": 2.65601838239258e-05, + "loss": 0.8568, + "step": 1678 + }, + { + "epoch": 3.85, + "learning_rate": 2.6459401270665894e-05, + "loss": 0.7725, + "step": 1679 + }, + { + "epoch": 3.85, + "learning_rate": 2.6358781128937172e-05, + "loss": 0.8665, + 
"step": 1680 + }, + { + "epoch": 3.85, + "learning_rate": 2.625832362095528e-05, + "loss": 0.8286, + "step": 1681 + }, + { + "epoch": 3.85, + "learning_rate": 2.6158028968576743e-05, + "loss": 0.9445, + "step": 1682 + }, + { + "epoch": 3.86, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7562, + "step": 1683 + }, + { + "epoch": 3.86, + "learning_rate": 2.5957929116256675e-05, + "loss": 0.8086, + "step": 1684 + }, + { + "epoch": 3.86, + "learning_rate": 2.5858124358227853e-05, + "loss": 0.8513, + "step": 1685 + }, + { + "epoch": 3.86, + "learning_rate": 2.5758483339626738e-05, + "loss": 0.7107, + "step": 1686 + }, + { + "epoch": 3.86, + "learning_rate": 2.565900628050659e-05, + "loss": 0.7926, + "step": 1687 + }, + { + "epoch": 3.87, + "learning_rate": 2.5559693400558658e-05, + "loss": 0.7839, + "step": 1688 + }, + { + "epoch": 3.87, + "learning_rate": 2.546054491911147e-05, + "loss": 0.8132, + "step": 1689 + }, + { + "epoch": 3.87, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6755, + "step": 1690 + }, + { + "epoch": 3.87, + "learning_rate": 2.52627420272181e-05, + "loss": 0.7823, + "step": 1691 + }, + { + "epoch": 3.88, + "learning_rate": 2.5164088053611845e-05, + "loss": 0.8078, + "step": 1692 + }, + { + "epoch": 3.88, + "learning_rate": 2.5065599352185254e-05, + "loss": 0.7328, + "step": 1693 + }, + { + "epoch": 3.88, + "learning_rate": 2.4967276140446826e-05, + "loss": 0.9089, + "step": 1694 + }, + { + "epoch": 3.88, + "learning_rate": 2.48691186355395e-05, + "loss": 0.7683, + "step": 1695 + }, + { + "epoch": 3.89, + "learning_rate": 2.477112705424024e-05, + "loss": 0.7681, + "step": 1696 + }, + { + "epoch": 3.89, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.8331, + "step": 1697 + }, + { + "epoch": 3.89, + "learning_rate": 2.4575642527741415e-05, + "loss": 0.7678, + "step": 1698 + }, + { + "epoch": 3.89, + "learning_rate": 2.447815001426177e-05, + "loss": 0.7815, + "step": 1699 + }, + { + "epoch": 3.89, + "learning_rate": 
2.4380824287829074e-05, + "loss": 0.9155, + "step": 1700 + }, + { + "epoch": 3.9, + "learning_rate": 2.428366556338344e-05, + "loss": 0.7475, + "step": 1701 + }, + { + "epoch": 3.9, + "learning_rate": 2.4186674055496083e-05, + "loss": 0.6909, + "step": 1702 + }, + { + "epoch": 3.9, + "learning_rate": 2.4089849978368918e-05, + "loss": 0.7278, + "step": 1703 + }, + { + "epoch": 3.9, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8053, + "step": 1704 + }, + { + "epoch": 3.91, + "learning_rate": 2.389670497135379e-05, + "loss": 0.6703, + "step": 1705 + }, + { + "epoch": 3.91, + "learning_rate": 2.3800384468018954e-05, + "loss": 0.7334, + "step": 1706 + }, + { + "epoch": 3.91, + "learning_rate": 2.370423224854975e-05, + "loss": 0.7021, + "step": 1707 + }, + { + "epoch": 3.91, + "learning_rate": 2.3608248525294628e-05, + "loss": 0.7711, + "step": 1708 + }, + { + "epoch": 3.92, + "learning_rate": 2.3512433510229858e-05, + "loss": 0.8555, + "step": 1709 + }, + { + "epoch": 3.92, + "learning_rate": 2.3416787414959097e-05, + "loss": 0.7019, + "step": 1710 + }, + { + "epoch": 3.92, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.9331, + "step": 1711 + }, + { + "epoch": 3.92, + "learning_rate": 2.322600282834888e-05, + "loss": 0.7915, + "step": 1712 + }, + { + "epoch": 3.92, + "learning_rate": 2.3130864758349645e-05, + "loss": 0.8168, + "step": 1713 + }, + { + "epoch": 3.93, + "learning_rate": 2.303589645082411e-05, + "loss": 0.7711, + "step": 1714 + }, + { + "epoch": 3.93, + "learning_rate": 2.2941098115506065e-05, + "loss": 0.7319, + "step": 1715 + }, + { + "epoch": 3.93, + "learning_rate": 2.2846469961753915e-05, + "loss": 0.7473, + "step": 1716 + }, + { + "epoch": 3.93, + "learning_rate": 2.27520121985502e-05, + "loss": 0.7365, + "step": 1717 + }, + { + "epoch": 3.94, + "learning_rate": 2.265772503450122e-05, + "loss": 0.9078, + "step": 1718 + }, + { + "epoch": 3.94, + "learning_rate": 2.256360867783648e-05, + "loss": 0.6878, + "step": 1719 + }, + { + 
"epoch": 3.94, + "learning_rate": 2.246966333640823e-05, + "loss": 0.7913, + "step": 1720 + }, + { + "epoch": 3.94, + "learning_rate": 2.2375889217691137e-05, + "loss": 0.8684, + "step": 1721 + }, + { + "epoch": 3.95, + "learning_rate": 2.2282286528781605e-05, + "loss": 0.7516, + "step": 1722 + }, + { + "epoch": 3.95, + "learning_rate": 2.218885547639754e-05, + "loss": 0.787, + "step": 1723 + }, + { + "epoch": 3.95, + "learning_rate": 2.2095596266877782e-05, + "loss": 0.801, + "step": 1724 + }, + { + "epoch": 3.95, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.8423, + "step": 1725 + }, + { + "epoch": 3.95, + "learning_rate": 2.1909594199888372e-05, + "loss": 0.6984, + "step": 1726 + }, + { + "epoch": 3.96, + "learning_rate": 2.181685175319702e-05, + "loss": 0.7593, + "step": 1727 + }, + { + "epoch": 3.96, + "learning_rate": 2.172428197092561e-05, + "loss": 0.7661, + "step": 1728 + }, + { + "epoch": 3.96, + "learning_rate": 2.1631885057510838e-05, + "loss": 0.8231, + "step": 1729 + }, + { + "epoch": 3.96, + "learning_rate": 2.153966121700769e-05, + "loss": 0.7426, + "step": 1730 + }, + { + "epoch": 3.97, + "learning_rate": 2.1447610653088947e-05, + "loss": 0.7836, + "step": 1731 + }, + { + "epoch": 3.97, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.9467, + "step": 1732 + }, + { + "epoch": 3.97, + "learning_rate": 2.126403016778168e-05, + "loss": 0.8632, + "step": 1733 + }, + { + "epoch": 3.97, + "learning_rate": 2.117250065182349e-05, + "loss": 0.8532, + "step": 1734 + }, + { + "epoch": 3.97, + "learning_rate": 2.1081145223309395e-05, + "loss": 0.769, + "step": 1735 + }, + { + "epoch": 3.98, + "learning_rate": 2.0989964083994252e-05, + "loss": 0.6967, + "step": 1736 + }, + { + "epoch": 3.98, + "learning_rate": 2.08989574352481e-05, + "loss": 0.7737, + "step": 1737 + }, + { + "epoch": 3.98, + "learning_rate": 2.0808125478055505e-05, + "loss": 0.5646, + "step": 1738 + }, + { + "epoch": 3.98, + "learning_rate": 2.0717468413015283e-05, + "loss": 
0.7515, + "step": 1739 + }, + { + "epoch": 3.99, + "learning_rate": 2.0626986440340035e-05, + "loss": 0.718, + "step": 1740 + }, + { + "epoch": 3.99, + "learning_rate": 2.053667975985567e-05, + "loss": 0.8102, + "step": 1741 + }, + { + "epoch": 3.99, + "learning_rate": 2.0446548571000935e-05, + "loss": 0.8485, + "step": 1742 + }, + { + "epoch": 3.99, + "learning_rate": 2.035659307282699e-05, + "loss": 0.7086, + "step": 1743 + }, + { + "epoch": 4.0, + "learning_rate": 2.0266813463997092e-05, + "loss": 0.7731, + "step": 1744 + }, + { + "epoch": 4.0, + "learning_rate": 2.0177209942785958e-05, + "loss": 0.5973, + "step": 1745 + }, + { + "epoch": 4.0, + "learning_rate": 2.008778270707944e-05, + "loss": 0.8096, + "step": 1746 + } + ], + "max_steps": 2180, + "num_train_epochs": 5, + "total_flos": 472941381419008.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1746/training_args.bin b/checkpoint-1746/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4aa0907a784d65549a9c45257c4d455176479607 --- /dev/null +++ b/checkpoint-1746/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adff180a74f6fc1e6a420417eadde6ef8ff75561e442f481bfe772c93f46e2ae +size 6011 diff --git a/checkpoint-1746/zero_to_fp32.py b/checkpoint-1746/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-1746/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . 
pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise 
FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # 
immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def 
_zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + 
[full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + 
print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in 
param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + 
state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-2180/README.md b/checkpoint-2180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/checkpoint-2180/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-2180/adapter_config.json b/checkpoint-2180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a764b53e4dc8b17af932aa1de32ced6a340469f0 --- /dev/null +++ b/checkpoint-2180/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-2180/adapter_model.bin b/checkpoint-2180/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a7e71401cb8ba470a733f2631f8f2f3db14ef8ac --- /dev/null +++ b/checkpoint-2180/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738e184ba011761c1745a66f340bf5dab629f42365e7de66fb09d8cf1e23c4c7 +size 639786637 diff --git a/checkpoint-2180/global_step2180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..f3f3f2724f43738b33ac871fa61423037682e44c --- /dev/null +++ b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c6e751077958b0786037d8a83ed63170fdc83f51c313740737c650e5444173 +size 1022391865 diff --git a/checkpoint-2180/global_step2180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af1aa7fd75049b455e414c86caab0a36d217ad62 --- /dev/null +++ b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025360786572288ab0c095978fe7eec6d98effe1c139f5df92c1d9fc28abb9df +size 1022391865 diff --git a/checkpoint-2180/global_step2180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3a379150e34a74d894e6cf8205d1744c2551051 --- /dev/null +++ b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:185e3cf7fe448ee9eecfcbebc566b70517e6010f5bae65b4b5d951bef2787e1d +size 1022391865 diff --git a/checkpoint-2180/global_step2180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a98c3baac4a77c4413384582cab0d74737d29eba --- /dev/null +++ b/checkpoint-2180/global_step2180/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:310df2d45955ec3b938bc8bd1d680c479ef4e87a8c7c1ff526fe9b4e74f9d543 +size 1022391865 diff --git 
a/checkpoint-2180/global_step2180/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-2180/global_step2180/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54399d7ceaaa8240085f36f6addcbd2f4c3191dc --- /dev/null +++ b/checkpoint-2180/global_step2180/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ac793bd5f752e5f304d998a2515569249aa04d57ca0303f9a9cafd187a61c0 +size 3521982567 diff --git a/checkpoint-2180/global_step2180/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-2180/global_step2180/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e2ac0d972afc3055baf313190645c4ab0315098 --- /dev/null +++ b/checkpoint-2180/global_step2180/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380b357fb091f3a200d2bac788cd260b6e064d54df6a17c21c67bebea3e464c8 +size 3521982567 diff --git a/checkpoint-2180/global_step2180/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-2180/global_step2180/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..209913b5047e778b0adfa0b552e699ceb7861e6f --- /dev/null +++ b/checkpoint-2180/global_step2180/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71fe30d09570f77c227f22a4582ef04c435e22af3b05c51eb01a5f3fbecf89db +size 3521982567 diff --git a/checkpoint-2180/global_step2180/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-2180/global_step2180/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b92146c9a7bf0409b8255cc6ec861c89f6dde78 --- /dev/null +++ b/checkpoint-2180/global_step2180/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2eccbe2fc823cc7f6aa62dcf6cfd2b88cbfe771f3dcf5856f685d5b28900b62e +size 3521982567 diff --git a/checkpoint-2180/latest b/checkpoint-2180/latest new file mode 100644 index 0000000000000000000000000000000000000000..380ab9c7e89223b4aad3a0417ed81ccee32d86a1 --- /dev/null +++ b/checkpoint-2180/latest @@ -0,0 +1 @@ +global_step2180 \ No newline at end of file diff --git a/checkpoint-2180/rng_state_0.pth b/checkpoint-2180/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3182f34cd7445358d312b9c2b4bef001933d4455 --- /dev/null +++ b/checkpoint-2180/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60436a5f65ffc58d4f0662dae31b57dd8ed649cef0d0f7d6f0f8ea075d32d4e5 +size 17655 diff --git a/checkpoint-2180/rng_state_1.pth b/checkpoint-2180/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9704c6a04277011bd779ca79e6cea1b59f5e65b --- /dev/null +++ b/checkpoint-2180/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c61cb4c600215d408017ad060d3c51faf5b97bec04a23a1b680dd293416fdc +size 17655 diff --git a/checkpoint-2180/rng_state_2.pth b/checkpoint-2180/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d1b9155a46fe0cf820a0006a64010de19d03760 --- /dev/null +++ b/checkpoint-2180/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc300939be8fd538ccf2fc590929172bb7104d825e8275a7fa12185e5284714 +size 17655 diff --git a/checkpoint-2180/rng_state_3.pth b/checkpoint-2180/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3f9571a36f2259a6acb83fbbebcb79d25a226c9 --- /dev/null +++ b/checkpoint-2180/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ffbdafb82bdbf0d2b66a6c92d0d968c0839969f9ad5a2b72fa9abc8b641f7bf +size 17655 diff --git a/checkpoint-2180/special_tokens_map.json 
b/checkpoint-2180/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-2180/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2180/tokenizer.model b/checkpoint-2180/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-2180/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-2180/tokenizer_config.json b/checkpoint-2180/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..740756b4bef305e27d0bb4d2e1a40dd8847797f7 --- /dev/null +++ b/checkpoint-2180/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 2048, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff 
--git a/checkpoint-2180/trainer_state.json b/checkpoint-2180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d0d681a6a910c479695ce2613d5e2355b84cfad --- /dev/null +++ b/checkpoint-2180/trainer_state.json @@ -0,0 +1,13096 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9942726231386025, + "global_step": 2180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.946, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.060606060606061e-06, + "loss": 1.908, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1083, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.2121212121212122e-05, + "loss": 2.3218, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.8338, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.1212121212121215e-05, + "loss": 2.1332, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.8593, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.5359, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7252, + "step": 11 + }, + { + "epoch": 0.03, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.4351, + "step": 12 + }, + { + "epoch": 0.03, + "learning_rate": 3.939393939393939e-05, + "loss": 1.2774, + "step": 13 + }, + { + "epoch": 0.03, + "learning_rate": 4.242424242424243e-05, + "loss": 1.5145, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1529, + "step": 15 + }, + { + "epoch": 0.04, 
+ "learning_rate": 4.848484848484849e-05, + "loss": 1.0047, + "step": 16 + }, + { + "epoch": 0.04, + "learning_rate": 5.151515151515152e-05, + "loss": 1.3872, + "step": 17 + }, + { + "epoch": 0.04, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1229, + "step": 18 + }, + { + "epoch": 0.04, + "learning_rate": 5.757575757575758e-05, + "loss": 1.3386, + "step": 19 + }, + { + "epoch": 0.05, + "learning_rate": 6.060606060606061e-05, + "loss": 1.2493, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1427, + "step": 21 + }, + { + "epoch": 0.05, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0895, + "step": 22 + }, + { + "epoch": 0.05, + "learning_rate": 6.96969696969697e-05, + "loss": 1.1989, + "step": 23 + }, + { + "epoch": 0.05, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0438, + "step": 24 + }, + { + "epoch": 0.06, + "learning_rate": 7.575757575757576e-05, + "loss": 1.176, + "step": 25 + }, + { + "epoch": 0.06, + "learning_rate": 7.878787878787879e-05, + "loss": 1.1372, + "step": 26 + }, + { + "epoch": 0.06, + "learning_rate": 8.181818181818183e-05, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.06, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9371, + "step": 28 + }, + { + "epoch": 0.07, + "learning_rate": 8.787878787878789e-05, + "loss": 1.2299, + "step": 29 + }, + { + "epoch": 0.07, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9441, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0011, + "step": 31 + }, + { + "epoch": 0.07, + "learning_rate": 9.696969696969698e-05, + "loss": 1.1704, + "step": 32 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 1.1193, + "step": 33 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010303030303030303, + "loss": 1.1559, + "step": 34 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010606060606060606, + "loss": 0.8677, + "step": 35 + }, + { + "epoch": 0.08, + "learning_rate": 
0.00010909090909090909, + "loss": 1.0865, + "step": 36 + }, + { + "epoch": 0.08, + "learning_rate": 0.00011212121212121212, + "loss": 1.0922, + "step": 37 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011515151515151516, + "loss": 0.9434, + "step": 38 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001181818181818182, + "loss": 0.9144, + "step": 39 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012121212121212122, + "loss": 0.9546, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012424242424242425, + "loss": 1.0654, + "step": 41 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012727272727272728, + "loss": 0.8077, + "step": 42 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001303030303030303, + "loss": 1.0758, + "step": 43 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013333333333333334, + "loss": 1.1512, + "step": 44 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013636363636363637, + "loss": 0.84, + "step": 45 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001393939393939394, + "loss": 1.0567, + "step": 46 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014242424242424243, + "loss": 1.0165, + "step": 47 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014545454545454546, + "loss": 0.8678, + "step": 48 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014848484848484849, + "loss": 1.055, + "step": 49 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015151515151515152, + "loss": 1.0669, + "step": 50 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015454545454545454, + "loss": 0.9915, + "step": 51 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015757575757575757, + "loss": 0.993, + "step": 52 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001606060606060606, + "loss": 1.1085, + "step": 53 + }, + { + "epoch": 0.12, + "learning_rate": 0.00016363636363636366, + "loss": 0.9391, + "step": 54 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001666666666666667, + "loss": 0.975, + "step": 55 + }, + { + "epoch": 0.13, + "learning_rate": 
0.00016969696969696972, + "loss": 1.0697, + "step": 56 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017272727272727275, + "loss": 0.9462, + "step": 57 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017575757575757578, + "loss": 1.1209, + "step": 58 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001787878787878788, + "loss": 1.0648, + "step": 59 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018181818181818183, + "loss": 0.9964, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018484848484848484, + "loss": 0.8451, + "step": 61 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001878787878787879, + "loss": 0.8437, + "step": 62 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019090909090909092, + "loss": 1.1271, + "step": 63 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019393939393939395, + "loss": 1.161, + "step": 64 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019696969696969698, + "loss": 1.0032, + "step": 65 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999988957695886, + "loss": 0.9543, + "step": 67 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999955830807923, + "loss": 1.0274, + "step": 68 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999900619409279, + "loss": 0.9334, + "step": 69 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999982332362188, + "loss": 1.0398, + "step": 70 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999723943616433, + "loss": 0.9049, + "step": 71 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999602479612417, + "loss": 0.7452, + "step": 72 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999458931878073, + "loss": 0.8762, + "step": 73 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999293300730427, + "loss": 1.0941, + "step": 74 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999105586535268, + "loss": 0.7713, + "step": 75 + }, + { + "epoch": 0.17, + "learning_rate": 
0.00019998895789707154, + "loss": 0.9233, + "step": 76 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998663910709416, + "loss": 0.8634, + "step": 77 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998409950054146, + "loss": 0.9697, + "step": 78 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998133908302209, + "loss": 1.0816, + "step": 79 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001999783578606323, + "loss": 0.9659, + "step": 80 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997515583995603, + "loss": 0.9644, + "step": 81 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997173302806478, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996808943251773, + "loss": 1.0016, + "step": 83 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999642250613616, + "loss": 0.8951, + "step": 84 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996013992313073, + "loss": 1.0157, + "step": 85 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995583402684694, + "loss": 0.9414, + "step": 86 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995130738201966, + "loss": 0.8097, + "step": 87 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019994655999864582, + "loss": 0.8606, + "step": 88 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001999415918872098, + "loss": 1.0427, + "step": 89 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993640305868352, + "loss": 0.9578, + "step": 90 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993099352452623, + "loss": 1.1097, + "step": 91 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019992536329668478, + "loss": 0.8119, + "step": 92 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019991951238759325, + "loss": 0.9915, + "step": 93 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001999134408101731, + "loss": 0.838, + "step": 94 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990714857783326, + "loss": 0.8935, + "step": 95 + }, + { + "epoch": 0.22, + "learning_rate": 
0.00019990063570446984, + "loss": 0.7914, + "step": 96 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019989390220446622, + "loss": 0.8724, + "step": 97 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019988694809269314, + "loss": 1.0374, + "step": 98 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987977338450845, + "loss": 0.9028, + "step": 99 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987237809575723, + "loss": 0.9986, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019986476224277165, + "loss": 1.113, + "step": 101 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985692584237108, + "loss": 0.8395, + "step": 102 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019984886891186184, + "loss": 1.0134, + "step": 103 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001998405914690374, + "loss": 0.8845, + "step": 104 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019983209353217812, + "loss": 0.7507, + "step": 105 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019982337512005138, + "loss": 0.9073, + "step": 106 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019981443625191148, + "loss": 0.9973, + "step": 107 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019980527694749952, + "loss": 1.0733, + "step": 108 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019979589722704346, + "loss": 0.9148, + "step": 109 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019978629711125812, + "loss": 0.8385, + "step": 110 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019977647662134488, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019976643577899195, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019975617460637416, + "loss": 0.8754, + "step": 113 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001997456931261529, + "loss": 0.8886, + "step": 114 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019973499136147606, + "loss": 1.0058, + "step": 115 + }, + { + "epoch": 0.27, + 
"learning_rate": 0.00019972406933597812, + "loss": 0.9276, + "step": 116 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019971292707377991, + "loss": 0.9922, + "step": 117 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019970156459948873, + "loss": 0.9507, + "step": 118 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996899819381981, + "loss": 0.9619, + "step": 119 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019967817911548794, + "loss": 0.8163, + "step": 120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019966615615742424, + "loss": 1.0647, + "step": 121 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996539130905593, + "loss": 0.9348, + "step": 122 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019964144994193142, + "loss": 1.0523, + "step": 123 + }, + { + "epoch": 0.28, + "learning_rate": 0.000199628766739065, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019961586350997033, + "loss": 1.0227, + "step": 125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001996027402831438, + "loss": 1.006, + "step": 126 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958939708756746, + "loss": 0.9082, + "step": 127 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957583395270923, + "loss": 0.8756, + "step": 128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995620509085228, + "loss": 0.8311, + "step": 129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954804798544745, + "loss": 1.0332, + "step": 130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019953382521440815, + "loss": 0.9427, + "step": 131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019951938262681527, + "loss": 0.838, + "step": 132 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995047202545647, + "loss": 0.8509, + "step": 133 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019948983813003774, + "loss": 0.8944, + "step": 134 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019947473628610099, + "loss": 0.9569, + "step": 135 + }, + { + "epoch": 
0.31, + "learning_rate": 0.00019945941475610623, + "loss": 0.7805, + "step": 136 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019944387357389052, + "loss": 0.9337, + "step": 137 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994281127737759, + "loss": 0.8712, + "step": 138 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994121323905695, + "loss": 0.9264, + "step": 139 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001993959324595634, + "loss": 0.9323, + "step": 140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019937951301653444, + "loss": 0.8331, + "step": 141 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993628740977444, + "loss": 0.902, + "step": 142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993460157399396, + "loss": 0.8676, + "step": 143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019932893798035116, + "loss": 0.8525, + "step": 144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019931164085669456, + "loss": 0.8571, + "step": 145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019929412440716985, + "loss": 1.0006, + "step": 146 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019927638867046142, + "loss": 0.9849, + "step": 147 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019925843368573794, + "loss": 0.9064, + "step": 148 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992402594926523, + "loss": 0.9716, + "step": 149 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992218661313415, + "loss": 0.7553, + "step": 150 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019920325364242654, + "loss": 0.7921, + "step": 151 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019918442206701245, + "loss": 0.7994, + "step": 152 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001991653714466879, + "loss": 0.8296, + "step": 153 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019914610182352548, + "loss": 0.8116, + "step": 154 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019912661324008148, + "loss": 0.9844, + "step": 155 + }, + { 
+ "epoch": 0.36, + "learning_rate": 0.00019910690573939557, + "loss": 0.865, + "step": 156 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019908697936499103, + "loss": 0.959, + "step": 157 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019906683416087448, + "loss": 0.7727, + "step": 158 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019904647017153582, + "loss": 0.707, + "step": 159 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019902588744194813, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019900508601756756, + "loss": 0.9146, + "step": 161 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989840659443332, + "loss": 0.9571, + "step": 162 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989628272686671, + "loss": 0.8537, + "step": 163 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019894137003747403, + "loss": 0.828, + "step": 164 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019891969429814145, + "loss": 0.8055, + "step": 165 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988978000985394, + "loss": 0.8432, + "step": 166 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988756874870203, + "loss": 0.8101, + "step": 167 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019885335651241903, + "loss": 0.9072, + "step": 168 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001988308072240527, + "loss": 0.7862, + "step": 169 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019880803967172047, + "loss": 0.8303, + "step": 170 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019878505390570362, + "loss": 0.9489, + "step": 171 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001987618499767653, + "loss": 1.0125, + "step": 172 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001987384279361505, + "loss": 0.809, + "step": 173 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019871478783558587, + "loss": 0.9488, + "step": 174 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986909297272796, + "loss": 0.9664, + "step": 175 + }, 
+ { + "epoch": 0.4, + "learning_rate": 0.0001986668536639215, + "loss": 0.9657, + "step": 176 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001986425596986825, + "loss": 0.8123, + "step": 177 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019861804788521493, + "loss": 0.9482, + "step": 178 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019859331827765212, + "loss": 0.879, + "step": 179 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019856837093060848, + "loss": 0.896, + "step": 180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019854320589917927, + "loss": 1.0729, + "step": 181 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019851782323894042, + "loss": 0.9844, + "step": 182 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001984922230059486, + "loss": 0.9131, + "step": 183 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019846640525674082, + "loss": 0.9417, + "step": 184 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019844037004833473, + "loss": 0.9633, + "step": 185 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001984141174382279, + "loss": 0.968, + "step": 186 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019838764748439827, + "loss": 0.8447, + "step": 187 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019836096024530373, + "loss": 0.8638, + "step": 188 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019833405577988195, + "loss": 0.9346, + "step": 189 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001983069341475504, + "loss": 0.8969, + "step": 190 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019827959540820613, + "loss": 0.8499, + "step": 191 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019825203962222572, + "loss": 0.8041, + "step": 192 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019822426685046497, + "loss": 0.9216, + "step": 193 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019819627715425903, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198168070595422, + "loss": 0.8969, + "step": 
195 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198139647236247, + "loss": 0.7949, + "step": 196 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019811100713950587, + "loss": 0.8996, + "step": 197 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019808215036844917, + "loss": 0.9118, + "step": 198 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001980530769868059, + "loss": 0.7355, + "step": 199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019802378705878354, + "loss": 0.8344, + "step": 200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019799428064906774, + "loss": 0.9639, + "step": 201 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001979645578228222, + "loss": 0.852, + "step": 202 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001979346186456887, + "loss": 0.8493, + "step": 203 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019790446318378665, + "loss": 0.851, + "step": 204 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019787409150371328, + "loss": 0.7161, + "step": 205 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019784350367254322, + "loss": 0.9846, + "step": 206 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001978126997578285, + "loss": 0.7883, + "step": 207 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019778167982759833, + "loss": 0.8691, + "step": 208 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019775044395035907, + "loss": 0.928, + "step": 209 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001977189921950939, + "loss": 0.8244, + "step": 210 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001976873246312628, + "loss": 1.0413, + "step": 211 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976554413288023, + "loss": 0.8261, + "step": 212 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976233423581255, + "loss": 0.823, + "step": 213 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019759102779012166, + "loss": 0.9386, + "step": 214 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019755849769615628, + "loss": 0.8156, + 
"step": 215 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019752575214807076, + "loss": 0.8556, + "step": 216 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019749279121818235, + "loss": 0.7769, + "step": 217 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019745961497928406, + "loss": 1.0772, + "step": 218 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019742622350464418, + "loss": 0.8147, + "step": 219 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001973926168680066, + "loss": 0.9529, + "step": 220 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019735879514359018, + "loss": 0.8688, + "step": 221 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019732475840608888, + "loss": 0.9647, + "step": 222 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019729050673067156, + "loss": 0.837, + "step": 223 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019725604019298163, + "loss": 0.9211, + "step": 224 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019722135886913715, + "loss": 0.9434, + "step": 225 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001971864628357304, + "loss": 0.6506, + "step": 226 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019715135216982798, + "loss": 0.8052, + "step": 227 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019711602694897037, + "loss": 0.7852, + "step": 228 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019708048725117192, + "loss": 0.9283, + "step": 229 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001970447331549207, + "loss": 0.9081, + "step": 230 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019700876473917824, + "loss": 0.9036, + "step": 231 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019697258208337934, + "loss": 0.716, + "step": 232 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019693618526743197, + "loss": 0.8192, + "step": 233 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968995743717171, + "loss": 0.9773, + "step": 234 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019686274947708848, + "loss": 
0.8698, + "step": 235 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968257106648724, + "loss": 0.9062, + "step": 236 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019678845801686764, + "loss": 0.8984, + "step": 237 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019675099161534521, + "loss": 0.8087, + "step": 238 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019671331154304822, + "loss": 0.8272, + "step": 239 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019667541788319162, + "loss": 0.784, + "step": 240 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019663731071946206, + "loss": 0.8777, + "step": 241 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019659899013601772, + "loss": 0.8534, + "step": 242 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019656045621748808, + "loss": 0.9645, + "step": 243 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019652170904897387, + "loss": 0.9692, + "step": 244 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019648274871604662, + "loss": 0.838, + "step": 245 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019644357530474872, + "loss": 0.7445, + "step": 246 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001964041889015931, + "loss": 0.9065, + "step": 247 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019636458959356316, + "loss": 0.7806, + "step": 248 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019632477746811232, + "loss": 0.7971, + "step": 249 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019628475261316417, + "loss": 0.8409, + "step": 250 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019624451511711198, + "loss": 0.7432, + "step": 251 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019620406506881875, + "loss": 0.9096, + "step": 252 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019616340255761676, + "loss": 0.8004, + "step": 253 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019612252767330763, + "loss": 0.7978, + "step": 254 + }, + { + "epoch": 0.58, + "learning_rate": 
0.0001960814405061619, + "loss": 0.9535, + "step": 255 + }, + { + "epoch": 0.59, + "learning_rate": 0.000196040141146919, + "loss": 0.9945, + "step": 256 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001959986296867869, + "loss": 0.9703, + "step": 257 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019595690621744208, + "loss": 0.9639, + "step": 258 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019591497083102914, + "loss": 0.9312, + "step": 259 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019587282362016083, + "loss": 0.7709, + "step": 260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001958304646779175, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019578789409784727, + "loss": 0.8081, + "step": 262 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019574511197396563, + "loss": 0.8476, + "step": 263 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019570211840075517, + "loss": 0.9658, + "step": 264 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019565891347316552, + "loss": 0.7778, + "step": 265 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001956154972866131, + "loss": 0.9926, + "step": 266 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001955718699369808, + "loss": 0.957, + "step": 267 + }, + { + "epoch": 0.61, + "learning_rate": 0.000195528031520618, + "loss": 0.9396, + "step": 268 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019548398213434007, + "loss": 0.9049, + "step": 269 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019543972187542833, + "loss": 0.9683, + "step": 270 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019539525084162992, + "loss": 0.8555, + "step": 271 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019535056913115725, + "loss": 0.8489, + "step": 272 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001953056768426882, + "loss": 0.8728, + "step": 273 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019526057407536564, + "loss": 0.9443, + "step": 274 + }, + { + "epoch": 0.63, + 
"learning_rate": 0.00019521526092879725, + "loss": 0.8161, + "step": 275 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019516973750305532, + "loss": 0.8936, + "step": 276 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019512400389867657, + "loss": 0.8315, + "step": 277 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019507806021666188, + "loss": 0.9298, + "step": 278 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019503190655847604, + "loss": 0.8235, + "step": 279 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019498554302604766, + "loss": 0.9245, + "step": 280 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001949389697217687, + "loss": 0.8302, + "step": 281 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019489218674849455, + "loss": 0.8488, + "step": 282 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019484519420954354, + "loss": 0.8177, + "step": 283 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019479799220869682, + "loss": 1.0039, + "step": 284 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019475058085019825, + "loss": 0.7685, + "step": 285 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019470296023875387, + "loss": 0.9174, + "step": 286 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194655130479532, + "loss": 1.0997, + "step": 287 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019460709167816274, + "loss": 0.9759, + "step": 288 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001945588439407379, + "loss": 0.9397, + "step": 289 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019451038737381077, + "loss": 1.0367, + "step": 290 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019446172208439574, + "loss": 0.8298, + "step": 291 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001944128481799682, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019436376576846423, + "loss": 1.1234, + "step": 293 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431447495828045, + "loss": 0.9103, + "step": 294 + }, + { + 
"epoch": 0.68, + "learning_rate": 0.0001942649758582737, + "loss": 0.7841, + "step": 295 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019421526857776072, + "loss": 0.8817, + "step": 296 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416535322651818, + "loss": 1.0682, + "step": 297 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019411522991478214, + "loss": 0.9201, + "step": 298 + }, + { + "epoch": 0.68, + "learning_rate": 0.000194064898753248, + "loss": 4.1834, + "step": 299 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019401435985307012, + "loss": 1.0391, + "step": 300 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019396361332586166, + "loss": 2.5015, + "step": 301 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001939126592836944, + "loss": 0.7927, + "step": 302 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001938614978390983, + "loss": 2.2345, + "step": 303 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019381012910506146, + "loss": 0.9311, + "step": 304 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019375855319502962, + "loss": 0.9713, + "step": 305 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019370677022290624, + "loss": 0.8967, + "step": 306 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019365478030305196, + "loss": 3.095, + "step": 307 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001936025835502845, + "loss": 1.1008, + "step": 308 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001935501800798783, + "loss": 1.5409, + "step": 309 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019349757000756444, + "loss": 1.02, + "step": 310 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019344475344953012, + "loss": 1.0101, + "step": 311 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001933917305224187, + "loss": 0.7686, + "step": 312 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001933385013433292, + "loss": 1.1061, + "step": 313 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932850660298162, + "loss": 0.8083, + "step": 314 + }, + 
{ + "epoch": 0.72, + "learning_rate": 0.0001932314246998895, + "loss": 1.1942, + "step": 315 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019317757747201384, + "loss": 0.8551, + "step": 316 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019312352446510878, + "loss": 0.9049, + "step": 317 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019306926579854821, + "loss": 0.7072, + "step": 318 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019301480159216028, + "loss": 0.8552, + "step": 319 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019296013196622706, + "loss": 0.8414, + "step": 320 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001929052570414843, + "loss": 0.9198, + "step": 321 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019285017693912107, + "loss": 2.1953, + "step": 322 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019279489178077969, + "loss": 0.851, + "step": 323 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019273940168855518, + "loss": 1.0239, + "step": 324 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019268370678499533, + "loss": 1.5125, + "step": 325 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019262780719310008, + "loss": 0.9171, + "step": 326 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019257170303632148, + "loss": 0.9794, + "step": 327 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019251539443856344, + "loss": 0.9023, + "step": 328 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019245888152418124, + "loss": 1.058, + "step": 329 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019240216441798142, + "loss": 0.9411, + "step": 330 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001923452432452215, + "loss": 1.197, + "step": 331 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922881181316097, + "loss": 0.9253, + "step": 332 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922307892033046, + "loss": 1.156, + "step": 333 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019217325658691482, + "loss": 0.9424, + "step": 
334 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019211552040949891, + "loss": 1.1147, + "step": 335 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019205758079856498, + "loss": 0.8528, + "step": 336 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001919994378820704, + "loss": 0.8105, + "step": 337 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019194109178842153, + "loss": 0.9279, + "step": 338 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019188254264647337, + "loss": 0.9231, + "step": 339 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019182379058552948, + "loss": 1.0425, + "step": 340 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019176483573534142, + "loss": 0.8794, + "step": 341 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019170567822610873, + "loss": 0.9873, + "step": 342 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001916463181884784, + "loss": 0.8146, + "step": 343 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019158675575354478, + "loss": 1.027, + "step": 344 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019152699105284913, + "loss": 0.8093, + "step": 345 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001914670242183795, + "loss": 0.951, + "step": 346 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019140685538257028, + "loss": 0.9268, + "step": 347 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019134648467830198, + "loss": 1.0205, + "step": 348 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019128591223890092, + "loss": 0.9043, + "step": 349 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019122513819813902, + "loss": 0.7387, + "step": 350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001911641626902333, + "loss": 0.9422, + "step": 351 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019110298584984578, + "loss": 0.9015, + "step": 352 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001910416078120832, + "loss": 0.7522, + "step": 353 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019098002871249646, + "loss": 0.9722, 
+ "step": 354 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001909182486870806, + "loss": 0.8358, + "step": 355 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019085626787227443, + "loss": 0.9859, + "step": 356 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019079408640496013, + "loss": 0.7796, + "step": 357 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019073170442246302, + "loss": 0.8617, + "step": 358 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906691220625513, + "loss": 0.7727, + "step": 359 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906063394634356, + "loss": 0.8786, + "step": 360 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001905433567637689, + "loss": 0.9117, + "step": 361 + }, + { + "epoch": 0.83, + "learning_rate": 0.000190480174102646, + "loss": 0.9182, + "step": 362 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001904167916196033, + "loss": 0.9706, + "step": 363 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001903532094546186, + "loss": 0.8036, + "step": 364 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001902894277481105, + "loss": 0.902, + "step": 365 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019022544664093854, + "loss": 0.9231, + "step": 366 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019016126627440237, + "loss": 0.9751, + "step": 367 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001900968867902419, + "loss": 0.8373, + "step": 368 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001900323083306367, + "loss": 0.8695, + "step": 369 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001899675310382057, + "loss": 0.8654, + "step": 370 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018990255505600706, + "loss": 0.98, + "step": 371 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018983738052753767, + "loss": 0.7454, + "step": 372 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018977200759673295, + "loss": 0.829, + "step": 373 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018970643640796642, + "loss": 
0.8262, + "step": 374 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001896406671060495, + "loss": 1.0659, + "step": 375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018957469983623112, + "loss": 0.8551, + "step": 376 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018950853474419742, + "loss": 0.7991, + "step": 377 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001894421719760714, + "loss": 0.8662, + "step": 378 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018937561167841263, + "loss": 0.8817, + "step": 379 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018930885399821693, + "loss": 1.0894, + "step": 380 + }, + { + "epoch": 0.87, + "learning_rate": 0.000189241899082916, + "loss": 0.8225, + "step": 381 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018917474708037718, + "loss": 0.9065, + "step": 382 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018910739813890302, + "loss": 0.8779, + "step": 383 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018903985240723104, + "loss": 0.7909, + "step": 384 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018897211003453328, + "loss": 0.7649, + "step": 385 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018890417117041619, + "loss": 0.9788, + "step": 386 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018883603596492004, + "loss": 0.938, + "step": 387 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018876770456851877, + "loss": 0.9032, + "step": 388 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018869917713211964, + "loss": 0.9059, + "step": 389 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018863045380706274, + "loss": 0.8896, + "step": 390 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001885615347451209, + "loss": 0.7614, + "step": 391 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884924200984991, + "loss": 0.978, + "step": 392 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884231100198344, + "loss": 0.9406, + "step": 393 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018835360466219533, + 
"loss": 0.7555, + "step": 394 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001882839041790818, + "loss": 0.9049, + "step": 395 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018821400872442458, + "loss": 0.7041, + "step": 396 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018814391845258505, + "loss": 0.8995, + "step": 397 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001880736335183548, + "loss": 0.7461, + "step": 398 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018800315407695539, + "loss": 0.9954, + "step": 399 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018793248028403788, + "loss": 0.9035, + "step": 400 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001878616122956826, + "loss": 0.9083, + "step": 401 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018779055026839868, + "loss": 0.7286, + "step": 402 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001877192943591239, + "loss": 0.8001, + "step": 403 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018764784472522403, + "loss": 0.8795, + "step": 404 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001875762015244929, + "loss": 0.8912, + "step": 405 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018750436491515163, + "loss": 0.8848, + "step": 406 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018743233505584862, + "loss": 0.8512, + "step": 407 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018736011210565898, + "loss": 0.8537, + "step": 408 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018728769622408423, + "loss": 0.8777, + "step": 409 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018721508757105202, + "loss": 0.7849, + "step": 410 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018714228630691576, + "loss": 0.9669, + "step": 411 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001870692925924541, + "loss": 0.9299, + "step": 412 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018699610658887088, + "loss": 1.0188, + "step": 413 + }, + { + "epoch": 0.95, + "learning_rate": 
0.00018692272845779448, + "loss": 0.8388, + "step": 414 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018684915836127765, + "loss": 0.7904, + "step": 415 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018677539646179707, + "loss": 0.9689, + "step": 416 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018670144292225297, + "loss": 0.7339, + "step": 417 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018662729790596888, + "loss": 0.7894, + "step": 418 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018655296157669117, + "loss": 0.7163, + "step": 419 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018647843409858869, + "loss": 0.8642, + "step": 420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018640371563625246, + "loss": 0.9281, + "step": 421 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018632880635469526, + "loss": 0.834, + "step": 422 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018625370641935129, + "loss": 0.7316, + "step": 423 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018617841599607586, + "loss": 0.8504, + "step": 424 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018610293525114492, + "loss": 0.8731, + "step": 425 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018602726435125474, + "loss": 0.8803, + "step": 426 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001859514034635215, + "loss": 0.8417, + "step": 427 + }, + { + "epoch": 0.98, + "learning_rate": 0.000185875352755481, + "loss": 0.8947, + "step": 428 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018579911239508827, + "loss": 0.8368, + "step": 429 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018572268255071718, + "loss": 0.8231, + "step": 430 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018564606339116, + "loss": 0.8576, + "step": 431 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001855692550856272, + "loss": 0.8753, + "step": 432 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018549225780374685, + "loss": 0.7778, + "step": 433 + }, + { + "epoch": 0.99, + 
"learning_rate": 0.00018541507171556445, + "loss": 0.7516, + "step": 434 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001853376969915425, + "loss": 0.7466, + "step": 435 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018526013380255999, + "loss": 0.917, + "step": 436 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018518238231991218, + "loss": 0.9042, + "step": 437 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018510444271531022, + "loss": 0.8587, + "step": 438 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018502631516088066, + "loss": 0.9001, + "step": 439 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001849479998291651, + "loss": 0.7977, + "step": 440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018486949689311993, + "loss": 0.8711, + "step": 441 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018479080652611583, + "loss": 0.7192, + "step": 442 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001847119289019373, + "loss": 0.9608, + "step": 443 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018463286419478255, + "loss": 0.7097, + "step": 444 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001845536125792629, + "loss": 0.7354, + "step": 445 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001844741742304024, + "loss": 0.8711, + "step": 446 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018439454932363755, + "loss": 0.8832, + "step": 447 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018431473803481684, + "loss": 0.932, + "step": 448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018423474054020034, + "loss": 0.8394, + "step": 449 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018415455701645942, + "loss": 0.7698, + "step": 450 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018407418764067627, + "loss": 0.8856, + "step": 451 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018399363259034347, + "loss": 0.8529, + "step": 452 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018391289204336368, + "loss": 0.9898, + "step": 453 + }, + { + 
"epoch": 1.04, + "learning_rate": 0.00018383196617804926, + "loss": 0.8312, + "step": 454 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018375085517312182, + "loss": 0.8234, + "step": 455 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018366955920771184, + "loss": 0.7871, + "step": 456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018358807846135825, + "loss": 0.9814, + "step": 457 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018350641311400812, + "loss": 0.8183, + "step": 458 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001834245633460161, + "loss": 0.8961, + "step": 459 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018334252933814427, + "loss": 0.9166, + "step": 460 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018326031127156148, + "loss": 1.0031, + "step": 461 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018317790932784317, + "loss": 0.8171, + "step": 462 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001830953236889707, + "loss": 0.83, + "step": 463 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018301255453733134, + "loss": 0.8134, + "step": 464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001829296020557174, + "loss": 0.8561, + "step": 465 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001828464664273263, + "loss": 0.8669, + "step": 466 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001827631478357597, + "loss": 1.003, + "step": 467 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018267964646502357, + "loss": 0.8715, + "step": 468 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018259596249952731, + "loss": 0.7434, + "step": 469 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018251209612408373, + "loss": 0.9163, + "step": 470 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018242804752390844, + "loss": 1.0639, + "step": 471 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018234381688461942, + "loss": 0.8266, + "step": 472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018225940439223684, + "loss": 0.7582, + "step": 
473 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001821748102331823, + "loss": 0.8547, + "step": 474 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001820900345942787, + "loss": 0.7908, + "step": 475 + }, + { + "epoch": 1.09, + "learning_rate": 0.00018200507766274977, + "loss": 0.6203, + "step": 476 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001819199396262195, + "loss": 0.806, + "step": 477 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001818346206727119, + "loss": 0.8016, + "step": 478 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001817491209906506, + "loss": 0.8548, + "step": 479 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018166344076885827, + "loss": 0.9194, + "step": 480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018157758019655634, + "loss": 0.8704, + "step": 481 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018149153946336446, + "loss": 0.8373, + "step": 482 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001814053187593003, + "loss": 0.8229, + "step": 483 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018131891827477884, + "loss": 0.8289, + "step": 484 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018123233820061218, + "loss": 0.7753, + "step": 485 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018114557872800905, + "loss": 1.029, + "step": 486 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001810586400485743, + "loss": 0.6198, + "step": 487 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001809715223543087, + "loss": 0.8418, + "step": 488 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018088422583760813, + "loss": 0.7421, + "step": 489 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001807967506912636, + "loss": 0.8032, + "step": 490 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018070909710846052, + "loss": 0.7956, + "step": 491 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018062126528277844, + "loss": 0.9013, + "step": 492 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018053325540819045, + "loss": 0.9582, + 
"step": 493 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018044506767906295, + "loss": 0.6845, + "step": 494 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018035670229015507, + "loss": 0.8731, + "step": 495 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001802681594366183, + "loss": 0.8369, + "step": 496 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018017943931399603, + "loss": 0.6557, + "step": 497 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018009054211822324, + "loss": 0.7997, + "step": 498 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001800014680456259, + "loss": 0.8348, + "step": 499 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001799122172929206, + "loss": 0.9043, + "step": 500 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017982279005721407, + "loss": 0.8499, + "step": 501 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017973318653600293, + "loss": 0.8595, + "step": 502 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017964340692717303, + "loss": 0.9468, + "step": 503 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001795534514289991, + "loss": 0.9848, + "step": 504 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017946332024014434, + "loss": 0.7326, + "step": 505 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017937301355965996, + "loss": 0.8479, + "step": 506 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017928253158698473, + "loss": 0.8669, + "step": 507 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017919187452194454, + "loss": 0.8163, + "step": 508 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017910104256475194, + "loss": 0.926, + "step": 509 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017901003591600575, + "loss": 0.7956, + "step": 510 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017891885477669064, + "loss": 0.9002, + "step": 511 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017882749934817652, + "loss": 0.787, + "step": 512 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017873596983221832, + 
"loss": 0.7519, + "step": 513 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001786442664309554, + "loss": 0.8067, + "step": 514 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017855238934691108, + "loss": 0.8824, + "step": 515 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001784603387829923, + "loss": 0.8014, + "step": 516 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017836811494248919, + "loss": 0.6672, + "step": 517 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017827571802907444, + "loss": 0.8516, + "step": 518 + }, + { + "epoch": 1.19, + "learning_rate": 0.000178183148246803, + "loss": 0.8476, + "step": 519 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017809040580011164, + "loss": 0.8493, + "step": 520 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001779974908938184, + "loss": 0.7288, + "step": 521 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017790440373312223, + "loss": 0.7443, + "step": 522 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017781114452360245, + "loss": 0.8767, + "step": 523 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017771771347121842, + "loss": 0.8025, + "step": 524 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001776241107823089, + "loss": 0.8842, + "step": 525 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017753033666359177, + "loss": 0.9648, + "step": 526 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017743639132216353, + "loss": 0.7872, + "step": 527 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001773422749654988, + "loss": 0.9122, + "step": 528 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017724798780144983, + "loss": 0.7688, + "step": 529 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001771535300382461, + "loss": 0.8938, + "step": 530 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017705890188449394, + "loss": 0.7152, + "step": 531 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001769641035491759, + "loss": 0.7077, + "step": 532 + }, + { + "epoch": 1.22, + "learning_rate": 
0.00017686913524165036, + "loss": 0.8872, + "step": 533 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017677399717165116, + "loss": 0.8775, + "step": 534 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017667868954928694, + "loss": 0.8508, + "step": 535 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017658321258504092, + "loss": 0.8589, + "step": 536 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017648756648977018, + "loss": 0.6499, + "step": 537 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017639175147470538, + "loss": 0.8927, + "step": 538 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017629576775145026, + "loss": 0.8702, + "step": 539 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017619961553198108, + "loss": 0.7958, + "step": 540 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017610329502864625, + "loss": 0.8582, + "step": 541 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017600680645416583, + "loss": 0.7905, + "step": 542 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001759101500216311, + "loss": 0.7574, + "step": 543 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017581332594450392, + "loss": 0.861, + "step": 544 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017571633443661658, + "loss": 0.7682, + "step": 545 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017561917571217093, + "loss": 0.7547, + "step": 546 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017552184998573825, + "loss": 0.7852, + "step": 547 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001754243574722586, + "loss": 0.7635, + "step": 548 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017532669838704035, + "loss": 0.8714, + "step": 549 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017522887294575977, + "loss": 0.7839, + "step": 550 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017513088136446054, + "loss": 0.8551, + "step": 551 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503272385955318, + "loss": 0.7367, + "step": 552 + }, + { + "epoch": 1.27, + 
"learning_rate": 0.00017493440064781475, + "loss": 0.9257, + "step": 553 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017483591194638817, + "loss": 0.8246, + "step": 554 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017473725797278192, + "loss": 0.8319, + "step": 555 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463843894486937, + "loss": 0.8304, + "step": 556 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017453945508088853, + "loss": 0.6536, + "step": 557 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017444030659944138, + "loss": 0.7606, + "step": 558 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017434099371949345, + "loss": 0.7084, + "step": 559 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017424151666037329, + "loss": 0.8891, + "step": 560 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017414187564177217, + "loss": 0.6199, + "step": 561 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017404207088374333, + "loss": 0.8676, + "step": 562 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001739421026067017, + "loss": 0.8477, + "step": 563 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017384197103142328, + "loss": 0.9234, + "step": 564 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001737416763790447, + "loss": 0.9103, + "step": 565 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017364121887106286, + "loss": 0.7859, + "step": 566 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017354059872933415, + "loss": 0.8623, + "step": 567 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017343981617607424, + "loss": 0.6266, + "step": 568 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017333887143385743, + "loss": 0.8105, + "step": 569 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017323776472561627, + "loss": 0.7752, + "step": 570 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001731364962746409, + "loss": 0.7873, + "step": 571 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001730350663045788, + "loss": 0.8425, + "step": 572 + }, + { + 
"epoch": 1.31, + "learning_rate": 0.00017293347503943406, + "loss": 0.777, + "step": 573 + }, + { + "epoch": 1.32, + "learning_rate": 0.000172831722703567, + "loss": 0.7348, + "step": 574 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017272980952169365, + "loss": 0.7797, + "step": 575 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001726277357188853, + "loss": 0.8328, + "step": 576 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017252550152056795, + "loss": 0.7109, + "step": 577 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001724231071525218, + "loss": 0.7905, + "step": 578 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017232055284088085, + "loss": 0.7541, + "step": 579 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001722178388121322, + "loss": 0.8954, + "step": 580 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017211496529311582, + "loss": 0.8362, + "step": 581 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017201193251102382, + "loss": 0.8436, + "step": 582 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017190874069340014, + "loss": 0.7594, + "step": 583 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001718053900681397, + "loss": 0.9342, + "step": 584 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017170188086348848, + "loss": 0.8934, + "step": 585 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017159821330804236, + "loss": 0.831, + "step": 586 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001714943876307472, + "loss": 0.8053, + "step": 587 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017139040406089786, + "loss": 0.81, + "step": 588 + }, + { + "epoch": 1.35, + "learning_rate": 0.000171286262828138, + "loss": 0.8245, + "step": 589 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017118196416245947, + "loss": 0.8232, + "step": 590 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017107750829420176, + "loss": 0.8244, + "step": 591 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001709728954540516, + "loss": 0.7863, + "step": 592 + }, 
+ { + "epoch": 1.36, + "learning_rate": 0.00017086812587304234, + "loss": 0.8274, + "step": 593 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017076319978255345, + "loss": 0.6595, + "step": 594 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001706581174143101, + "loss": 0.8582, + "step": 595 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017055287900038263, + "loss": 0.6873, + "step": 596 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017044748477318593, + "loss": 0.8673, + "step": 597 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017034193496547902, + "loss": 0.8055, + "step": 598 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017023622981036455, + "loss": 0.8232, + "step": 599 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001701303695412881, + "loss": 0.8745, + "step": 600 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017002435439203808, + "loss": 0.8034, + "step": 601 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016991818459674468, + "loss": 0.9006, + "step": 602 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001698118603898798, + "loss": 0.7828, + "step": 603 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016970538200625622, + "loss": 0.8413, + "step": 604 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016959874968102735, + "loss": 0.8669, + "step": 605 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016949196364968646, + "loss": 0.9277, + "step": 606 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016938502414806634, + "loss": 0.9256, + "step": 607 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016927793141233868, + "loss": 0.8613, + "step": 608 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016917068567901358, + "loss": 0.9439, + "step": 609 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016906328718493906, + "loss": 0.8606, + "step": 610 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016895573616730044, + "loss": 0.7483, + "step": 611 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016884803286362, + "loss": 0.8359, + 
"step": 612 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001687401775117562, + "loss": 0.7764, + "step": 613 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016863217034990342, + "loss": 0.9857, + "step": 614 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001685240116165912, + "loss": 0.8706, + "step": 615 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001684157015506839, + "loss": 0.867, + "step": 616 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016830724039138003, + "loss": 0.7974, + "step": 617 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016819862837821181, + "loss": 0.7835, + "step": 618 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016808986575104465, + "loss": 0.7987, + "step": 619 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001679809527500765, + "loss": 0.7383, + "step": 620 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001678718896158375, + "loss": 0.9224, + "step": 621 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016776267658918928, + "loss": 0.8959, + "step": 622 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016765331391132456, + "loss": 0.6702, + "step": 623 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001675438018237665, + "loss": 0.6911, + "step": 624 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016743414056836825, + "loss": 0.9364, + "step": 625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016732433038731242, + "loss": 0.7902, + "step": 626 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016721437152311054, + "loss": 0.8473, + "step": 627 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016710426421860235, + "loss": 0.8765, + "step": 628 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016699400871695555, + "loss": 0.7705, + "step": 629 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016688360526166514, + "loss": 0.8653, + "step": 630 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001667730540965528, + "loss": 0.9137, + "step": 631 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016666235546576648, + 
"loss": 0.9772, + "step": 632 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001665515096137797, + "loss": 0.6433, + "step": 633 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001664405167853912, + "loss": 0.8096, + "step": 634 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016632937722572434, + "loss": 0.7298, + "step": 635 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016621809118022647, + "loss": 0.6841, + "step": 636 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016610665889466838, + "loss": 0.9471, + "step": 637 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016599508061514404, + "loss": 0.8396, + "step": 638 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016588335658806962, + "loss": 0.8769, + "step": 639 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016577148706018328, + "loss": 0.8328, + "step": 640 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001656594722785445, + "loss": 0.8932, + "step": 641 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001655473124905335, + "loss": 0.8203, + "step": 642 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016543500794385084, + "loss": 0.8514, + "step": 643 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016532255888651666, + "loss": 0.7396, + "step": 644 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016520996556687028, + "loss": 0.9178, + "step": 645 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001650972282335697, + "loss": 0.6308, + "step": 646 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016498434713559088, + "loss": 0.9018, + "step": 647 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016487132252222727, + "loss": 0.8658, + "step": 648 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016475815464308933, + "loss": 0.8228, + "step": 649 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001646448437481039, + "loss": 0.8944, + "step": 650 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001645313900875136, + "loss": 0.8617, + "step": 651 + }, + { + "epoch": 1.49, + "learning_rate": 
0.00016441779391187646, + "loss": 0.9726, + "step": 652 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016430405547206516, + "loss": 0.693, + "step": 653 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016419017501926656, + "loss": 0.8272, + "step": 654 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016407615280498124, + "loss": 0.8523, + "step": 655 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016396198908102272, + "loss": 0.7444, + "step": 656 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016384768409951714, + "loss": 0.8366, + "step": 657 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001637332381129026, + "loss": 0.7441, + "step": 658 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016361865137392854, + "loss": 0.6694, + "step": 659 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001635039241356553, + "loss": 0.8103, + "step": 660 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001633890566514535, + "loss": 0.9135, + "step": 661 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016327404917500346, + "loss": 0.7327, + "step": 662 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016315890196029467, + "loss": 0.8425, + "step": 663 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016304361526162534, + "loss": 0.8812, + "step": 664 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016292818933360151, + "loss": 0.777, + "step": 665 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001628126244311369, + "loss": 0.8864, + "step": 666 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016269692080945198, + "loss": 0.9333, + "step": 667 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016258107872407375, + "loss": 0.906, + "step": 668 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016246509843083492, + "loss": 0.7346, + "step": 669 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016234898018587337, + "loss": 0.8555, + "step": 670 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016223272424563173, + "loss": 0.8449, + "step": 671 + }, + { + "epoch": 1.54, + 
"learning_rate": 0.00016211633086685664, + "loss": 0.8559, + "step": 672 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016199980030659838, + "loss": 0.7468, + "step": 673 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016188313282221008, + "loss": 0.7986, + "step": 674 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001617663286713474, + "loss": 0.7757, + "step": 675 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016164938811196757, + "loss": 0.8789, + "step": 676 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016153231140232936, + "loss": 0.5499, + "step": 677 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016141509880099206, + "loss": 0.9319, + "step": 678 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016129775056681513, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001611802669589575, + "loss": 0.8506, + "step": 680 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016106264823687716, + "loss": 0.7242, + "step": 681 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016094489466033043, + "loss": 0.6808, + "step": 682 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016082700648937146, + "loss": 0.8017, + "step": 683 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016070898398435167, + "loss": 0.9109, + "step": 684 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016059082740591915, + "loss": 0.7277, + "step": 685 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016047253701501808, + "loss": 0.8601, + "step": 686 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016035411307288813, + "loss": 0.9118, + "step": 687 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001602355558410639, + "loss": 0.8049, + "step": 688 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016011686558137448, + "loss": 0.8174, + "step": 689 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015999804255594258, + "loss": 0.8481, + "step": 690 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001598790870271843, + "loss": 0.7052, + "step": 691 + }, + { + 
"epoch": 1.59, + "learning_rate": 0.00015975999925780813, + "loss": 0.8208, + "step": 692 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015964077951081485, + "loss": 0.7257, + "step": 693 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015952142804949652, + "loss": 0.858, + "step": 694 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015940194513743624, + "loss": 0.9242, + "step": 695 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001592823310385073, + "loss": 0.7924, + "step": 696 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015916258601687274, + "loss": 0.8788, + "step": 697 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001590427103369848, + "loss": 0.7946, + "step": 698 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015892270426358414, + "loss": 0.8318, + "step": 699 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015880256806169953, + "loss": 0.8983, + "step": 700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015868230199664711, + "loss": 0.8889, + "step": 701 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015856190633402968, + "loss": 0.9692, + "step": 702 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001584413813397364, + "loss": 0.7787, + "step": 703 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015832072727994193, + "loss": 0.6455, + "step": 704 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015819994442110616, + "loss": 1.0006, + "step": 705 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015807903302997317, + "loss": 0.7384, + "step": 706 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015795799337357114, + "loss": 0.8517, + "step": 707 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015783682571921133, + "loss": 0.8446, + "step": 708 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015771553033448775, + "loss": 0.8227, + "step": 709 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015759410748727662, + "loss": 0.8374, + "step": 710 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001574725574457354, + "loss": 0.7274, + "step": 711 
+ }, + { + "epoch": 1.63, + "learning_rate": 0.00015735088047830268, + "loss": 0.8728, + "step": 712 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015722907685369723, + "loss": 1.0569, + "step": 713 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015710714684091762, + "loss": 0.9775, + "step": 714 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001569850907092415, + "loss": 0.6832, + "step": 715 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015686290872822504, + "loss": 0.7358, + "step": 716 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015674060116770236, + "loss": 0.9015, + "step": 717 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015661816829778494, + "loss": 0.8516, + "step": 718 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015649561038886094, + "loss": 0.8911, + "step": 719 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015637292771159472, + "loss": 0.7098, + "step": 720 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015625012053692615, + "loss": 0.955, + "step": 721 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001561271891360701, + "loss": 0.6421, + "step": 722 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001560041337805157, + "loss": 0.8807, + "step": 723 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015588095474202595, + "loss": 0.722, + "step": 724 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015575765229263686, + "loss": 0.8055, + "step": 725 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015563422670465712, + "loss": 0.7822, + "step": 726 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015551067825066728, + "loss": 0.8311, + "step": 727 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015538700720351924, + "loss": 0.8519, + "step": 728 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015526321383633568, + "loss": 0.7506, + "step": 729 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001551392984225094, + "loss": 0.8056, + "step": 730 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015501526123570277, + "loss": 0.6968, 
+ "step": 731 + }, + { + "epoch": 1.68, + "learning_rate": 0.000154891102549847, + "loss": 0.829, + "step": 732 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001547668226391417, + "loss": 0.6682, + "step": 733 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015464242177805422, + "loss": 0.8295, + "step": 734 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015451790024131895, + "loss": 0.6911, + "step": 735 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015439325830393687, + "loss": 0.6785, + "step": 736 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015426849624117472, + "loss": 0.81, + "step": 737 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015414361432856475, + "loss": 0.9955, + "step": 738 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015401861284190368, + "loss": 0.8433, + "step": 739 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015389349205725242, + "loss": 0.618, + "step": 740 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015376825225093537, + "loss": 0.7747, + "step": 741 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015364289369953967, + "loss": 0.7673, + "step": 742 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001535174166799148, + "loss": 0.8066, + "step": 743 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015339182146917183, + "loss": 0.8392, + "step": 744 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001532661083446829, + "loss": 0.7949, + "step": 745 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015314027758408044, + "loss": 0.8698, + "step": 746 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015301432946525684, + "loss": 0.7715, + "step": 747 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015288826426636354, + "loss": 0.7583, + "step": 748 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015276208226581064, + "loss": 0.8544, + "step": 749 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015263578374226605, + "loss": 0.8272, + "step": 750 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001525093689746552, + "loss": 
0.857, + "step": 751 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015238283824216015, + "loss": 0.9208, + "step": 752 + }, + { + "epoch": 1.73, + "learning_rate": 0.000152256191824219, + "loss": 0.8626, + "step": 753 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015212943000052545, + "loss": 0.9418, + "step": 754 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015200255305102803, + "loss": 0.8087, + "step": 755 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015187556125592945, + "loss": 0.7913, + "step": 756 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015174845489568622, + "loss": 0.8973, + "step": 757 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015162123425100762, + "loss": 0.701, + "step": 758 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015149389960285558, + "loss": 0.898, + "step": 759 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015136645123244366, + "loss": 0.8809, + "step": 760 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015123888942123652, + "loss": 0.7334, + "step": 761 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001511112144509495, + "loss": 0.8506, + "step": 762 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015098342660354775, + "loss": 0.8469, + "step": 763 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001508555261612457, + "loss": 1.0353, + "step": 764 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001507275134065065, + "loss": 0.6269, + "step": 765 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015059938862204127, + "loss": 0.7825, + "step": 766 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001504711520908086, + "loss": 0.8388, + "step": 767 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015034280409601385, + "loss": 0.7383, + "step": 768 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015021434492110852, + "loss": 0.8029, + "step": 769 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015008577484978966, + "loss": 0.6527, + "step": 770 + }, + { + "epoch": 1.77, + "learning_rate": 
0.00014995709416599926, + "loss": 0.9434, + "step": 771 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014982830315392358, + "loss": 0.753, + "step": 772 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014969940209799248, + "loss": 0.8143, + "step": 773 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014957039128287892, + "loss": 0.8939, + "step": 774 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001494412709934982, + "loss": 0.9265, + "step": 775 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014931204151500747, + "loss": 0.8261, + "step": 776 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014918270313280495, + "loss": 0.8555, + "step": 777 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014905325613252937, + "loss": 0.8191, + "step": 778 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014892370080005936, + "loss": 0.9159, + "step": 779 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014879403742151283, + "loss": 0.7936, + "step": 780 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014866426628324625, + "loss": 0.8782, + "step": 781 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014853438767185412, + "loss": 0.6078, + "step": 782 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001484044018741682, + "loss": 0.7182, + "step": 783 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014827430917725712, + "loss": 0.7528, + "step": 784 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014814410986842543, + "loss": 0.902, + "step": 785 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014801380423521324, + "loss": 0.8765, + "step": 786 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014788339256539544, + "loss": 0.6332, + "step": 787 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014775287514698105, + "loss": 0.7258, + "step": 788 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014762225226821273, + "loss": 0.7754, + "step": 789 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014749152421756595, + "loss": 0.7039, + "step": 790 + }, + { + "epoch": 1.81, + 
"learning_rate": 0.0001473606912837485, + "loss": 0.8563, + "step": 791 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014722975375569978, + "loss": 0.8956, + "step": 792 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014709871192259026, + "loss": 0.8724, + "step": 793 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001469675660738206, + "loss": 0.8885, + "step": 794 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014683631649902132, + "loss": 0.7637, + "step": 795 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014670496348805195, + "loss": 0.7596, + "step": 796 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014657350733100047, + "loss": 0.8221, + "step": 797 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014644194831818266, + "loss": 0.8475, + "step": 798 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014631028674014142, + "loss": 0.7966, + "step": 799 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014617852288764625, + "loss": 0.9186, + "step": 800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014604665705169237, + "loss": 0.9027, + "step": 801 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001459146895235004, + "loss": 0.9357, + "step": 802 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014578262059451537, + "loss": 0.9202, + "step": 803 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014565045055640638, + "loss": 0.9226, + "step": 804 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001455181797010658, + "loss": 0.8416, + "step": 805 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001453858083206086, + "loss": 0.8192, + "step": 806 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001452533367073718, + "loss": 0.8309, + "step": 807 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014512076515391375, + "loss": 0.7646, + "step": 808 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014498809395301356, + "loss": 0.9335, + "step": 809 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014485532339767037, + "loss": 0.9696, + "step": 810 + }, + { + 
"epoch": 1.86, + "learning_rate": 0.00014472245378110277, + "loss": 0.7, + "step": 811 + }, + { + "epoch": 1.86, + "learning_rate": 0.000144589485396748, + "loss": 0.8206, + "step": 812 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001444564185382617, + "loss": 0.7417, + "step": 813 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014432325349951667, + "loss": 0.6384, + "step": 814 + }, + { + "epoch": 1.87, + "learning_rate": 0.00014418999057460276, + "loss": 0.7801, + "step": 815 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001440566300578259, + "loss": 0.8459, + "step": 816 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001439231722437075, + "loss": 0.8863, + "step": 817 + }, + { + "epoch": 1.87, + "learning_rate": 0.000143789617426984, + "loss": 0.8502, + "step": 818 + }, + { + "epoch": 1.88, + "learning_rate": 0.000143655965902606, + "loss": 0.8522, + "step": 819 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014352221796573757, + "loss": 0.8612, + "step": 820 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014338837391175582, + "loss": 0.8065, + "step": 821 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001432544340362501, + "loss": 0.8777, + "step": 822 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014312039863502145, + "loss": 0.7731, + "step": 823 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014298626800408166, + "loss": 0.8791, + "step": 824 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014285204243965306, + "loss": 0.9095, + "step": 825 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014271772223816757, + "loss": 0.8846, + "step": 826 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014258330769626606, + "loss": 0.701, + "step": 827 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014244879911079779, + "loss": 0.7598, + "step": 828 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014231419677881966, + "loss": 1.0411, + "step": 829 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014217950099759569, + "loss": 0.6915, + "step": 830 + }, + 
{ + "epoch": 1.9, + "learning_rate": 0.00014204471206459628, + "loss": 0.8048, + "step": 831 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001419098302774974, + "loss": 0.7688, + "step": 832 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014177485593418028, + "loss": 0.7863, + "step": 833 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001416397893327304, + "loss": 0.7627, + "step": 834 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014150463077143712, + "loss": 0.7423, + "step": 835 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014136938054879283, + "loss": 0.7236, + "step": 836 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014123403896349227, + "loss": 0.8978, + "step": 837 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014109860631443213, + "loss": 0.9403, + "step": 838 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014096308290071003, + "loss": 0.7267, + "step": 839 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014082746902162414, + "loss": 0.7905, + "step": 840 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014069176497667242, + "loss": 0.8848, + "step": 841 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014055597106555192, + "loss": 0.9057, + "step": 842 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014042008758815818, + "loss": 0.7363, + "step": 843 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014028411484458454, + "loss": 0.8193, + "step": 844 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014014805313512145, + "loss": 0.7387, + "step": 845 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014001190276025593, + "loss": 0.8871, + "step": 846 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001398756640206707, + "loss": 0.7342, + "step": 847 + }, + { + "epoch": 1.94, + "learning_rate": 0.00013973933721724363, + "loss": 0.8557, + "step": 848 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001396029226510472, + "loss": 0.8778, + "step": 849 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013946642062334766, + "loss": 0.7844, + 
"step": 850 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013932983143560433, + "loss": 0.7941, + "step": 851 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013919315538946905, + "loss": 0.7505, + "step": 852 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001390563927867856, + "loss": 0.8371, + "step": 853 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013891954392958878, + "loss": 0.8128, + "step": 854 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001387826091201039, + "loss": 0.7127, + "step": 855 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013864558866074622, + "loss": 0.8165, + "step": 856 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013850848285411994, + "loss": 0.7103, + "step": 857 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013837129200301794, + "loss": 0.8373, + "step": 858 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013823401641042084, + "loss": 0.6908, + "step": 859 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013809665637949637, + "loss": 0.7358, + "step": 860 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013795921221359877, + "loss": 0.7545, + "step": 861 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013782168421626816, + "loss": 0.7681, + "step": 862 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013768407269122967, + "loss": 1.026, + "step": 863 + }, + { + "epoch": 1.98, + "learning_rate": 0.000137546377942393, + "loss": 0.761, + "step": 864 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001374086002738516, + "loss": 0.8442, + "step": 865 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013727073998988202, + "loss": 0.7959, + "step": 866 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013713279739494333, + "loss": 0.8061, + "step": 867 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013699477279367636, + "loss": 0.7434, + "step": 868 + }, + { + "epoch": 1.99, + "learning_rate": 0.000136856666490903, + "loss": 0.7159, + "step": 869 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013671847879162562, + 
"loss": 0.867, + "step": 870 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013658021000102636, + "loss": 0.9237, + "step": 871 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001364418604244664, + "loss": 0.8545, + "step": 872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013630343036748535, + "loss": 0.893, + "step": 873 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013616492013580062, + "loss": 0.9858, + "step": 874 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001360263300353066, + "loss": 0.6643, + "step": 875 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001358876603720741, + "loss": 0.8081, + "step": 876 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013574891145234962, + "loss": 0.7287, + "step": 877 + }, + { + "epoch": 2.01, + "learning_rate": 0.00013561008358255468, + "loss": 0.8078, + "step": 878 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001354711770692853, + "loss": 0.6738, + "step": 879 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013533219221931102, + "loss": 0.7508, + "step": 880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001351931293395744, + "loss": 0.8724, + "step": 881 + }, + { + "epoch": 2.02, + "learning_rate": 0.0001350539887371904, + "loss": 0.9317, + "step": 882 + }, + { + "epoch": 2.02, + "learning_rate": 0.00013491477071944557, + "loss": 0.7664, + "step": 883 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013477547559379748, + "loss": 0.8065, + "step": 884 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013463610366787392, + "loss": 0.738, + "step": 885 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013449665524947234, + "loss": 0.7554, + "step": 886 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013435713064655912, + "loss": 0.7769, + "step": 887 + }, + { + "epoch": 2.03, + "learning_rate": 0.00013421753016726887, + "loss": 0.6507, + "step": 888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0001340778541199038, + "loss": 0.7293, + "step": 889 + }, + { + "epoch": 2.04, + "learning_rate": 
0.00013393810281293292, + "loss": 0.8305, + "step": 890 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013379827655499163, + "loss": 0.7553, + "step": 891 + }, + { + "epoch": 2.04, + "learning_rate": 0.00013365837565488064, + "loss": 0.7724, + "step": 892 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013351840042156565, + "loss": 0.7061, + "step": 893 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013337835116417648, + "loss": 0.7078, + "step": 894 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013323822819200643, + "loss": 0.8201, + "step": 895 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013309803181451156, + "loss": 0.746, + "step": 896 + }, + { + "epoch": 2.05, + "learning_rate": 0.00013295776234131015, + "loss": 0.8276, + "step": 897 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001328174200821817, + "loss": 0.7922, + "step": 898 + }, + { + "epoch": 2.06, + "learning_rate": 0.0001326770053470668, + "loss": 0.7577, + "step": 899 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013253651844606572, + "loss": 0.8217, + "step": 900 + }, + { + "epoch": 2.06, + "learning_rate": 0.00013239595968943832, + "loss": 0.7883, + "step": 901 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013225532938760317, + "loss": 0.9568, + "step": 902 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013211462785113666, + "loss": 0.7348, + "step": 903 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013197385539077275, + "loss": 0.7558, + "step": 904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00013183301231740183, + "loss": 0.7066, + "step": 905 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001316920989420703, + "loss": 0.7663, + "step": 906 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013155111557597985, + "loss": 0.79, + "step": 907 + }, + { + "epoch": 2.08, + "learning_rate": 0.00013141006253048672, + "loss": 0.8237, + "step": 908 + }, + { + "epoch": 2.08, + "learning_rate": 0.0001312689401171011, + "loss": 0.687, + "step": 909 + }, + { + "epoch": 2.08, + 
"learning_rate": 0.00013112774864748621, + "loss": 0.8254, + "step": 910 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001309864884334579, + "loss": 0.7641, + "step": 911 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001308451597869839, + "loss": 0.7845, + "step": 912 + }, + { + "epoch": 2.09, + "learning_rate": 0.00013070376302018287, + "loss": 0.8661, + "step": 913 + }, + { + "epoch": 2.09, + "learning_rate": 0.0001305622984453241, + "loss": 0.9001, + "step": 914 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013042076637482654, + "loss": 0.7261, + "step": 915 + }, + { + "epoch": 2.1, + "learning_rate": 0.00013027916712125826, + "loss": 0.7954, + "step": 916 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001301375009973356, + "loss": 0.792, + "step": 917 + }, + { + "epoch": 2.1, + "learning_rate": 0.00012999576831592273, + "loss": 0.8423, + "step": 918 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012985396939003065, + "loss": 0.8529, + "step": 919 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012971210453281674, + "loss": 0.9086, + "step": 920 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012957017405758401, + "loss": 0.7099, + "step": 921 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012942817827778038, + "loss": 0.7515, + "step": 922 + }, + { + "epoch": 2.11, + "learning_rate": 0.00012928611750699783, + "loss": 0.7972, + "step": 923 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001291439920589722, + "loss": 0.6615, + "step": 924 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012900180224758185, + "loss": 0.8229, + "step": 925 + }, + { + "epoch": 2.12, + "learning_rate": 0.00012885954838684743, + "loss": 0.8146, + "step": 926 + }, + { + "epoch": 2.12, + "learning_rate": 0.000128717230790931, + "loss": 0.8941, + "step": 927 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012857484977413545, + "loss": 0.7661, + "step": 928 + }, + { + "epoch": 2.13, + "learning_rate": 0.00012843240565090365, + "loss": 0.7404, + "step": 929 + }, + { + "epoch": 
2.13, + "learning_rate": 0.00012828989873581785, + "loss": 0.7971, + "step": 930 + }, + { + "epoch": 2.13, + "learning_rate": 0.000128147329343599, + "loss": 0.6813, + "step": 931 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012800469778910601, + "loss": 0.7704, + "step": 932 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001278620043873351, + "loss": 0.7751, + "step": 933 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012771924945341906, + "loss": 0.841, + "step": 934 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012757643330262657, + "loss": 0.858, + "step": 935 + }, + { + "epoch": 2.14, + "learning_rate": 0.00012743355625036143, + "loss": 0.6657, + "step": 936 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012729061861216213, + "loss": 0.7735, + "step": 937 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012714762070370077, + "loss": 0.8935, + "step": 938 + }, + { + "epoch": 2.15, + "learning_rate": 0.00012700456284078264, + "loss": 0.9684, + "step": 939 + }, + { + "epoch": 2.15, + "learning_rate": 0.0001268614453393454, + "loss": 0.9117, + "step": 940 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012671826851545851, + "loss": 0.7613, + "step": 941 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012657503268532236, + "loss": 0.9567, + "step": 942 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012643173816526764, + "loss": 0.8725, + "step": 943 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012628838527175464, + "loss": 0.8088, + "step": 944 + }, + { + "epoch": 2.16, + "learning_rate": 0.00012614497432137273, + "loss": 0.7655, + "step": 945 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012600150563083927, + "loss": 0.7585, + "step": 946 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001258579795169993, + "loss": 0.6351, + "step": 947 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001257143962968246, + "loss": 0.8408, + "step": 948 + }, + { + "epoch": 2.17, + "learning_rate": 0.00012557075628741307, + "loss": 0.7144, + "step": 949 + }, + 
{ + "epoch": 2.18, + "learning_rate": 0.00012542705980598813, + "loss": 0.7022, + "step": 950 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012528330716989769, + "loss": 0.8635, + "step": 951 + }, + { + "epoch": 2.18, + "learning_rate": 0.0001251394986966139, + "loss": 0.8489, + "step": 952 + }, + { + "epoch": 2.18, + "learning_rate": 0.00012499563470373212, + "loss": 0.7563, + "step": 953 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012485171550897037, + "loss": 0.9245, + "step": 954 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012470774143016853, + "loss": 0.9168, + "step": 955 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001245637127852877, + "loss": 0.803, + "step": 956 + }, + { + "epoch": 2.19, + "learning_rate": 0.00012441962989240952, + "loss": 0.722, + "step": 957 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001242754930697354, + "loss": 0.7944, + "step": 958 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012413130263558587, + "loss": 0.7759, + "step": 959 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012398705890839988, + "loss": 0.9407, + "step": 960 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012384276220673402, + "loss": 0.726, + "step": 961 + }, + { + "epoch": 2.2, + "learning_rate": 0.00012369841284926188, + "loss": 0.7817, + "step": 962 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012355401115477345, + "loss": 0.6845, + "step": 963 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012340955744217412, + "loss": 0.7638, + "step": 964 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001232650520304843, + "loss": 0.8104, + "step": 965 + }, + { + "epoch": 2.21, + "learning_rate": 0.00012312049523883852, + "loss": 0.8676, + "step": 966 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001229758873864848, + "loss": 0.7944, + "step": 967 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012283122879278393, + "loss": 0.8001, + "step": 968 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012268651977720866, + "loss": 0.7943, + "step": 
969 + }, + { + "epoch": 2.22, + "learning_rate": 0.0001225417606593433, + "loss": 0.9679, + "step": 970 + }, + { + "epoch": 2.22, + "learning_rate": 0.00012239695175888263, + "loss": 0.773, + "step": 971 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012225209339563145, + "loss": 0.7707, + "step": 972 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012210718588950376, + "loss": 0.6727, + "step": 973 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012196222956052214, + "loss": 0.7641, + "step": 974 + }, + { + "epoch": 2.23, + "learning_rate": 0.00012181722472881697, + "loss": 0.8506, + "step": 975 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012167217171462566, + "loss": 0.8442, + "step": 976 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012152707083829217, + "loss": 0.7853, + "step": 977 + }, + { + "epoch": 2.24, + "learning_rate": 0.00012138192242026614, + "loss": 0.7495, + "step": 978 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001212367267811021, + "loss": 0.739, + "step": 979 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012109148424145898, + "loss": 0.6531, + "step": 980 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012094619512209915, + "loss": 0.7721, + "step": 981 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012080085974388802, + "loss": 0.7346, + "step": 982 + }, + { + "epoch": 2.25, + "learning_rate": 0.0001206554784277931, + "loss": 0.8709, + "step": 983 + }, + { + "epoch": 2.25, + "learning_rate": 0.00012051005149488326, + "loss": 0.8111, + "step": 984 + }, + { + "epoch": 2.26, + "learning_rate": 0.0001203645792663282, + "loss": 0.8296, + "step": 985 + }, + { + "epoch": 2.26, + "learning_rate": 0.00012021906206339766, + "loss": 0.7569, + "step": 986 + }, + { + "epoch": 2.26, + "learning_rate": 0.00012007350020746068, + "loss": 0.7945, + "step": 987 + }, + { + "epoch": 2.26, + "learning_rate": 0.00011992789401998492, + "loss": 0.7818, + "step": 988 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011978224382253589, + "loss": 
0.59, + "step": 989 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011963654993677645, + "loss": 0.828, + "step": 990 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011949081268446571, + "loss": 0.7583, + "step": 991 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011934503238745878, + "loss": 0.7453, + "step": 992 + }, + { + "epoch": 2.27, + "learning_rate": 0.00011919920936770568, + "loss": 0.826, + "step": 993 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011905334394725085, + "loss": 0.7673, + "step": 994 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011890743644823242, + "loss": 0.9637, + "step": 995 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011876148719288128, + "loss": 0.702, + "step": 996 + }, + { + "epoch": 2.28, + "learning_rate": 0.00011861549650352069, + "loss": 0.856, + "step": 997 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011846946470256538, + "loss": 0.725, + "step": 998 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011832339211252084, + "loss": 0.7615, + "step": 999 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011817727905598268, + "loss": 0.7691, + "step": 1000 + }, + { + "epoch": 2.29, + "learning_rate": 0.00011803112585563587, + "loss": 0.8347, + "step": 1001 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011788493283425397, + "loss": 0.908, + "step": 1002 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011773870031469862, + "loss": 0.8724, + "step": 1003 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011759242861991855, + "loss": 0.8801, + "step": 1004 + }, + { + "epoch": 2.3, + "learning_rate": 0.0001174461180729491, + "loss": 0.861, + "step": 1005 + }, + { + "epoch": 2.3, + "learning_rate": 0.00011729976899691137, + "loss": 0.8878, + "step": 1006 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011715338171501156, + "loss": 0.7662, + "step": 1007 + }, + { + "epoch": 2.31, + "learning_rate": 0.00011700695655054026, + "loss": 0.7814, + "step": 1008 + }, + { + "epoch": 2.31, + "learning_rate": 
0.00011686049382687168, + "loss": 0.8727, + "step": 1009 + }, + { + "epoch": 2.31, + "learning_rate": 0.000116713993867463, + "loss": 0.8036, + "step": 1010 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011656745699585371, + "loss": 0.957, + "step": 1011 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011642088353566469, + "loss": 0.9257, + "step": 1012 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011627427381059772, + "loss": 0.7994, + "step": 1013 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011612762814443459, + "loss": 0.6582, + "step": 1014 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011598094686103653, + "loss": 0.7195, + "step": 1015 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011583423028434344, + "loss": 0.6673, + "step": 1016 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011568747873837307, + "loss": 0.8075, + "step": 1017 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011554069254722051, + "loss": 0.8945, + "step": 1018 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011539387203505727, + "loss": 0.6828, + "step": 1019 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011524701752613074, + "loss": 0.7014, + "step": 1020 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011510012934476338, + "loss": 0.8388, + "step": 1021 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011495320781535186, + "loss": 0.685, + "step": 1022 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011480625326236677, + "loss": 0.7141, + "step": 1023 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011465926601035137, + "loss": 0.8078, + "step": 1024 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011451224638392129, + "loss": 0.7924, + "step": 1025 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011436519470776362, + "loss": 0.9223, + "step": 1026 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011421811130663623, + "loss": 0.8251, + "step": 1027 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011407099650536706, + "loss": 0.9127, + "step": 1028 + }, + { 
+ "epoch": 2.36, + "learning_rate": 0.00011392385062885334, + "loss": 0.7634, + "step": 1029 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011377667400206101, + "loss": 0.7472, + "step": 1030 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011362946695002383, + "loss": 0.7838, + "step": 1031 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011348222979784289, + "loss": 0.9502, + "step": 1032 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011333496287068563, + "loss": 0.7066, + "step": 1033 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011318766649378532, + "loss": 0.9988, + "step": 1034 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011304034099244014, + "loss": 0.9448, + "step": 1035 + }, + { + "epoch": 2.37, + "learning_rate": 0.00011289298669201282, + "loss": 0.7764, + "step": 1036 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011274560391792948, + "loss": 0.7351, + "step": 1037 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011259819299567922, + "loss": 0.895, + "step": 1038 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011245075425081328, + "loss": 0.718, + "step": 1039 + }, + { + "epoch": 2.38, + "learning_rate": 0.00011230328800894437, + "loss": 0.7811, + "step": 1040 + }, + { + "epoch": 2.38, + "learning_rate": 0.0001121557945957459, + "loss": 0.7859, + "step": 1041 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011200827433695127, + "loss": 0.7916, + "step": 1042 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011186072755835322, + "loss": 0.8321, + "step": 1043 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011171315458580303, + "loss": 0.7648, + "step": 1044 + }, + { + "epoch": 2.39, + "learning_rate": 0.00011156555574520981, + "loss": 0.7691, + "step": 1045 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011141793136253986, + "loss": 0.6978, + "step": 1046 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011127028176381578, + "loss": 0.6725, + "step": 1047 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011112260727511596, + 
"loss": 0.8165, + "step": 1048 + }, + { + "epoch": 2.4, + "learning_rate": 0.00011097490822257377, + "loss": 0.8662, + "step": 1049 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011082718493237669, + "loss": 0.8784, + "step": 1050 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011067943773076586, + "loss": 0.8533, + "step": 1051 + }, + { + "epoch": 2.41, + "learning_rate": 0.00011053166694403521, + "loss": 0.6602, + "step": 1052 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001103838728985307, + "loss": 0.8363, + "step": 1053 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001102360559206497, + "loss": 0.8044, + "step": 1054 + }, + { + "epoch": 2.42, + "learning_rate": 0.00011008821633684019, + "loss": 0.8684, + "step": 1055 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010994035447360018, + "loss": 0.7158, + "step": 1056 + }, + { + "epoch": 2.42, + "learning_rate": 0.0001097924706574767, + "loss": 0.7729, + "step": 1057 + }, + { + "epoch": 2.42, + "learning_rate": 0.00010964456521506545, + "loss": 0.685, + "step": 1058 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010949663847300976, + "loss": 0.8647, + "step": 1059 + }, + { + "epoch": 2.43, + "learning_rate": 0.000109348690758, + "loss": 0.836, + "step": 1060 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010920072239677301, + "loss": 0.8494, + "step": 1061 + }, + { + "epoch": 2.43, + "learning_rate": 0.00010905273371611105, + "loss": 0.9494, + "step": 1062 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010890472504284133, + "loss": 0.7832, + "step": 1063 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010875669670383521, + "loss": 0.7709, + "step": 1064 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010860864902600747, + "loss": 0.8175, + "step": 1065 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010846058233631565, + "loss": 0.8179, + "step": 1066 + }, + { + "epoch": 2.44, + "learning_rate": 0.00010831249696175918, + "loss": 0.7686, + "step": 1067 + }, + { + "epoch": 2.45, + 
"learning_rate": 0.00010816439322937879, + "loss": 0.8491, + "step": 1068 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010801627146625588, + "loss": 0.7961, + "step": 1069 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010786813199951145, + "loss": 0.8408, + "step": 1070 + }, + { + "epoch": 2.45, + "learning_rate": 0.00010771997515630574, + "loss": 0.8916, + "step": 1071 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010757180126383735, + "loss": 0.8035, + "step": 1072 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001074236106493425, + "loss": 0.9132, + "step": 1073 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001072754036400944, + "loss": 0.8029, + "step": 1074 + }, + { + "epoch": 2.46, + "learning_rate": 0.00010712718056340236, + "loss": 0.6981, + "step": 1075 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010697894174661127, + "loss": 0.7829, + "step": 1076 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010683068751710075, + "loss": 0.7699, + "step": 1077 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010668241820228444, + "loss": 0.7342, + "step": 1078 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010653413412960935, + "loss": 0.7729, + "step": 1079 + }, + { + "epoch": 2.47, + "learning_rate": 0.00010638583562655498, + "loss": 0.9097, + "step": 1080 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010623752302063283, + "loss": 0.8692, + "step": 1081 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010608919663938549, + "loss": 0.8861, + "step": 1082 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010594085681038588, + "loss": 0.7454, + "step": 1083 + }, + { + "epoch": 2.48, + "learning_rate": 0.00010579250386123676, + "loss": 0.8291, + "step": 1084 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001056441381195698, + "loss": 0.7643, + "step": 1085 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010549575991304492, + "loss": 0.8242, + "step": 1086 + }, + { + "epoch": 2.49, + "learning_rate": 0.0001053473695693496, + "loss": 0.9521, + 
"step": 1087 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010519896741619803, + "loss": 0.8142, + "step": 1088 + }, + { + "epoch": 2.49, + "learning_rate": 0.00010505055378133067, + "loss": 0.7955, + "step": 1089 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010490212899251309, + "loss": 0.7363, + "step": 1090 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010475369337753569, + "loss": 0.8173, + "step": 1091 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010460524726421275, + "loss": 0.7659, + "step": 1092 + }, + { + "epoch": 2.5, + "learning_rate": 0.00010445679098038157, + "loss": 0.8618, + "step": 1093 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010430832485390217, + "loss": 0.7606, + "step": 1094 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010415984921265609, + "loss": 0.8721, + "step": 1095 + }, + { + "epoch": 2.51, + "learning_rate": 0.00010401136438454599, + "loss": 0.8152, + "step": 1096 + }, + { + "epoch": 2.51, + "learning_rate": 0.0001038628706974948, + "loss": 0.8934, + "step": 1097 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010371436847944503, + "loss": 0.8385, + "step": 1098 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010356585805835797, + "loss": 0.8581, + "step": 1099 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010341733976221313, + "loss": 0.788, + "step": 1100 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010326881391900724, + "loss": 0.7872, + "step": 1101 + }, + { + "epoch": 2.52, + "learning_rate": 0.00010312028085675391, + "loss": 0.819, + "step": 1102 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010297174090348255, + "loss": 0.854, + "step": 1103 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010282319438723782, + "loss": 0.7121, + "step": 1104 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010267464163607889, + "loss": 0.8977, + "step": 1105 + }, + { + "epoch": 2.53, + "learning_rate": 0.00010252608297807871, + "loss": 0.8411, + "step": 1106 + }, + { + "epoch": 2.54, + "learning_rate": 
0.00010237751874132322, + "loss": 0.834, + "step": 1107 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010222894925391073, + "loss": 0.7582, + "step": 1108 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010208037484395114, + "loss": 0.7773, + "step": 1109 + }, + { + "epoch": 2.54, + "learning_rate": 0.00010193179583956523, + "loss": 0.7294, + "step": 1110 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010178321256888385, + "loss": 0.89, + "step": 1111 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010163462536004742, + "loss": 0.7675, + "step": 1112 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010148603454120487, + "loss": 0.7291, + "step": 1113 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010133744044051328, + "loss": 0.8403, + "step": 1114 + }, + { + "epoch": 2.55, + "learning_rate": 0.00010118884338613688, + "loss": 0.8955, + "step": 1115 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010104024370624644, + "loss": 0.7537, + "step": 1116 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010089164172901851, + "loss": 0.8734, + "step": 1117 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010074303778263474, + "loss": 0.7312, + "step": 1118 + }, + { + "epoch": 2.56, + "learning_rate": 0.00010059443219528117, + "loss": 0.7906, + "step": 1119 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010044582529514739, + "loss": 0.7756, + "step": 1120 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010029721741042586, + "loss": 0.9158, + "step": 1121 + }, + { + "epoch": 2.57, + "learning_rate": 0.00010014860886931139, + "loss": 0.8481, + "step": 1122 + }, + { + "epoch": 2.57, + "learning_rate": 0.0001, + "loss": 0.8187, + "step": 1123 + }, + { + "epoch": 2.58, + "learning_rate": 9.985139113068865e-05, + "loss": 0.8507, + "step": 1124 + }, + { + "epoch": 2.58, + "learning_rate": 9.970278258957415e-05, + "loss": 0.7585, + "step": 1125 + }, + { + "epoch": 2.58, + "learning_rate": 9.955417470485265e-05, + "loss": 0.7163, + "step": 1126 + }, + { + "epoch": 2.58, 
+ "learning_rate": 9.940556780471885e-05, + "loss": 0.8124, + "step": 1127 + }, + { + "epoch": 2.58, + "learning_rate": 9.925696221736525e-05, + "loss": 0.924, + "step": 1128 + }, + { + "epoch": 2.59, + "learning_rate": 9.91083582709815e-05, + "loss": 0.843, + "step": 1129 + }, + { + "epoch": 2.59, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8461, + "step": 1130 + }, + { + "epoch": 2.59, + "learning_rate": 9.881115661386314e-05, + "loss": 0.757, + "step": 1131 + }, + { + "epoch": 2.59, + "learning_rate": 9.866255955948676e-05, + "loss": 0.7779, + "step": 1132 + }, + { + "epoch": 2.6, + "learning_rate": 9.851396545879516e-05, + "loss": 0.8325, + "step": 1133 + }, + { + "epoch": 2.6, + "learning_rate": 9.836537463995262e-05, + "loss": 0.7117, + "step": 1134 + }, + { + "epoch": 2.6, + "learning_rate": 9.821678743111618e-05, + "loss": 0.7209, + "step": 1135 + }, + { + "epoch": 2.6, + "learning_rate": 9.806820416043478e-05, + "loss": 0.6621, + "step": 1136 + }, + { + "epoch": 2.6, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7836, + "step": 1137 + }, + { + "epoch": 2.61, + "learning_rate": 9.777105074608928e-05, + "loss": 0.8576, + "step": 1138 + }, + { + "epoch": 2.61, + "learning_rate": 9.762248125867678e-05, + "loss": 0.6352, + "step": 1139 + }, + { + "epoch": 2.61, + "learning_rate": 9.747391702192132e-05, + "loss": 0.7828, + "step": 1140 + }, + { + "epoch": 2.61, + "learning_rate": 9.732535836392113e-05, + "loss": 0.6583, + "step": 1141 + }, + { + "epoch": 2.62, + "learning_rate": 9.717680561276219e-05, + "loss": 0.9171, + "step": 1142 + }, + { + "epoch": 2.62, + "learning_rate": 9.702825909651748e-05, + "loss": 0.8694, + "step": 1143 + }, + { + "epoch": 2.62, + "learning_rate": 9.687971914324607e-05, + "loss": 0.9293, + "step": 1144 + }, + { + "epoch": 2.62, + "learning_rate": 9.673118608099276e-05, + "loss": 0.7273, + "step": 1145 + }, + { + "epoch": 2.63, + "learning_rate": 9.658266023778689e-05, + "loss": 0.8386, + "step": 1146 + }, + { + 
"epoch": 2.63, + "learning_rate": 9.643414194164204e-05, + "loss": 0.727, + "step": 1147 + }, + { + "epoch": 2.63, + "learning_rate": 9.628563152055498e-05, + "loss": 0.9991, + "step": 1148 + }, + { + "epoch": 2.63, + "learning_rate": 9.61371293025052e-05, + "loss": 0.7304, + "step": 1149 + }, + { + "epoch": 2.63, + "learning_rate": 9.598863561545404e-05, + "loss": 0.8146, + "step": 1150 + }, + { + "epoch": 2.64, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8178, + "step": 1151 + }, + { + "epoch": 2.64, + "learning_rate": 9.569167514609786e-05, + "loss": 0.7202, + "step": 1152 + }, + { + "epoch": 2.64, + "learning_rate": 9.554320901961843e-05, + "loss": 0.728, + "step": 1153 + }, + { + "epoch": 2.64, + "learning_rate": 9.539475273578729e-05, + "loss": 0.7842, + "step": 1154 + }, + { + "epoch": 2.65, + "learning_rate": 9.524630662246432e-05, + "loss": 0.7706, + "step": 1155 + }, + { + "epoch": 2.65, + "learning_rate": 9.509787100748692e-05, + "loss": 0.802, + "step": 1156 + }, + { + "epoch": 2.65, + "learning_rate": 9.494944621866937e-05, + "loss": 0.9293, + "step": 1157 + }, + { + "epoch": 2.65, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8051, + "step": 1158 + }, + { + "epoch": 2.66, + "learning_rate": 9.465263043065045e-05, + "loss": 0.7449, + "step": 1159 + }, + { + "epoch": 2.66, + "learning_rate": 9.450424008695509e-05, + "loss": 0.7289, + "step": 1160 + }, + { + "epoch": 2.66, + "learning_rate": 9.43558618804302e-05, + "loss": 0.6778, + "step": 1161 + }, + { + "epoch": 2.66, + "learning_rate": 9.420749613876325e-05, + "loss": 0.7731, + "step": 1162 + }, + { + "epoch": 2.66, + "learning_rate": 9.405914318961414e-05, + "loss": 0.6934, + "step": 1163 + }, + { + "epoch": 2.67, + "learning_rate": 9.391080336061454e-05, + "loss": 0.9045, + "step": 1164 + }, + { + "epoch": 2.67, + "learning_rate": 9.376247697936719e-05, + "loss": 0.8016, + "step": 1165 + }, + { + "epoch": 2.67, + "learning_rate": 9.361416437344503e-05, + "loss": 0.6214, + "step": 
1166 + }, + { + "epoch": 2.67, + "learning_rate": 9.34658658703907e-05, + "loss": 0.6771, + "step": 1167 + }, + { + "epoch": 2.68, + "learning_rate": 9.331758179771561e-05, + "loss": 0.748, + "step": 1168 + }, + { + "epoch": 2.68, + "learning_rate": 9.316931248289926e-05, + "loss": 0.665, + "step": 1169 + }, + { + "epoch": 2.68, + "learning_rate": 9.302105825338876e-05, + "loss": 0.901, + "step": 1170 + }, + { + "epoch": 2.68, + "learning_rate": 9.287281943659767e-05, + "loss": 0.8342, + "step": 1171 + }, + { + "epoch": 2.68, + "learning_rate": 9.272459635990562e-05, + "loss": 0.853, + "step": 1172 + }, + { + "epoch": 2.69, + "learning_rate": 9.257638935065753e-05, + "loss": 0.8093, + "step": 1173 + }, + { + "epoch": 2.69, + "learning_rate": 9.242819873616268e-05, + "loss": 0.8451, + "step": 1174 + }, + { + "epoch": 2.69, + "learning_rate": 9.228002484369429e-05, + "loss": 0.8628, + "step": 1175 + }, + { + "epoch": 2.69, + "learning_rate": 9.213186800048861e-05, + "loss": 0.7858, + "step": 1176 + }, + { + "epoch": 2.7, + "learning_rate": 9.198372853374415e-05, + "loss": 0.9236, + "step": 1177 + }, + { + "epoch": 2.7, + "learning_rate": 9.183560677062119e-05, + "loss": 0.7925, + "step": 1178 + }, + { + "epoch": 2.7, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7105, + "step": 1179 + }, + { + "epoch": 2.7, + "learning_rate": 9.153941766368439e-05, + "loss": 0.7521, + "step": 1180 + }, + { + "epoch": 2.71, + "learning_rate": 9.139135097399254e-05, + "loss": 0.8648, + "step": 1181 + }, + { + "epoch": 2.71, + "learning_rate": 9.124330329616482e-05, + "loss": 0.8409, + "step": 1182 + }, + { + "epoch": 2.71, + "learning_rate": 9.109527495715872e-05, + "loss": 0.7198, + "step": 1183 + }, + { + "epoch": 2.71, + "learning_rate": 9.094726628388899e-05, + "loss": 0.7365, + "step": 1184 + }, + { + "epoch": 2.71, + "learning_rate": 9.0799277603227e-05, + "loss": 0.7699, + "step": 1185 + }, + { + "epoch": 2.72, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8041, 
+ "step": 1186 + }, + { + "epoch": 2.72, + "learning_rate": 9.050336152699025e-05, + "loss": 0.8308, + "step": 1187 + }, + { + "epoch": 2.72, + "learning_rate": 9.035543478493458e-05, + "loss": 0.8139, + "step": 1188 + }, + { + "epoch": 2.72, + "learning_rate": 9.02075293425233e-05, + "loss": 0.7394, + "step": 1189 + }, + { + "epoch": 2.73, + "learning_rate": 9.005964552639984e-05, + "loss": 0.6738, + "step": 1190 + }, + { + "epoch": 2.73, + "learning_rate": 8.991178366315982e-05, + "loss": 0.9421, + "step": 1191 + }, + { + "epoch": 2.73, + "learning_rate": 8.976394407935034e-05, + "loss": 0.8747, + "step": 1192 + }, + { + "epoch": 2.73, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8282, + "step": 1193 + }, + { + "epoch": 2.74, + "learning_rate": 8.94683330559648e-05, + "loss": 0.765, + "step": 1194 + }, + { + "epoch": 2.74, + "learning_rate": 8.932056226923416e-05, + "loss": 0.8515, + "step": 1195 + }, + { + "epoch": 2.74, + "learning_rate": 8.917281506762335e-05, + "loss": 0.6194, + "step": 1196 + }, + { + "epoch": 2.74, + "learning_rate": 8.902509177742626e-05, + "loss": 0.8852, + "step": 1197 + }, + { + "epoch": 2.74, + "learning_rate": 8.887739272488406e-05, + "loss": 0.7481, + "step": 1198 + }, + { + "epoch": 2.75, + "learning_rate": 8.872971823618424e-05, + "loss": 0.7979, + "step": 1199 + }, + { + "epoch": 2.75, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8332, + "step": 1200 + }, + { + "epoch": 2.75, + "learning_rate": 8.843444425479022e-05, + "loss": 0.6716, + "step": 1201 + }, + { + "epoch": 2.75, + "learning_rate": 8.828684541419696e-05, + "loss": 0.9192, + "step": 1202 + }, + { + "epoch": 2.76, + "learning_rate": 8.813927244164679e-05, + "loss": 0.8463, + "step": 1203 + }, + { + "epoch": 2.76, + "learning_rate": 8.799172566304874e-05, + "loss": 0.6598, + "step": 1204 + }, + { + "epoch": 2.76, + "learning_rate": 8.784420540425412e-05, + "loss": 0.7823, + "step": 1205 + }, + { + "epoch": 2.76, + "learning_rate": 8.769671199105565e-05, 
+ "loss": 0.8728, + "step": 1206 + }, + { + "epoch": 2.77, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7665, + "step": 1207 + }, + { + "epoch": 2.77, + "learning_rate": 8.74018070043208e-05, + "loss": 0.8008, + "step": 1208 + }, + { + "epoch": 2.77, + "learning_rate": 8.725439608207056e-05, + "loss": 0.6833, + "step": 1209 + }, + { + "epoch": 2.77, + "learning_rate": 8.710701330798719e-05, + "loss": 0.7801, + "step": 1210 + }, + { + "epoch": 2.77, + "learning_rate": 8.695965900755985e-05, + "loss": 0.6308, + "step": 1211 + }, + { + "epoch": 2.78, + "learning_rate": 8.68123335062147e-05, + "loss": 0.7851, + "step": 1212 + }, + { + "epoch": 2.78, + "learning_rate": 8.666503712931439e-05, + "loss": 0.7592, + "step": 1213 + }, + { + "epoch": 2.78, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8727, + "step": 1214 + }, + { + "epoch": 2.78, + "learning_rate": 8.637053304997618e-05, + "loss": 0.903, + "step": 1215 + }, + { + "epoch": 2.79, + "learning_rate": 8.622332599793906e-05, + "loss": 0.8076, + "step": 1216 + }, + { + "epoch": 2.79, + "learning_rate": 8.607614937114671e-05, + "loss": 0.8975, + "step": 1217 + }, + { + "epoch": 2.79, + "learning_rate": 8.592900349463297e-05, + "loss": 0.8249, + "step": 1218 + }, + { + "epoch": 2.79, + "learning_rate": 8.578188869336377e-05, + "loss": 0.8529, + "step": 1219 + }, + { + "epoch": 2.79, + "learning_rate": 8.563480529223638e-05, + "loss": 0.8351, + "step": 1220 + }, + { + "epoch": 2.8, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8934, + "step": 1221 + }, + { + "epoch": 2.8, + "learning_rate": 8.534073398964866e-05, + "loss": 0.8067, + "step": 1222 + }, + { + "epoch": 2.8, + "learning_rate": 8.519374673763326e-05, + "loss": 0.8508, + "step": 1223 + }, + { + "epoch": 2.8, + "learning_rate": 8.504679218464816e-05, + "loss": 0.7419, + "step": 1224 + }, + { + "epoch": 2.81, + "learning_rate": 8.489987065523668e-05, + "loss": 0.7808, + "step": 1225 + }, + { + "epoch": 2.81, + "learning_rate": 
8.475298247386927e-05, + "loss": 0.8603, + "step": 1226 + }, + { + "epoch": 2.81, + "learning_rate": 8.460612796494272e-05, + "loss": 0.8818, + "step": 1227 + }, + { + "epoch": 2.81, + "learning_rate": 8.445930745277953e-05, + "loss": 0.779, + "step": 1228 + }, + { + "epoch": 2.82, + "learning_rate": 8.431252126162695e-05, + "loss": 0.766, + "step": 1229 + }, + { + "epoch": 2.82, + "learning_rate": 8.41657697156566e-05, + "loss": 0.8743, + "step": 1230 + }, + { + "epoch": 2.82, + "learning_rate": 8.40190531389635e-05, + "loss": 0.882, + "step": 1231 + }, + { + "epoch": 2.82, + "learning_rate": 8.387237185556545e-05, + "loss": 0.7422, + "step": 1232 + }, + { + "epoch": 2.82, + "learning_rate": 8.372572618940231e-05, + "loss": 0.9271, + "step": 1233 + }, + { + "epoch": 2.83, + "learning_rate": 8.357911646433535e-05, + "loss": 0.8051, + "step": 1234 + }, + { + "epoch": 2.83, + "learning_rate": 8.343254300414628e-05, + "loss": 0.782, + "step": 1235 + }, + { + "epoch": 2.83, + "learning_rate": 8.3286006132537e-05, + "loss": 0.8754, + "step": 1236 + }, + { + "epoch": 2.83, + "learning_rate": 8.313950617312835e-05, + "loss": 0.8249, + "step": 1237 + }, + { + "epoch": 2.84, + "learning_rate": 8.299304344945977e-05, + "loss": 0.8342, + "step": 1238 + }, + { + "epoch": 2.84, + "learning_rate": 8.284661828498847e-05, + "loss": 0.8593, + "step": 1239 + }, + { + "epoch": 2.84, + "learning_rate": 8.270023100308865e-05, + "loss": 0.7507, + "step": 1240 + }, + { + "epoch": 2.84, + "learning_rate": 8.255388192705093e-05, + "loss": 0.8462, + "step": 1241 + }, + { + "epoch": 2.85, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8322, + "step": 1242 + }, + { + "epoch": 2.85, + "learning_rate": 8.22612996853014e-05, + "loss": 0.8963, + "step": 1243 + }, + { + "epoch": 2.85, + "learning_rate": 8.211506716574602e-05, + "loss": 0.7419, + "step": 1244 + }, + { + "epoch": 2.85, + "learning_rate": 8.196887414436416e-05, + "loss": 0.8225, + "step": 1245 + }, + { + "epoch": 2.85, + 
"learning_rate": 8.182272094401735e-05, + "loss": 0.8539, + "step": 1246 + }, + { + "epoch": 2.86, + "learning_rate": 8.167660788747919e-05, + "loss": 0.7852, + "step": 1247 + }, + { + "epoch": 2.86, + "learning_rate": 8.153053529743465e-05, + "loss": 0.9128, + "step": 1248 + }, + { + "epoch": 2.86, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7328, + "step": 1249 + }, + { + "epoch": 2.86, + "learning_rate": 8.123851280711877e-05, + "loss": 0.8816, + "step": 1250 + }, + { + "epoch": 2.87, + "learning_rate": 8.10925635517676e-05, + "loss": 0.7267, + "step": 1251 + }, + { + "epoch": 2.87, + "learning_rate": 8.094665605274913e-05, + "loss": 0.7362, + "step": 1252 + }, + { + "epoch": 2.87, + "learning_rate": 8.080079063229432e-05, + "loss": 0.7475, + "step": 1253 + }, + { + "epoch": 2.87, + "learning_rate": 8.065496761254126e-05, + "loss": 0.7727, + "step": 1254 + }, + { + "epoch": 2.88, + "learning_rate": 8.050918731553431e-05, + "loss": 0.746, + "step": 1255 + }, + { + "epoch": 2.88, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8132, + "step": 1256 + }, + { + "epoch": 2.88, + "learning_rate": 8.021775617746412e-05, + "loss": 0.6752, + "step": 1257 + }, + { + "epoch": 2.88, + "learning_rate": 8.007210598001512e-05, + "loss": 0.7468, + "step": 1258 + }, + { + "epoch": 2.88, + "learning_rate": 7.992649979253934e-05, + "loss": 0.9141, + "step": 1259 + }, + { + "epoch": 2.89, + "learning_rate": 7.978093793660233e-05, + "loss": 0.7706, + "step": 1260 + }, + { + "epoch": 2.89, + "learning_rate": 7.963542073367181e-05, + "loss": 0.8399, + "step": 1261 + }, + { + "epoch": 2.89, + "learning_rate": 7.948994850511677e-05, + "loss": 0.834, + "step": 1262 + }, + { + "epoch": 2.89, + "learning_rate": 7.934452157220694e-05, + "loss": 0.767, + "step": 1263 + }, + { + "epoch": 2.9, + "learning_rate": 7.9199140256112e-05, + "loss": 0.75, + "step": 1264 + }, + { + "epoch": 2.9, + "learning_rate": 7.905380487790088e-05, + "loss": 0.81, + "step": 1265 + }, + { + "epoch": 
2.9, + "learning_rate": 7.890851575854108e-05, + "loss": 0.8931, + "step": 1266 + }, + { + "epoch": 2.9, + "learning_rate": 7.876327321889795e-05, + "loss": 0.8929, + "step": 1267 + }, + { + "epoch": 2.9, + "learning_rate": 7.861807757973387e-05, + "loss": 0.787, + "step": 1268 + }, + { + "epoch": 2.91, + "learning_rate": 7.847292916170784e-05, + "loss": 0.8072, + "step": 1269 + }, + { + "epoch": 2.91, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8121, + "step": 1270 + }, + { + "epoch": 2.91, + "learning_rate": 7.818277527118307e-05, + "loss": 0.7951, + "step": 1271 + }, + { + "epoch": 2.91, + "learning_rate": 7.803777043947789e-05, + "loss": 0.7093, + "step": 1272 + }, + { + "epoch": 2.92, + "learning_rate": 7.789281411049625e-05, + "loss": 0.7827, + "step": 1273 + }, + { + "epoch": 2.92, + "learning_rate": 7.774790660436858e-05, + "loss": 0.7433, + "step": 1274 + }, + { + "epoch": 2.92, + "learning_rate": 7.760304824111741e-05, + "loss": 0.7359, + "step": 1275 + }, + { + "epoch": 2.92, + "learning_rate": 7.745823934065671e-05, + "loss": 0.7157, + "step": 1276 + }, + { + "epoch": 2.93, + "learning_rate": 7.731348022279134e-05, + "loss": 0.961, + "step": 1277 + }, + { + "epoch": 2.93, + "learning_rate": 7.716877120721611e-05, + "loss": 0.7718, + "step": 1278 + }, + { + "epoch": 2.93, + "learning_rate": 7.702411261351523e-05, + "loss": 0.835, + "step": 1279 + }, + { + "epoch": 2.93, + "learning_rate": 7.68795047611615e-05, + "loss": 0.9129, + "step": 1280 + }, + { + "epoch": 2.93, + "learning_rate": 7.673494796951573e-05, + "loss": 0.7635, + "step": 1281 + }, + { + "epoch": 2.94, + "learning_rate": 7.659044255782593e-05, + "loss": 0.6873, + "step": 1282 + }, + { + "epoch": 2.94, + "learning_rate": 7.644598884522659e-05, + "loss": 0.6434, + "step": 1283 + }, + { + "epoch": 2.94, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8408, + "step": 1284 + }, + { + "epoch": 2.94, + "learning_rate": 7.615723779326599e-05, + "loss": 0.9042, + "step": 1285 + }, + 
{ + "epoch": 2.95, + "learning_rate": 7.601294109160012e-05, + "loss": 0.7996, + "step": 1286 + }, + { + "epoch": 2.95, + "learning_rate": 7.586869736441413e-05, + "loss": 0.923, + "step": 1287 + }, + { + "epoch": 2.95, + "learning_rate": 7.572450693026462e-05, + "loss": 0.7661, + "step": 1288 + }, + { + "epoch": 2.95, + "learning_rate": 7.55803701075905e-05, + "loss": 0.9105, + "step": 1289 + }, + { + "epoch": 2.96, + "learning_rate": 7.543628721471233e-05, + "loss": 0.8071, + "step": 1290 + }, + { + "epoch": 2.96, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8234, + "step": 1291 + }, + { + "epoch": 2.96, + "learning_rate": 7.514828449102966e-05, + "loss": 0.8131, + "step": 1292 + }, + { + "epoch": 2.96, + "learning_rate": 7.500436529626786e-05, + "loss": 0.8149, + "step": 1293 + }, + { + "epoch": 2.96, + "learning_rate": 7.486050130338612e-05, + "loss": 0.8441, + "step": 1294 + }, + { + "epoch": 2.97, + "learning_rate": 7.471669283010232e-05, + "loss": 0.8269, + "step": 1295 + }, + { + "epoch": 2.97, + "learning_rate": 7.457294019401191e-05, + "loss": 0.632, + "step": 1296 + }, + { + "epoch": 2.97, + "learning_rate": 7.442924371258694e-05, + "loss": 0.8522, + "step": 1297 + }, + { + "epoch": 2.97, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8387, + "step": 1298 + }, + { + "epoch": 2.98, + "learning_rate": 7.414202048300072e-05, + "loss": 0.887, + "step": 1299 + }, + { + "epoch": 2.98, + "learning_rate": 7.399849436916077e-05, + "loss": 0.8273, + "step": 1300 + }, + { + "epoch": 2.98, + "learning_rate": 7.385502567862728e-05, + "loss": 0.7807, + "step": 1301 + }, + { + "epoch": 2.98, + "learning_rate": 7.371161472824536e-05, + "loss": 0.9077, + "step": 1302 + }, + { + "epoch": 2.99, + "learning_rate": 7.35682618347324e-05, + "loss": 0.8779, + "step": 1303 + }, + { + "epoch": 2.99, + "learning_rate": 7.342496731467767e-05, + "loss": 0.8595, + "step": 1304 + }, + { + "epoch": 2.99, + "learning_rate": 7.328173148454151e-05, + "loss": 0.8391, + 
"step": 1305 + }, + { + "epoch": 2.99, + "learning_rate": 7.31385546606546e-05, + "loss": 0.7559, + "step": 1306 + }, + { + "epoch": 2.99, + "learning_rate": 7.29954371592174e-05, + "loss": 0.8926, + "step": 1307 + }, + { + "epoch": 3.0, + "learning_rate": 7.285237929629928e-05, + "loss": 0.8443, + "step": 1308 + }, + { + "epoch": 3.0, + "learning_rate": 7.27093813878379e-05, + "loss": 0.7854, + "step": 1309 + }, + { + "epoch": 3.0, + "learning_rate": 7.256644374963857e-05, + "loss": 0.9361, + "step": 1310 + }, + { + "epoch": 3.0, + "learning_rate": 7.242356669737344e-05, + "loss": 0.7515, + "step": 1311 + }, + { + "epoch": 3.01, + "learning_rate": 7.228075054658096e-05, + "loss": 0.5228, + "step": 1312 + }, + { + "epoch": 3.01, + "learning_rate": 7.213799561266489e-05, + "loss": 0.8614, + "step": 1313 + }, + { + "epoch": 3.01, + "learning_rate": 7.199530221089398e-05, + "loss": 0.6461, + "step": 1314 + }, + { + "epoch": 3.01, + "learning_rate": 7.185267065640104e-05, + "loss": 0.6926, + "step": 1315 + }, + { + "epoch": 3.01, + "learning_rate": 7.171010126418218e-05, + "loss": 0.8601, + "step": 1316 + }, + { + "epoch": 3.02, + "learning_rate": 7.156759434909639e-05, + "loss": 0.784, + "step": 1317 + }, + { + "epoch": 3.02, + "learning_rate": 7.142515022586456e-05, + "loss": 1.0793, + "step": 1318 + }, + { + "epoch": 3.02, + "learning_rate": 7.1282769209069e-05, + "loss": 0.71, + "step": 1319 + }, + { + "epoch": 3.02, + "learning_rate": 7.114045161315261e-05, + "loss": 0.7129, + "step": 1320 + }, + { + "epoch": 3.03, + "learning_rate": 7.099819775241819e-05, + "loss": 0.6223, + "step": 1321 + }, + { + "epoch": 3.03, + "learning_rate": 7.085600794102783e-05, + "loss": 0.643, + "step": 1322 + }, + { + "epoch": 3.03, + "learning_rate": 7.071388249300218e-05, + "loss": 0.7678, + "step": 1323 + }, + { + "epoch": 3.03, + "learning_rate": 7.057182172221967e-05, + "loss": 0.6995, + "step": 1324 + }, + { + "epoch": 3.04, + "learning_rate": 7.042982594241601e-05, + "loss": 
0.6812, + "step": 1325 + }, + { + "epoch": 3.04, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7234, + "step": 1326 + }, + { + "epoch": 3.04, + "learning_rate": 7.014603060996938e-05, + "loss": 0.8338, + "step": 1327 + }, + { + "epoch": 3.04, + "learning_rate": 7.00042316840773e-05, + "loss": 0.9738, + "step": 1328 + }, + { + "epoch": 3.04, + "learning_rate": 6.98624990026644e-05, + "loss": 0.6211, + "step": 1329 + }, + { + "epoch": 3.05, + "learning_rate": 6.972083287874177e-05, + "loss": 0.7343, + "step": 1330 + }, + { + "epoch": 3.05, + "learning_rate": 6.957923362517348e-05, + "loss": 0.7291, + "step": 1331 + }, + { + "epoch": 3.05, + "learning_rate": 6.943770155467593e-05, + "loss": 0.7687, + "step": 1332 + }, + { + "epoch": 3.05, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7509, + "step": 1333 + }, + { + "epoch": 3.06, + "learning_rate": 6.915484021301613e-05, + "loss": 0.769, + "step": 1334 + }, + { + "epoch": 3.06, + "learning_rate": 6.90135115665421e-05, + "loss": 0.7605, + "step": 1335 + }, + { + "epoch": 3.06, + "learning_rate": 6.887225135251381e-05, + "loss": 0.7519, + "step": 1336 + }, + { + "epoch": 3.06, + "learning_rate": 6.873105988289892e-05, + "loss": 0.7648, + "step": 1337 + }, + { + "epoch": 3.07, + "learning_rate": 6.858993746951328e-05, + "loss": 0.8969, + "step": 1338 + }, + { + "epoch": 3.07, + "learning_rate": 6.844888442402018e-05, + "loss": 0.7229, + "step": 1339 + }, + { + "epoch": 3.07, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6294, + "step": 1340 + }, + { + "epoch": 3.07, + "learning_rate": 6.816698768259824e-05, + "loss": 0.7872, + "step": 1341 + }, + { + "epoch": 3.07, + "learning_rate": 6.802614460922728e-05, + "loss": 0.7555, + "step": 1342 + }, + { + "epoch": 3.08, + "learning_rate": 6.788537214886335e-05, + "loss": 0.7431, + "step": 1343 + }, + { + "epoch": 3.08, + "learning_rate": 6.774467061239687e-05, + "loss": 0.7502, + "step": 1344 + }, + { + "epoch": 3.08, + "learning_rate": 
6.760404031056169e-05, + "loss": 0.9202, + "step": 1345 + }, + { + "epoch": 3.08, + "learning_rate": 6.74634815539343e-05, + "loss": 0.8221, + "step": 1346 + }, + { + "epoch": 3.09, + "learning_rate": 6.732299465293322e-05, + "loss": 0.8935, + "step": 1347 + }, + { + "epoch": 3.09, + "learning_rate": 6.718257991781828e-05, + "loss": 0.6869, + "step": 1348 + }, + { + "epoch": 3.09, + "learning_rate": 6.704223765868991e-05, + "loss": 0.6931, + "step": 1349 + }, + { + "epoch": 3.09, + "learning_rate": 6.690196818548846e-05, + "loss": 0.7308, + "step": 1350 + }, + { + "epoch": 3.1, + "learning_rate": 6.67617718079936e-05, + "loss": 0.779, + "step": 1351 + }, + { + "epoch": 3.1, + "learning_rate": 6.662164883582354e-05, + "loss": 0.7807, + "step": 1352 + }, + { + "epoch": 3.1, + "learning_rate": 6.648159957843438e-05, + "loss": 0.7942, + "step": 1353 + }, + { + "epoch": 3.1, + "learning_rate": 6.63416243451194e-05, + "loss": 0.842, + "step": 1354 + }, + { + "epoch": 3.1, + "learning_rate": 6.62017234450084e-05, + "loss": 0.9713, + "step": 1355 + }, + { + "epoch": 3.11, + "learning_rate": 6.60618971870671e-05, + "loss": 0.5946, + "step": 1356 + }, + { + "epoch": 3.11, + "learning_rate": 6.592214588009625e-05, + "loss": 0.656, + "step": 1357 + }, + { + "epoch": 3.11, + "learning_rate": 6.578246983273118e-05, + "loss": 0.7192, + "step": 1358 + }, + { + "epoch": 3.11, + "learning_rate": 6.564286935344089e-05, + "loss": 0.7485, + "step": 1359 + }, + { + "epoch": 3.12, + "learning_rate": 6.550334475052767e-05, + "loss": 0.8379, + "step": 1360 + }, + { + "epoch": 3.12, + "learning_rate": 6.536389633212609e-05, + "loss": 0.9204, + "step": 1361 + }, + { + "epoch": 3.12, + "learning_rate": 6.522452440620254e-05, + "loss": 0.7924, + "step": 1362 + }, + { + "epoch": 3.12, + "learning_rate": 6.508522928055445e-05, + "loss": 0.7988, + "step": 1363 + }, + { + "epoch": 3.12, + "learning_rate": 6.494601126280963e-05, + "loss": 0.7678, + "step": 1364 + }, + { + "epoch": 3.13, + 
"learning_rate": 6.480687066042561e-05, + "loss": 0.7079, + "step": 1365 + }, + { + "epoch": 3.13, + "learning_rate": 6.466780778068903e-05, + "loss": 0.7104, + "step": 1366 + }, + { + "epoch": 3.13, + "learning_rate": 6.452882293071468e-05, + "loss": 0.7226, + "step": 1367 + }, + { + "epoch": 3.13, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8358, + "step": 1368 + }, + { + "epoch": 3.14, + "learning_rate": 6.42510885476504e-05, + "loss": 0.6752, + "step": 1369 + }, + { + "epoch": 3.14, + "learning_rate": 6.411233962792593e-05, + "loss": 0.7962, + "step": 1370 + }, + { + "epoch": 3.14, + "learning_rate": 6.397366996469343e-05, + "loss": 0.8052, + "step": 1371 + }, + { + "epoch": 3.14, + "learning_rate": 6.383507986419939e-05, + "loss": 0.9013, + "step": 1372 + }, + { + "epoch": 3.15, + "learning_rate": 6.369656963251467e-05, + "loss": 0.798, + "step": 1373 + }, + { + "epoch": 3.15, + "learning_rate": 6.355813957553364e-05, + "loss": 0.7121, + "step": 1374 + }, + { + "epoch": 3.15, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7275, + "step": 1375 + }, + { + "epoch": 3.15, + "learning_rate": 6.328152120837439e-05, + "loss": 0.7393, + "step": 1376 + }, + { + "epoch": 3.15, + "learning_rate": 6.314333350909701e-05, + "loss": 0.9145, + "step": 1377 + }, + { + "epoch": 3.16, + "learning_rate": 6.300522720632367e-05, + "loss": 0.8225, + "step": 1378 + }, + { + "epoch": 3.16, + "learning_rate": 6.286720260505668e-05, + "loss": 0.842, + "step": 1379 + }, + { + "epoch": 3.16, + "learning_rate": 6.2729260010118e-05, + "loss": 0.9227, + "step": 1380 + }, + { + "epoch": 3.16, + "learning_rate": 6.259139972614845e-05, + "loss": 0.8438, + "step": 1381 + }, + { + "epoch": 3.17, + "learning_rate": 6.245362205760704e-05, + "loss": 0.9213, + "step": 1382 + }, + { + "epoch": 3.17, + "learning_rate": 6.231592730877035e-05, + "loss": 0.7469, + "step": 1383 + }, + { + "epoch": 3.17, + "learning_rate": 6.217831578373185e-05, + "loss": 0.7289, + "step": 1384 + }, + { + 
"epoch": 3.17, + "learning_rate": 6.204078778640121e-05, + "loss": 0.8306, + "step": 1385 + }, + { + "epoch": 3.18, + "learning_rate": 6.190334362050365e-05, + "loss": 0.7807, + "step": 1386 + }, + { + "epoch": 3.18, + "learning_rate": 6.176598358957919e-05, + "loss": 0.7564, + "step": 1387 + }, + { + "epoch": 3.18, + "learning_rate": 6.162870799698209e-05, + "loss": 0.8306, + "step": 1388 + }, + { + "epoch": 3.18, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7317, + "step": 1389 + }, + { + "epoch": 3.18, + "learning_rate": 6.135441133925382e-05, + "loss": 0.8923, + "step": 1390 + }, + { + "epoch": 3.19, + "learning_rate": 6.121739087989613e-05, + "loss": 0.7723, + "step": 1391 + }, + { + "epoch": 3.19, + "learning_rate": 6.108045607041125e-05, + "loss": 0.796, + "step": 1392 + }, + { + "epoch": 3.19, + "learning_rate": 6.0943607213214425e-05, + "loss": 0.7907, + "step": 1393 + }, + { + "epoch": 3.19, + "learning_rate": 6.0806844610530956e-05, + "loss": 0.7709, + "step": 1394 + }, + { + "epoch": 3.2, + "learning_rate": 6.0670168564395705e-05, + "loss": 0.8841, + "step": 1395 + }, + { + "epoch": 3.2, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6325, + "step": 1396 + }, + { + "epoch": 3.2, + "learning_rate": 6.039707734895279e-05, + "loss": 0.8047, + "step": 1397 + }, + { + "epoch": 3.2, + "learning_rate": 6.0260662782756374e-05, + "loss": 0.7933, + "step": 1398 + }, + { + "epoch": 3.21, + "learning_rate": 6.012433597932936e-05, + "loss": 0.8016, + "step": 1399 + }, + { + "epoch": 3.21, + "learning_rate": 5.998809723974407e-05, + "loss": 0.8992, + "step": 1400 + }, + { + "epoch": 3.21, + "learning_rate": 5.985194686487854e-05, + "loss": 0.7384, + "step": 1401 + }, + { + "epoch": 3.21, + "learning_rate": 5.971588515541546e-05, + "loss": 0.7214, + "step": 1402 + }, + { + "epoch": 3.21, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7394, + "step": 1403 + }, + { + "epoch": 3.22, + "learning_rate": 5.94440289344481e-05, + "loss": 0.6268, + 
"step": 1404 + }, + { + "epoch": 3.22, + "learning_rate": 5.9308235023327604e-05, + "loss": 0.8049, + "step": 1405 + }, + { + "epoch": 3.22, + "learning_rate": 5.9172530978375894e-05, + "loss": 0.8396, + "step": 1406 + }, + { + "epoch": 3.22, + "learning_rate": 5.9036917099290026e-05, + "loss": 0.7694, + "step": 1407 + }, + { + "epoch": 3.23, + "learning_rate": 5.890139368556791e-05, + "loss": 0.7289, + "step": 1408 + }, + { + "epoch": 3.23, + "learning_rate": 5.8765961036507736e-05, + "loss": 0.7949, + "step": 1409 + }, + { + "epoch": 3.23, + "learning_rate": 5.863061945120719e-05, + "loss": 0.9371, + "step": 1410 + }, + { + "epoch": 3.23, + "learning_rate": 5.8495369228562894e-05, + "loss": 0.7323, + "step": 1411 + }, + { + "epoch": 3.23, + "learning_rate": 5.836021066726962e-05, + "loss": 0.8331, + "step": 1412 + }, + { + "epoch": 3.24, + "learning_rate": 5.8225144065819745e-05, + "loss": 0.768, + "step": 1413 + }, + { + "epoch": 3.24, + "learning_rate": 5.809016972250263e-05, + "loss": 0.7804, + "step": 1414 + }, + { + "epoch": 3.24, + "learning_rate": 5.795528793540379e-05, + "loss": 0.771, + "step": 1415 + }, + { + "epoch": 3.24, + "learning_rate": 5.782049900240432e-05, + "loss": 0.7431, + "step": 1416 + }, + { + "epoch": 3.25, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8618, + "step": 1417 + }, + { + "epoch": 3.25, + "learning_rate": 5.755120088920225e-05, + "loss": 0.7639, + "step": 1418 + }, + { + "epoch": 3.25, + "learning_rate": 5.7416692303733946e-05, + "loss": 0.8375, + "step": 1419 + }, + { + "epoch": 3.25, + "learning_rate": 5.728227776183244e-05, + "loss": 0.7409, + "step": 1420 + }, + { + "epoch": 3.26, + "learning_rate": 5.714795756034695e-05, + "loss": 0.7529, + "step": 1421 + }, + { + "epoch": 3.26, + "learning_rate": 5.701373199591835e-05, + "loss": 0.8878, + "step": 1422 + }, + { + "epoch": 3.26, + "learning_rate": 5.687960136497861e-05, + "loss": 0.6923, + "step": 1423 + }, + { + "epoch": 3.26, + "learning_rate": 
5.6745565963749925e-05, + "loss": 0.8628, + "step": 1424 + }, + { + "epoch": 3.26, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.6949, + "step": 1425 + }, + { + "epoch": 3.27, + "learning_rate": 5.6477782034262436e-05, + "loss": 0.7278, + "step": 1426 + }, + { + "epoch": 3.27, + "learning_rate": 5.634403409739402e-05, + "loss": 0.8781, + "step": 1427 + }, + { + "epoch": 3.27, + "learning_rate": 5.621038257301601e-05, + "loss": 0.7329, + "step": 1428 + }, + { + "epoch": 3.27, + "learning_rate": 5.6076827756292495e-05, + "loss": 0.7195, + "step": 1429 + }, + { + "epoch": 3.28, + "learning_rate": 5.594336994217415e-05, + "loss": 0.7283, + "step": 1430 + }, + { + "epoch": 3.28, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8064, + "step": 1431 + }, + { + "epoch": 3.28, + "learning_rate": 5.5676746500483336e-05, + "loss": 0.8488, + "step": 1432 + }, + { + "epoch": 3.28, + "learning_rate": 5.55435814617383e-05, + "loss": 0.8925, + "step": 1433 + }, + { + "epoch": 3.29, + "learning_rate": 5.5410514603251985e-05, + "loss": 0.7677, + "step": 1434 + }, + { + "epoch": 3.29, + "learning_rate": 5.5277546218897294e-05, + "loss": 0.8037, + "step": 1435 + }, + { + "epoch": 3.29, + "learning_rate": 5.514467660232965e-05, + "loss": 0.8046, + "step": 1436 + }, + { + "epoch": 3.29, + "learning_rate": 5.5011906046986473e-05, + "loss": 0.7885, + "step": 1437 + }, + { + "epoch": 3.29, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8264, + "step": 1438 + }, + { + "epoch": 3.3, + "learning_rate": 5.4746663292628234e-05, + "loss": 0.7551, + "step": 1439 + }, + { + "epoch": 3.3, + "learning_rate": 5.4614191679391444e-05, + "loss": 0.8766, + "step": 1440 + }, + { + "epoch": 3.3, + "learning_rate": 5.448182029893423e-05, + "loss": 0.8992, + "step": 1441 + }, + { + "epoch": 3.3, + "learning_rate": 5.434954944359365e-05, + "loss": 0.6505, + "step": 1442 + }, + { + "epoch": 3.31, + "learning_rate": 5.4217379405484636e-05, + "loss": 0.8743, + "step": 1443 + }, + { + 
"epoch": 3.31, + "learning_rate": 5.408531047649964e-05, + "loss": 0.6965, + "step": 1444 + }, + { + "epoch": 3.31, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7663, + "step": 1445 + }, + { + "epoch": 3.31, + "learning_rate": 5.382147711235377e-05, + "loss": 0.86, + "step": 1446 + }, + { + "epoch": 3.32, + "learning_rate": 5.3689713259858586e-05, + "loss": 0.7524, + "step": 1447 + }, + { + "epoch": 3.32, + "learning_rate": 5.355805168181738e-05, + "loss": 0.9115, + "step": 1448 + }, + { + "epoch": 3.32, + "learning_rate": 5.342649266899955e-05, + "loss": 0.8342, + "step": 1449 + }, + { + "epoch": 3.32, + "learning_rate": 5.329503651194805e-05, + "loss": 0.8447, + "step": 1450 + }, + { + "epoch": 3.32, + "learning_rate": 5.316368350097869e-05, + "loss": 0.7877, + "step": 1451 + }, + { + "epoch": 3.33, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7965, + "step": 1452 + }, + { + "epoch": 3.33, + "learning_rate": 5.290128807740976e-05, + "loss": 0.7844, + "step": 1453 + }, + { + "epoch": 3.33, + "learning_rate": 5.2770246244300224e-05, + "loss": 0.7405, + "step": 1454 + }, + { + "epoch": 3.33, + "learning_rate": 5.263930871625151e-05, + "loss": 0.7782, + "step": 1455 + }, + { + "epoch": 3.34, + "learning_rate": 5.2508475782434093e-05, + "loss": 0.7789, + "step": 1456 + }, + { + "epoch": 3.34, + "learning_rate": 5.237774773178734e-05, + "loss": 0.8943, + "step": 1457 + }, + { + "epoch": 3.34, + "learning_rate": 5.224712485301898e-05, + "loss": 0.7712, + "step": 1458 + }, + { + "epoch": 3.34, + "learning_rate": 5.211660743460458e-05, + "loss": 0.8608, + "step": 1459 + }, + { + "epoch": 3.34, + "learning_rate": 5.198619576478678e-05, + "loss": 0.7212, + "step": 1460 + }, + { + "epoch": 3.35, + "learning_rate": 5.1855890131574614e-05, + "loss": 0.7588, + "step": 1461 + }, + { + "epoch": 3.35, + "learning_rate": 5.17256908227429e-05, + "loss": 0.8001, + "step": 1462 + }, + { + "epoch": 3.35, + "learning_rate": 5.159559812583181e-05, + "loss": 0.8327, + 
"step": 1463 + }, + { + "epoch": 3.35, + "learning_rate": 5.146561232814593e-05, + "loss": 0.8874, + "step": 1464 + }, + { + "epoch": 3.36, + "learning_rate": 5.133573371675375e-05, + "loss": 0.6802, + "step": 1465 + }, + { + "epoch": 3.36, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7581, + "step": 1466 + }, + { + "epoch": 3.36, + "learning_rate": 5.1076299199940645e-05, + "loss": 0.8714, + "step": 1467 + }, + { + "epoch": 3.36, + "learning_rate": 5.094674386747067e-05, + "loss": 0.6667, + "step": 1468 + }, + { + "epoch": 3.37, + "learning_rate": 5.081729686719508e-05, + "loss": 0.8107, + "step": 1469 + }, + { + "epoch": 3.37, + "learning_rate": 5.068795848499257e-05, + "loss": 0.8891, + "step": 1470 + }, + { + "epoch": 3.37, + "learning_rate": 5.0558729006501846e-05, + "loss": 0.7259, + "step": 1471 + }, + { + "epoch": 3.37, + "learning_rate": 5.042960871712112e-05, + "loss": 0.8035, + "step": 1472 + }, + { + "epoch": 3.37, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7042, + "step": 1473 + }, + { + "epoch": 3.38, + "learning_rate": 5.0171696846076446e-05, + "loss": 0.7852, + "step": 1474 + }, + { + "epoch": 3.38, + "learning_rate": 5.004290583400075e-05, + "loss": 0.8489, + "step": 1475 + }, + { + "epoch": 3.38, + "learning_rate": 4.9914225150210335e-05, + "loss": 0.7696, + "step": 1476 + }, + { + "epoch": 3.38, + "learning_rate": 4.97856550788915e-05, + "loss": 0.7, + "step": 1477 + }, + { + "epoch": 3.39, + "learning_rate": 4.9657195903986185e-05, + "loss": 0.8373, + "step": 1478 + }, + { + "epoch": 3.39, + "learning_rate": 4.952884790919141e-05, + "loss": 0.8822, + "step": 1479 + }, + { + "epoch": 3.39, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7292, + "step": 1480 + }, + { + "epoch": 3.39, + "learning_rate": 4.927248659349355e-05, + "loss": 0.8165, + "step": 1481 + }, + { + "epoch": 3.4, + "learning_rate": 4.914447383875432e-05, + "loss": 0.7782, + "step": 1482 + }, + { + "epoch": 3.4, + "learning_rate": 4.901657339645226e-05, 
+ "loss": 0.8172, + "step": 1483 + }, + { + "epoch": 3.4, + "learning_rate": 4.888878554905051e-05, + "loss": 0.8072, + "step": 1484 + }, + { + "epoch": 3.4, + "learning_rate": 4.876111057876347e-05, + "loss": 0.7715, + "step": 1485 + }, + { + "epoch": 3.4, + "learning_rate": 4.863354876755637e-05, + "loss": 0.7384, + "step": 1486 + }, + { + "epoch": 3.41, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7881, + "step": 1487 + }, + { + "epoch": 3.41, + "learning_rate": 4.837876574899237e-05, + "loss": 0.7962, + "step": 1488 + }, + { + "epoch": 3.41, + "learning_rate": 4.8251545104313836e-05, + "loss": 0.5635, + "step": 1489 + }, + { + "epoch": 3.41, + "learning_rate": 4.812443874407059e-05, + "loss": 0.7454, + "step": 1490 + }, + { + "epoch": 3.42, + "learning_rate": 4.7997446948972015e-05, + "loss": 0.8505, + "step": 1491 + }, + { + "epoch": 3.42, + "learning_rate": 4.787056999947455e-05, + "loss": 0.6157, + "step": 1492 + }, + { + "epoch": 3.42, + "learning_rate": 4.774380817578101e-05, + "loss": 0.7731, + "step": 1493 + }, + { + "epoch": 3.42, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8062, + "step": 1494 + }, + { + "epoch": 3.42, + "learning_rate": 4.74906310253448e-05, + "loss": 0.7027, + "step": 1495 + }, + { + "epoch": 3.43, + "learning_rate": 4.736421625773396e-05, + "loss": 0.7, + "step": 1496 + }, + { + "epoch": 3.43, + "learning_rate": 4.723791773418942e-05, + "loss": 0.7822, + "step": 1497 + }, + { + "epoch": 3.43, + "learning_rate": 4.7111735733636466e-05, + "loss": 0.6308, + "step": 1498 + }, + { + "epoch": 3.43, + "learning_rate": 4.698567053474315e-05, + "loss": 0.6722, + "step": 1499 + }, + { + "epoch": 3.44, + "learning_rate": 4.685972241591956e-05, + "loss": 0.749, + "step": 1500 + }, + { + "epoch": 3.44, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7784, + "step": 1501 + }, + { + "epoch": 3.44, + "learning_rate": 4.6608178530828174e-05, + "loss": 0.7971, + "step": 1502 + }, + { + "epoch": 3.44, + "learning_rate": 
4.648258332008523e-05, + "loss": 0.8398, + "step": 1503 + }, + { + "epoch": 3.45, + "learning_rate": 4.6357106300460374e-05, + "loss": 0.6559, + "step": 1504 + }, + { + "epoch": 3.45, + "learning_rate": 4.6231747749064644e-05, + "loss": 0.7837, + "step": 1505 + }, + { + "epoch": 3.45, + "learning_rate": 4.610650794274759e-05, + "loss": 0.8072, + "step": 1506 + }, + { + "epoch": 3.45, + "learning_rate": 4.598138715809633e-05, + "loss": 0.7441, + "step": 1507 + }, + { + "epoch": 3.45, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8233, + "step": 1508 + }, + { + "epoch": 3.46, + "learning_rate": 4.573150375882527e-05, + "loss": 0.8868, + "step": 1509 + }, + { + "epoch": 3.46, + "learning_rate": 4.560674169606317e-05, + "loss": 0.7059, + "step": 1510 + }, + { + "epoch": 3.46, + "learning_rate": 4.548209975868108e-05, + "loss": 0.8349, + "step": 1511 + }, + { + "epoch": 3.46, + "learning_rate": 4.5357578221945794e-05, + "loss": 0.817, + "step": 1512 + }, + { + "epoch": 3.47, + "learning_rate": 4.523317736085831e-05, + "loss": 0.7375, + "step": 1513 + }, + { + "epoch": 3.47, + "learning_rate": 4.5108897450153054e-05, + "loss": 0.8338, + "step": 1514 + }, + { + "epoch": 3.47, + "learning_rate": 4.498473876429726e-05, + "loss": 0.9212, + "step": 1515 + }, + { + "epoch": 3.47, + "learning_rate": 4.4860701577490595e-05, + "loss": 0.7182, + "step": 1516 + }, + { + "epoch": 3.48, + "learning_rate": 4.473678616366433e-05, + "loss": 0.8677, + "step": 1517 + }, + { + "epoch": 3.48, + "learning_rate": 4.461299279648077e-05, + "loss": 0.7868, + "step": 1518 + }, + { + "epoch": 3.48, + "learning_rate": 4.4489321749332744e-05, + "loss": 0.7078, + "step": 1519 + }, + { + "epoch": 3.48, + "learning_rate": 4.436577329534291e-05, + "loss": 0.6872, + "step": 1520 + }, + { + "epoch": 3.48, + "learning_rate": 4.424234770736314e-05, + "loss": 0.7523, + "step": 1521 + }, + { + "epoch": 3.49, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7107, + "step": 1522 + }, + { + "epoch": 
3.49, + "learning_rate": 4.3995866219484326e-05, + "loss": 0.8932, + "step": 1523 + }, + { + "epoch": 3.49, + "learning_rate": 4.387281086392994e-05, + "loss": 0.7811, + "step": 1524 + }, + { + "epoch": 3.49, + "learning_rate": 4.374987946307385e-05, + "loss": 0.8946, + "step": 1525 + }, + { + "epoch": 3.5, + "learning_rate": 4.362707228840531e-05, + "loss": 0.8496, + "step": 1526 + }, + { + "epoch": 3.5, + "learning_rate": 4.350438961113911e-05, + "loss": 0.6998, + "step": 1527 + }, + { + "epoch": 3.5, + "learning_rate": 4.3381831702215084e-05, + "loss": 0.6792, + "step": 1528 + }, + { + "epoch": 3.5, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7644, + "step": 1529 + }, + { + "epoch": 3.51, + "learning_rate": 4.3137091271775e-05, + "loss": 0.6055, + "step": 1530 + }, + { + "epoch": 3.51, + "learning_rate": 4.301490929075852e-05, + "loss": 0.7126, + "step": 1531 + }, + { + "epoch": 3.51, + "learning_rate": 4.289285315908237e-05, + "loss": 0.7635, + "step": 1532 + }, + { + "epoch": 3.51, + "learning_rate": 4.277092314630278e-05, + "loss": 0.9089, + "step": 1533 + }, + { + "epoch": 3.51, + "learning_rate": 4.264911952169735e-05, + "loss": 0.7267, + "step": 1534 + }, + { + "epoch": 3.52, + "learning_rate": 4.2527442554264605e-05, + "loss": 0.6774, + "step": 1535 + }, + { + "epoch": 3.52, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8402, + "step": 1536 + }, + { + "epoch": 3.52, + "learning_rate": 4.228446966551226e-05, + "loss": 0.8603, + "step": 1537 + }, + { + "epoch": 3.52, + "learning_rate": 4.2163174280788697e-05, + "loss": 0.6459, + "step": 1538 + }, + { + "epoch": 3.53, + "learning_rate": 4.2042006626428906e-05, + "loss": 0.7192, + "step": 1539 + }, + { + "epoch": 3.53, + "learning_rate": 4.192096697002686e-05, + "loss": 0.8621, + "step": 1540 + }, + { + "epoch": 3.53, + "learning_rate": 4.1800055578893883e-05, + "loss": 0.8194, + "step": 1541 + }, + { + "epoch": 3.53, + "learning_rate": 4.167927272005805e-05, + "loss": 0.8702, + "step": 1542 
+ }, + { + "epoch": 3.53, + "learning_rate": 4.155861866026364e-05, + "loss": 0.8677, + "step": 1543 + }, + { + "epoch": 3.54, + "learning_rate": 4.143809366597037e-05, + "loss": 0.7971, + "step": 1544 + }, + { + "epoch": 3.54, + "learning_rate": 4.131769800335292e-05, + "loss": 0.7896, + "step": 1545 + }, + { + "epoch": 3.54, + "learning_rate": 4.119743193830048e-05, + "loss": 0.889, + "step": 1546 + }, + { + "epoch": 3.54, + "learning_rate": 4.10772957364159e-05, + "loss": 0.7497, + "step": 1547 + }, + { + "epoch": 3.55, + "learning_rate": 4.0957289663015255e-05, + "loss": 0.9096, + "step": 1548 + }, + { + "epoch": 3.55, + "learning_rate": 4.083741398312727e-05, + "loss": 0.8658, + "step": 1549 + }, + { + "epoch": 3.55, + "learning_rate": 4.071766896149273e-05, + "loss": 0.5634, + "step": 1550 + }, + { + "epoch": 3.55, + "learning_rate": 4.059805486256376e-05, + "loss": 0.6693, + "step": 1551 + }, + { + "epoch": 3.56, + "learning_rate": 4.0478571950503486e-05, + "loss": 0.7128, + "step": 1552 + }, + { + "epoch": 3.56, + "learning_rate": 4.035922048918519e-05, + "loss": 0.7838, + "step": 1553 + }, + { + "epoch": 3.56, + "learning_rate": 4.024000074219187e-05, + "loss": 0.9549, + "step": 1554 + }, + { + "epoch": 3.56, + "learning_rate": 4.012091297281574e-05, + "loss": 0.6245, + "step": 1555 + }, + { + "epoch": 3.56, + "learning_rate": 4.0001957444057426e-05, + "loss": 0.7671, + "step": 1556 + }, + { + "epoch": 3.57, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6645, + "step": 1557 + }, + { + "epoch": 3.57, + "learning_rate": 3.976444415893608e-05, + "loss": 0.8291, + "step": 1558 + }, + { + "epoch": 3.57, + "learning_rate": 3.96458869271119e-05, + "loss": 0.8715, + "step": 1559 + }, + { + "epoch": 3.57, + "learning_rate": 3.952746298498195e-05, + "loss": 0.8423, + "step": 1560 + }, + { + "epoch": 3.58, + "learning_rate": 3.940917259408085e-05, + "loss": 0.8303, + "step": 1561 + }, + { + "epoch": 3.58, + "learning_rate": 3.929101601564834e-05, + "loss": 
0.7876, + "step": 1562 + }, + { + "epoch": 3.58, + "learning_rate": 3.9172993510628574e-05, + "loss": 0.7409, + "step": 1563 + }, + { + "epoch": 3.58, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8988, + "step": 1564 + }, + { + "epoch": 3.59, + "learning_rate": 3.8937351763122845e-05, + "loss": 1.0367, + "step": 1565 + }, + { + "epoch": 3.59, + "learning_rate": 3.8819733041042515e-05, + "loss": 0.682, + "step": 1566 + }, + { + "epoch": 3.59, + "learning_rate": 3.870224943318491e-05, + "loss": 0.815, + "step": 1567 + }, + { + "epoch": 3.59, + "learning_rate": 3.858490119900794e-05, + "loss": 0.6516, + "step": 1568 + }, + { + "epoch": 3.59, + "learning_rate": 3.846768859767066e-05, + "loss": 0.7371, + "step": 1569 + }, + { + "epoch": 3.6, + "learning_rate": 3.8350611888032474e-05, + "loss": 0.7401, + "step": 1570 + }, + { + "epoch": 3.6, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7305, + "step": 1571 + }, + { + "epoch": 3.6, + "learning_rate": 3.8116867177789936e-05, + "loss": 0.7422, + "step": 1572 + }, + { + "epoch": 3.6, + "learning_rate": 3.8000199693401675e-05, + "loss": 0.7621, + "step": 1573 + }, + { + "epoch": 3.61, + "learning_rate": 3.788366913314339e-05, + "loss": 0.935, + "step": 1574 + }, + { + "epoch": 3.61, + "learning_rate": 3.776727575436829e-05, + "loss": 0.7587, + "step": 1575 + }, + { + "epoch": 3.61, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.9029, + "step": 1576 + }, + { + "epoch": 3.61, + "learning_rate": 3.753490156916511e-05, + "loss": 0.8324, + "step": 1577 + }, + { + "epoch": 3.62, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7316, + "step": 1578 + }, + { + "epoch": 3.62, + "learning_rate": 3.730307919054803e-05, + "loss": 0.684, + "step": 1579 + }, + { + "epoch": 3.62, + "learning_rate": 3.718737556886316e-05, + "loss": 0.7547, + "step": 1580 + }, + { + "epoch": 3.62, + "learning_rate": 3.7071810666398496e-05, + "loss": 0.8581, + "step": 1581 + }, + { + "epoch": 3.62, + "learning_rate": 
3.695638473837466e-05, + "loss": 0.7707, + "step": 1582 + }, + { + "epoch": 3.63, + "learning_rate": 3.684109803970531e-05, + "loss": 0.755, + "step": 1583 + }, + { + "epoch": 3.63, + "learning_rate": 3.6725950824996535e-05, + "loss": 0.8436, + "step": 1584 + }, + { + "epoch": 3.63, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7491, + "step": 1585 + }, + { + "epoch": 3.63, + "learning_rate": 3.649607586434474e-05, + "loss": 0.6946, + "step": 1586 + }, + { + "epoch": 3.64, + "learning_rate": 3.6381348626071475e-05, + "loss": 0.7697, + "step": 1587 + }, + { + "epoch": 3.64, + "learning_rate": 3.626676188709743e-05, + "loss": 0.8108, + "step": 1588 + }, + { + "epoch": 3.64, + "learning_rate": 3.6152315900482905e-05, + "loss": 0.7676, + "step": 1589 + }, + { + "epoch": 3.64, + "learning_rate": 3.603801091897731e-05, + "loss": 0.8506, + "step": 1590 + }, + { + "epoch": 3.64, + "learning_rate": 3.592384719501878e-05, + "loss": 0.7521, + "step": 1591 + }, + { + "epoch": 3.65, + "learning_rate": 3.580982498073344e-05, + "loss": 0.8371, + "step": 1592 + }, + { + "epoch": 3.65, + "learning_rate": 3.5695944527934865e-05, + "loss": 0.816, + "step": 1593 + }, + { + "epoch": 3.65, + "learning_rate": 3.5582206088123535e-05, + "loss": 0.7097, + "step": 1594 + }, + { + "epoch": 3.65, + "learning_rate": 3.546860991248641e-05, + "loss": 0.7147, + "step": 1595 + }, + { + "epoch": 3.66, + "learning_rate": 3.5355156251896136e-05, + "loss": 0.7807, + "step": 1596 + }, + { + "epoch": 3.66, + "learning_rate": 3.524184535691068e-05, + "loss": 0.8517, + "step": 1597 + }, + { + "epoch": 3.66, + "learning_rate": 3.5128677477772734e-05, + "loss": 0.8549, + "step": 1598 + }, + { + "epoch": 3.66, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7514, + "step": 1599 + }, + { + "epoch": 3.67, + "learning_rate": 3.490277176643033e-05, + "loss": 0.8055, + "step": 1600 + }, + { + "epoch": 3.67, + "learning_rate": 3.4790034433129725e-05, + "loss": 0.5494, + "step": 1601 + }, + { + 
"epoch": 3.67, + "learning_rate": 3.467744111348338e-05, + "loss": 0.9018, + "step": 1602 + }, + { + "epoch": 3.67, + "learning_rate": 3.4564992056149214e-05, + "loss": 0.7319, + "step": 1603 + }, + { + "epoch": 3.67, + "learning_rate": 3.445268750946651e-05, + "loss": 0.8997, + "step": 1604 + }, + { + "epoch": 3.68, + "learning_rate": 3.434052772145554e-05, + "loss": 0.7977, + "step": 1605 + }, + { + "epoch": 3.68, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7205, + "step": 1606 + }, + { + "epoch": 3.68, + "learning_rate": 3.411664341193041e-05, + "loss": 0.848, + "step": 1607 + }, + { + "epoch": 3.68, + "learning_rate": 3.400491938485596e-05, + "loss": 0.7864, + "step": 1608 + }, + { + "epoch": 3.69, + "learning_rate": 3.389334110533161e-05, + "loss": 0.7184, + "step": 1609 + }, + { + "epoch": 3.69, + "learning_rate": 3.378190881977359e-05, + "loss": 0.8362, + "step": 1610 + }, + { + "epoch": 3.69, + "learning_rate": 3.367062277427567e-05, + "loss": 0.6743, + "step": 1611 + }, + { + "epoch": 3.69, + "learning_rate": 3.3559483214608824e-05, + "loss": 0.7561, + "step": 1612 + }, + { + "epoch": 3.7, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7342, + "step": 1613 + }, + { + "epoch": 3.7, + "learning_rate": 3.333764453423357e-05, + "loss": 0.7918, + "step": 1614 + }, + { + "epoch": 3.7, + "learning_rate": 3.322694590344719e-05, + "loss": 0.75, + "step": 1615 + }, + { + "epoch": 3.7, + "learning_rate": 3.3116394738334866e-05, + "loss": 0.7874, + "step": 1616 + }, + { + "epoch": 3.7, + "learning_rate": 3.300599128304443e-05, + "loss": 0.7555, + "step": 1617 + }, + { + "epoch": 3.71, + "learning_rate": 3.2895735781397685e-05, + "loss": 0.8434, + "step": 1618 + }, + { + "epoch": 3.71, + "learning_rate": 3.278562847688951e-05, + "loss": 0.8756, + "step": 1619 + }, + { + "epoch": 3.71, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8765, + "step": 1620 + }, + { + "epoch": 3.71, + "learning_rate": 3.256585943163176e-05, + "loss": 0.8501, + 
"step": 1621 + }, + { + "epoch": 3.72, + "learning_rate": 3.2456198176233543e-05, + "loss": 1.0232, + "step": 1622 + }, + { + "epoch": 3.72, + "learning_rate": 3.234668608867547e-05, + "loss": 0.7117, + "step": 1623 + }, + { + "epoch": 3.72, + "learning_rate": 3.2237323410810715e-05, + "loss": 0.9795, + "step": 1624 + }, + { + "epoch": 3.72, + "learning_rate": 3.212811038416251e-05, + "loss": 0.887, + "step": 1625 + }, + { + "epoch": 3.73, + "learning_rate": 3.201904724992352e-05, + "loss": 0.7008, + "step": 1626 + }, + { + "epoch": 3.73, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7542, + "step": 1627 + }, + { + "epoch": 3.73, + "learning_rate": 3.18013716217882e-05, + "loss": 0.871, + "step": 1628 + }, + { + "epoch": 3.73, + "learning_rate": 3.1692759608620004e-05, + "loss": 0.7761, + "step": 1629 + }, + { + "epoch": 3.73, + "learning_rate": 3.158429844931611e-05, + "loss": 0.842, + "step": 1630 + }, + { + "epoch": 3.74, + "learning_rate": 3.1475988383408774e-05, + "loss": 0.8322, + "step": 1631 + }, + { + "epoch": 3.74, + "learning_rate": 3.136782965009658e-05, + "loss": 0.7911, + "step": 1632 + }, + { + "epoch": 3.74, + "learning_rate": 3.1259822488243806e-05, + "loss": 0.8911, + "step": 1633 + }, + { + "epoch": 3.74, + "learning_rate": 3.115196713638e-05, + "loss": 0.9232, + "step": 1634 + }, + { + "epoch": 3.75, + "learning_rate": 3.104426383269957e-05, + "loss": 0.8265, + "step": 1635 + }, + { + "epoch": 3.75, + "learning_rate": 3.093671281506099e-05, + "loss": 0.7861, + "step": 1636 + }, + { + "epoch": 3.75, + "learning_rate": 3.0829314320986433e-05, + "loss": 0.6548, + "step": 1637 + }, + { + "epoch": 3.75, + "learning_rate": 3.072206858766134e-05, + "loss": 0.7974, + "step": 1638 + }, + { + "epoch": 3.75, + "learning_rate": 3.061497585193369e-05, + "loss": 0.849, + "step": 1639 + }, + { + "epoch": 3.76, + "learning_rate": 3.050803635031355e-05, + "loss": 0.7438, + "step": 1640 + }, + { + "epoch": 3.76, + "learning_rate": 3.040125031897264e-05, + 
"loss": 0.838, + "step": 1641 + }, + { + "epoch": 3.76, + "learning_rate": 3.029461799374378e-05, + "loss": 0.8879, + "step": 1642 + }, + { + "epoch": 3.76, + "learning_rate": 3.0188139610120248e-05, + "loss": 0.7747, + "step": 1643 + }, + { + "epoch": 3.77, + "learning_rate": 3.0081815403255332e-05, + "loss": 0.7179, + "step": 1644 + }, + { + "epoch": 3.77, + "learning_rate": 2.9975645607961955e-05, + "loss": 0.7618, + "step": 1645 + }, + { + "epoch": 3.77, + "learning_rate": 2.9869630458711927e-05, + "loss": 0.6977, + "step": 1646 + }, + { + "epoch": 3.77, + "learning_rate": 2.9763770189635497e-05, + "loss": 0.8052, + "step": 1647 + }, + { + "epoch": 3.78, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.728, + "step": 1648 + }, + { + "epoch": 3.78, + "learning_rate": 2.955251522681408e-05, + "loss": 0.8593, + "step": 1649 + }, + { + "epoch": 3.78, + "learning_rate": 2.944712099961736e-05, + "loss": 0.8347, + "step": 1650 + }, + { + "epoch": 3.78, + "learning_rate": 2.9341882585689905e-05, + "loss": 0.733, + "step": 1651 + }, + { + "epoch": 3.78, + "learning_rate": 2.9236800217446593e-05, + "loss": 0.6998, + "step": 1652 + }, + { + "epoch": 3.79, + "learning_rate": 2.9131874126957727e-05, + "loss": 0.8923, + "step": 1653 + }, + { + "epoch": 3.79, + "learning_rate": 2.9027104545948414e-05, + "loss": 0.5376, + "step": 1654 + }, + { + "epoch": 3.79, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7465, + "step": 1655 + }, + { + "epoch": 3.79, + "learning_rate": 2.8818035837540537e-05, + "loss": 0.7833, + "step": 1656 + }, + { + "epoch": 3.8, + "learning_rate": 2.8713737171861986e-05, + "loss": 0.7611, + "step": 1657 + }, + { + "epoch": 3.8, + "learning_rate": 2.8609595939102153e-05, + "loss": 0.7226, + "step": 1658 + }, + { + "epoch": 3.8, + "learning_rate": 2.8505612369252832e-05, + "loss": 0.8847, + "step": 1659 + }, + { + "epoch": 3.8, + "learning_rate": 2.840178669195763e-05, + "loss": 0.7511, + "step": 1660 + }, + { + "epoch": 3.81, + 
"learning_rate": 2.8298119136511558e-05, + "loss": 0.6833, + "step": 1661 + }, + { + "epoch": 3.81, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7595, + "step": 1662 + }, + { + "epoch": 3.81, + "learning_rate": 2.8091259306599904e-05, + "loss": 0.7486, + "step": 1663 + }, + { + "epoch": 3.81, + "learning_rate": 2.7988067488976156e-05, + "loss": 0.8106, + "step": 1664 + }, + { + "epoch": 3.81, + "learning_rate": 2.7885034706884185e-05, + "loss": 0.8012, + "step": 1665 + }, + { + "epoch": 3.82, + "learning_rate": 2.7782161187867818e-05, + "loss": 0.7598, + "step": 1666 + }, + { + "epoch": 3.82, + "learning_rate": 2.7679447159119164e-05, + "loss": 0.6638, + "step": 1667 + }, + { + "epoch": 3.82, + "learning_rate": 2.7576892847478207e-05, + "loss": 0.6576, + "step": 1668 + }, + { + "epoch": 3.82, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8174, + "step": 1669 + }, + { + "epoch": 3.83, + "learning_rate": 2.737226428111471e-05, + "loss": 0.868, + "step": 1670 + }, + { + "epoch": 3.83, + "learning_rate": 2.7270190478306378e-05, + "loss": 0.6411, + "step": 1671 + }, + { + "epoch": 3.83, + "learning_rate": 2.7168277296433053e-05, + "loss": 0.7872, + "step": 1672 + }, + { + "epoch": 3.83, + "learning_rate": 2.7066524960565965e-05, + "loss": 0.7556, + "step": 1673 + }, + { + "epoch": 3.84, + "learning_rate": 2.6964933695421192e-05, + "loss": 0.8606, + "step": 1674 + }, + { + "epoch": 3.84, + "learning_rate": 2.6863503725359107e-05, + "loss": 0.7776, + "step": 1675 + }, + { + "epoch": 3.84, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7095, + "step": 1676 + }, + { + "epoch": 3.84, + "learning_rate": 2.666112856614259e-05, + "loss": 0.8587, + "step": 1677 + }, + { + "epoch": 3.84, + "learning_rate": 2.65601838239258e-05, + "loss": 0.8568, + "step": 1678 + }, + { + "epoch": 3.85, + "learning_rate": 2.6459401270665894e-05, + "loss": 0.7725, + "step": 1679 + }, + { + "epoch": 3.85, + "learning_rate": 2.6358781128937172e-05, + "loss": 0.8665, + 
"step": 1680 + }, + { + "epoch": 3.85, + "learning_rate": 2.625832362095528e-05, + "loss": 0.8286, + "step": 1681 + }, + { + "epoch": 3.85, + "learning_rate": 2.6158028968576743e-05, + "loss": 0.9445, + "step": 1682 + }, + { + "epoch": 3.86, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7562, + "step": 1683 + }, + { + "epoch": 3.86, + "learning_rate": 2.5957929116256675e-05, + "loss": 0.8086, + "step": 1684 + }, + { + "epoch": 3.86, + "learning_rate": 2.5858124358227853e-05, + "loss": 0.8513, + "step": 1685 + }, + { + "epoch": 3.86, + "learning_rate": 2.5758483339626738e-05, + "loss": 0.7107, + "step": 1686 + }, + { + "epoch": 3.86, + "learning_rate": 2.565900628050659e-05, + "loss": 0.7926, + "step": 1687 + }, + { + "epoch": 3.87, + "learning_rate": 2.5559693400558658e-05, + "loss": 0.7839, + "step": 1688 + }, + { + "epoch": 3.87, + "learning_rate": 2.546054491911147e-05, + "loss": 0.8132, + "step": 1689 + }, + { + "epoch": 3.87, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6755, + "step": 1690 + }, + { + "epoch": 3.87, + "learning_rate": 2.52627420272181e-05, + "loss": 0.7823, + "step": 1691 + }, + { + "epoch": 3.88, + "learning_rate": 2.5164088053611845e-05, + "loss": 0.8078, + "step": 1692 + }, + { + "epoch": 3.88, + "learning_rate": 2.5065599352185254e-05, + "loss": 0.7328, + "step": 1693 + }, + { + "epoch": 3.88, + "learning_rate": 2.4967276140446826e-05, + "loss": 0.9089, + "step": 1694 + }, + { + "epoch": 3.88, + "learning_rate": 2.48691186355395e-05, + "loss": 0.7683, + "step": 1695 + }, + { + "epoch": 3.89, + "learning_rate": 2.477112705424024e-05, + "loss": 0.7681, + "step": 1696 + }, + { + "epoch": 3.89, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.8331, + "step": 1697 + }, + { + "epoch": 3.89, + "learning_rate": 2.4575642527741415e-05, + "loss": 0.7678, + "step": 1698 + }, + { + "epoch": 3.89, + "learning_rate": 2.447815001426177e-05, + "loss": 0.7815, + "step": 1699 + }, + { + "epoch": 3.89, + "learning_rate": 
2.4380824287829074e-05, + "loss": 0.9155, + "step": 1700 + }, + { + "epoch": 3.9, + "learning_rate": 2.428366556338344e-05, + "loss": 0.7475, + "step": 1701 + }, + { + "epoch": 3.9, + "learning_rate": 2.4186674055496083e-05, + "loss": 0.6909, + "step": 1702 + }, + { + "epoch": 3.9, + "learning_rate": 2.4089849978368918e-05, + "loss": 0.7278, + "step": 1703 + }, + { + "epoch": 3.9, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8053, + "step": 1704 + }, + { + "epoch": 3.91, + "learning_rate": 2.389670497135379e-05, + "loss": 0.6703, + "step": 1705 + }, + { + "epoch": 3.91, + "learning_rate": 2.3800384468018954e-05, + "loss": 0.7334, + "step": 1706 + }, + { + "epoch": 3.91, + "learning_rate": 2.370423224854975e-05, + "loss": 0.7021, + "step": 1707 + }, + { + "epoch": 3.91, + "learning_rate": 2.3608248525294628e-05, + "loss": 0.7711, + "step": 1708 + }, + { + "epoch": 3.92, + "learning_rate": 2.3512433510229858e-05, + "loss": 0.8555, + "step": 1709 + }, + { + "epoch": 3.92, + "learning_rate": 2.3416787414959097e-05, + "loss": 0.7019, + "step": 1710 + }, + { + "epoch": 3.92, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.9331, + "step": 1711 + }, + { + "epoch": 3.92, + "learning_rate": 2.322600282834888e-05, + "loss": 0.7915, + "step": 1712 + }, + { + "epoch": 3.92, + "learning_rate": 2.3130864758349645e-05, + "loss": 0.8168, + "step": 1713 + }, + { + "epoch": 3.93, + "learning_rate": 2.303589645082411e-05, + "loss": 0.7711, + "step": 1714 + }, + { + "epoch": 3.93, + "learning_rate": 2.2941098115506065e-05, + "loss": 0.7319, + "step": 1715 + }, + { + "epoch": 3.93, + "learning_rate": 2.2846469961753915e-05, + "loss": 0.7473, + "step": 1716 + }, + { + "epoch": 3.93, + "learning_rate": 2.27520121985502e-05, + "loss": 0.7365, + "step": 1717 + }, + { + "epoch": 3.94, + "learning_rate": 2.265772503450122e-05, + "loss": 0.9078, + "step": 1718 + }, + { + "epoch": 3.94, + "learning_rate": 2.256360867783648e-05, + "loss": 0.6878, + "step": 1719 + }, + { + 
"epoch": 3.94, + "learning_rate": 2.246966333640823e-05, + "loss": 0.7913, + "step": 1720 + }, + { + "epoch": 3.94, + "learning_rate": 2.2375889217691137e-05, + "loss": 0.8684, + "step": 1721 + }, + { + "epoch": 3.95, + "learning_rate": 2.2282286528781605e-05, + "loss": 0.7516, + "step": 1722 + }, + { + "epoch": 3.95, + "learning_rate": 2.218885547639754e-05, + "loss": 0.787, + "step": 1723 + }, + { + "epoch": 3.95, + "learning_rate": 2.2095596266877782e-05, + "loss": 0.801, + "step": 1724 + }, + { + "epoch": 3.95, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.8423, + "step": 1725 + }, + { + "epoch": 3.95, + "learning_rate": 2.1909594199888372e-05, + "loss": 0.6984, + "step": 1726 + }, + { + "epoch": 3.96, + "learning_rate": 2.181685175319702e-05, + "loss": 0.7593, + "step": 1727 + }, + { + "epoch": 3.96, + "learning_rate": 2.172428197092561e-05, + "loss": 0.7661, + "step": 1728 + }, + { + "epoch": 3.96, + "learning_rate": 2.1631885057510838e-05, + "loss": 0.8231, + "step": 1729 + }, + { + "epoch": 3.96, + "learning_rate": 2.153966121700769e-05, + "loss": 0.7426, + "step": 1730 + }, + { + "epoch": 3.97, + "learning_rate": 2.1447610653088947e-05, + "loss": 0.7836, + "step": 1731 + }, + { + "epoch": 3.97, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.9467, + "step": 1732 + }, + { + "epoch": 3.97, + "learning_rate": 2.126403016778168e-05, + "loss": 0.8632, + "step": 1733 + }, + { + "epoch": 3.97, + "learning_rate": 2.117250065182349e-05, + "loss": 0.8532, + "step": 1734 + }, + { + "epoch": 3.97, + "learning_rate": 2.1081145223309395e-05, + "loss": 0.769, + "step": 1735 + }, + { + "epoch": 3.98, + "learning_rate": 2.0989964083994252e-05, + "loss": 0.6967, + "step": 1736 + }, + { + "epoch": 3.98, + "learning_rate": 2.08989574352481e-05, + "loss": 0.7737, + "step": 1737 + }, + { + "epoch": 3.98, + "learning_rate": 2.0808125478055505e-05, + "loss": 0.5646, + "step": 1738 + }, + { + "epoch": 3.98, + "learning_rate": 2.0717468413015283e-05, + "loss": 
0.7515, + "step": 1739 + }, + { + "epoch": 3.99, + "learning_rate": 2.0626986440340035e-05, + "loss": 0.718, + "step": 1740 + }, + { + "epoch": 3.99, + "learning_rate": 2.053667975985567e-05, + "loss": 0.8102, + "step": 1741 + }, + { + "epoch": 3.99, + "learning_rate": 2.0446548571000935e-05, + "loss": 0.8485, + "step": 1742 + }, + { + "epoch": 3.99, + "learning_rate": 2.035659307282699e-05, + "loss": 0.7086, + "step": 1743 + }, + { + "epoch": 4.0, + "learning_rate": 2.0266813463997092e-05, + "loss": 0.7731, + "step": 1744 + }, + { + "epoch": 4.0, + "learning_rate": 2.0177209942785958e-05, + "loss": 0.5973, + "step": 1745 + }, + { + "epoch": 4.0, + "learning_rate": 2.008778270707944e-05, + "loss": 0.8096, + "step": 1746 + }, + { + "epoch": 4.0, + "learning_rate": 1.99985319543741e-05, + "loss": 0.7078, + "step": 1747 + }, + { + "epoch": 4.0, + "learning_rate": 1.990945788177676e-05, + "loss": 0.752, + "step": 1748 + }, + { + "epoch": 4.01, + "learning_rate": 1.9820560686003986e-05, + "loss": 0.8476, + "step": 1749 + }, + { + "epoch": 4.01, + "learning_rate": 1.973184056338173e-05, + "loss": 0.6699, + "step": 1750 + }, + { + "epoch": 4.01, + "learning_rate": 1.9643297709844964e-05, + "loss": 0.853, + "step": 1751 + }, + { + "epoch": 4.01, + "learning_rate": 1.955493232093708e-05, + "loss": 0.7417, + "step": 1752 + }, + { + "epoch": 4.02, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7898, + "step": 1753 + }, + { + "epoch": 4.02, + "learning_rate": 1.937873471722158e-05, + "loss": 0.7147, + "step": 1754 + }, + { + "epoch": 4.02, + "learning_rate": 1.9290902891539474e-05, + "loss": 0.7374, + "step": 1755 + }, + { + "epoch": 4.02, + "learning_rate": 1.9203249308736394e-05, + "loss": 0.6925, + "step": 1756 + }, + { + "epoch": 4.03, + "learning_rate": 1.9115774162391876e-05, + "loss": 0.7645, + "step": 1757 + }, + { + "epoch": 4.03, + "learning_rate": 1.9028477645691335e-05, + "loss": 0.7832, + "step": 1758 + }, + { + "epoch": 4.03, + "learning_rate": 
1.8941359951425674e-05, + "loss": 0.7343, + "step": 1759 + }, + { + "epoch": 4.03, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.836, + "step": 1760 + }, + { + "epoch": 4.03, + "learning_rate": 1.876766179938785e-05, + "loss": 0.812, + "step": 1761 + }, + { + "epoch": 4.04, + "learning_rate": 1.8681081725221184e-05, + "loss": 0.6929, + "step": 1762 + }, + { + "epoch": 4.04, + "learning_rate": 1.8594681240699707e-05, + "loss": 0.689, + "step": 1763 + }, + { + "epoch": 4.04, + "learning_rate": 1.850846053663554e-05, + "loss": 0.6537, + "step": 1764 + }, + { + "epoch": 4.04, + "learning_rate": 1.842241980344369e-05, + "loss": 0.7959, + "step": 1765 + }, + { + "epoch": 4.05, + "learning_rate": 1.8336559231141726e-05, + "loss": 0.8377, + "step": 1766 + }, + { + "epoch": 4.05, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7923, + "step": 1767 + }, + { + "epoch": 4.05, + "learning_rate": 1.8165379327288113e-05, + "loss": 0.8056, + "step": 1768 + }, + { + "epoch": 4.05, + "learning_rate": 1.808006037378053e-05, + "loss": 0.7997, + "step": 1769 + }, + { + "epoch": 4.05, + "learning_rate": 1.7994922337250274e-05, + "loss": 0.8435, + "step": 1770 + }, + { + "epoch": 4.06, + "learning_rate": 1.790996540572133e-05, + "loss": 0.7299, + "step": 1771 + }, + { + "epoch": 4.06, + "learning_rate": 1.7825189766817728e-05, + "loss": 0.6559, + "step": 1772 + }, + { + "epoch": 4.06, + "learning_rate": 1.774059560776318e-05, + "loss": 0.8019, + "step": 1773 + }, + { + "epoch": 4.06, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.8159, + "step": 1774 + }, + { + "epoch": 4.07, + "learning_rate": 1.75719524760916e-05, + "loss": 0.7666, + "step": 1775 + }, + { + "epoch": 4.07, + "learning_rate": 1.748790387591629e-05, + "loss": 0.734, + "step": 1776 + }, + { + "epoch": 4.07, + "learning_rate": 1.7404037500472713e-05, + "loss": 0.6765, + "step": 1777 + }, + { + "epoch": 4.07, + "learning_rate": 1.7320353534976474e-05, + "loss": 0.7715, + "step": 1778 + }, + { + 
"epoch": 4.08, + "learning_rate": 1.723685216424029e-05, + "loss": 0.8079, + "step": 1779 + }, + { + "epoch": 4.08, + "learning_rate": 1.715353357267371e-05, + "loss": 0.7619, + "step": 1780 + }, + { + "epoch": 4.08, + "learning_rate": 1.707039794428259e-05, + "loss": 0.746, + "step": 1781 + }, + { + "epoch": 4.08, + "learning_rate": 1.6987445462668694e-05, + "loss": 0.8855, + "step": 1782 + }, + { + "epoch": 4.08, + "learning_rate": 1.6904676311029287e-05, + "loss": 0.8, + "step": 1783 + }, + { + "epoch": 4.09, + "learning_rate": 1.6822090672156855e-05, + "loss": 0.647, + "step": 1784 + }, + { + "epoch": 4.09, + "learning_rate": 1.6739688728438528e-05, + "loss": 0.8357, + "step": 1785 + }, + { + "epoch": 4.09, + "learning_rate": 1.6657470661855745e-05, + "loss": 0.723, + "step": 1786 + }, + { + "epoch": 4.09, + "learning_rate": 1.6575436653983922e-05, + "loss": 0.7253, + "step": 1787 + }, + { + "epoch": 4.1, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6337, + "step": 1788 + }, + { + "epoch": 4.1, + "learning_rate": 1.641192153864175e-05, + "loss": 0.7545, + "step": 1789 + }, + { + "epoch": 4.1, + "learning_rate": 1.633044079228817e-05, + "loss": 0.7717, + "step": 1790 + }, + { + "epoch": 4.1, + "learning_rate": 1.624914482687818e-05, + "loss": 0.7845, + "step": 1791 + }, + { + "epoch": 4.11, + "learning_rate": 1.6168033821950735e-05, + "loss": 0.7738, + "step": 1792 + }, + { + "epoch": 4.11, + "learning_rate": 1.6087107956636337e-05, + "loss": 0.8245, + "step": 1793 + }, + { + "epoch": 4.11, + "learning_rate": 1.6006367409656564e-05, + "loss": 0.6616, + "step": 1794 + }, + { + "epoch": 4.11, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7386, + "step": 1795 + }, + { + "epoch": 4.11, + "learning_rate": 1.584544298354059e-05, + "loss": 0.8618, + "step": 1796 + }, + { + "epoch": 4.12, + "learning_rate": 1.5765259459799663e-05, + "loss": 0.7219, + "step": 1797 + }, + { + "epoch": 4.12, + "learning_rate": 1.5685261965183195e-05, + "loss": 0.8272, + 
"step": 1798 + }, + { + "epoch": 4.12, + "learning_rate": 1.5605450676362464e-05, + "loss": 0.8477, + "step": 1799 + }, + { + "epoch": 4.12, + "learning_rate": 1.5525825769597624e-05, + "loss": 0.6754, + "step": 1800 + }, + { + "epoch": 4.13, + "learning_rate": 1.544638742073713e-05, + "loss": 0.7364, + "step": 1801 + }, + { + "epoch": 4.13, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7713, + "step": 1802 + }, + { + "epoch": 4.13, + "learning_rate": 1.528807109806273e-05, + "loss": 0.6916, + "step": 1803 + }, + { + "epoch": 4.13, + "learning_rate": 1.5209193473884232e-05, + "loss": 0.6522, + "step": 1804 + }, + { + "epoch": 4.14, + "learning_rate": 1.5130503106880078e-05, + "loss": 0.9412, + "step": 1805 + }, + { + "epoch": 4.14, + "learning_rate": 1.5052000170834901e-05, + "loss": 0.6304, + "step": 1806 + }, + { + "epoch": 4.14, + "learning_rate": 1.4973684839119362e-05, + "loss": 0.6736, + "step": 1807 + }, + { + "epoch": 4.14, + "learning_rate": 1.4895557284689799e-05, + "loss": 0.7606, + "step": 1808 + }, + { + "epoch": 4.14, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.8353, + "step": 1809 + }, + { + "epoch": 4.15, + "learning_rate": 1.4739866197440044e-05, + "loss": 0.795, + "step": 1810 + }, + { + "epoch": 4.15, + "learning_rate": 1.4662303008457534e-05, + "loss": 0.6965, + "step": 1811 + }, + { + "epoch": 4.15, + "learning_rate": 1.458492828443555e-05, + "loss": 0.8361, + "step": 1812 + }, + { + "epoch": 4.15, + "learning_rate": 1.450774219625316e-05, + "loss": 0.8829, + "step": 1813 + }, + { + "epoch": 4.16, + "learning_rate": 1.443074491437283e-05, + "loss": 0.7409, + "step": 1814 + }, + { + "epoch": 4.16, + "learning_rate": 1.4353936608840013e-05, + "loss": 0.5861, + "step": 1815 + }, + { + "epoch": 4.16, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.6337, + "step": 1816 + }, + { + "epoch": 4.16, + "learning_rate": 1.420088760491174e-05, + "loss": 0.7182, + "step": 1817 + }, + { + "epoch": 4.16, + "learning_rate": 
1.4124647244519029e-05, + "loss": 0.7752, + "step": 1818 + }, + { + "epoch": 4.17, + "learning_rate": 1.4048596536478531e-05, + "loss": 0.7425, + "step": 1819 + }, + { + "epoch": 4.17, + "learning_rate": 1.3972735648745294e-05, + "loss": 0.6673, + "step": 1820 + }, + { + "epoch": 4.17, + "learning_rate": 1.3897064748855082e-05, + "loss": 0.6985, + "step": 1821 + }, + { + "epoch": 4.17, + "learning_rate": 1.3821584003924127e-05, + "loss": 0.8485, + "step": 1822 + }, + { + "epoch": 4.18, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8048, + "step": 1823 + }, + { + "epoch": 4.18, + "learning_rate": 1.3671193645304781e-05, + "loss": 0.831, + "step": 1824 + }, + { + "epoch": 4.18, + "learning_rate": 1.3596284363747569e-05, + "loss": 0.7399, + "step": 1825 + }, + { + "epoch": 4.18, + "learning_rate": 1.3521565901411326e-05, + "loss": 0.7591, + "step": 1826 + }, + { + "epoch": 4.19, + "learning_rate": 1.3447038423308845e-05, + "loss": 0.7853, + "step": 1827 + }, + { + "epoch": 4.19, + "learning_rate": 1.3372702094031108e-05, + "loss": 0.8306, + "step": 1828 + }, + { + "epoch": 4.19, + "learning_rate": 1.329855707774703e-05, + "loss": 0.7283, + "step": 1829 + }, + { + "epoch": 4.19, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.645, + "step": 1830 + }, + { + "epoch": 4.19, + "learning_rate": 1.3150841638722355e-05, + "loss": 0.7112, + "step": 1831 + }, + { + "epoch": 4.2, + "learning_rate": 1.3077271542205515e-05, + "loss": 0.6324, + "step": 1832 + }, + { + "epoch": 4.2, + "learning_rate": 1.300389341112913e-05, + "loss": 0.684, + "step": 1833 + }, + { + "epoch": 4.2, + "learning_rate": 1.2930707407545917e-05, + "loss": 0.9168, + "step": 1834 + }, + { + "epoch": 4.2, + "learning_rate": 1.2857713693084271e-05, + "loss": 0.6535, + "step": 1835 + }, + { + "epoch": 4.21, + "learning_rate": 1.2784912428947992e-05, + "loss": 0.8242, + "step": 1836 + }, + { + "epoch": 4.21, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7418, + "step": 1837 + }, + { + 
"epoch": 4.21, + "learning_rate": 1.263988789434104e-05, + "loss": 0.827, + "step": 1838 + }, + { + "epoch": 4.21, + "learning_rate": 1.256766494415137e-05, + "loss": 0.8472, + "step": 1839 + }, + { + "epoch": 4.22, + "learning_rate": 1.2495635084848356e-05, + "loss": 0.8094, + "step": 1840 + }, + { + "epoch": 4.22, + "learning_rate": 1.242379847550712e-05, + "loss": 0.7753, + "step": 1841 + }, + { + "epoch": 4.22, + "learning_rate": 1.2352155274775967e-05, + "loss": 0.727, + "step": 1842 + }, + { + "epoch": 4.22, + "learning_rate": 1.2280705640876134e-05, + "loss": 0.7839, + "step": 1843 + }, + { + "epoch": 4.22, + "learning_rate": 1.220944973160133e-05, + "loss": 0.8324, + "step": 1844 + }, + { + "epoch": 4.23, + "learning_rate": 1.2138387704317421e-05, + "loss": 0.874, + "step": 1845 + }, + { + "epoch": 4.23, + "learning_rate": 1.2067519715962116e-05, + "loss": 0.6834, + "step": 1846 + }, + { + "epoch": 4.23, + "learning_rate": 1.199684592304462e-05, + "loss": 0.6735, + "step": 1847 + }, + { + "epoch": 4.23, + "learning_rate": 1.1926366481645213e-05, + "loss": 0.7326, + "step": 1848 + }, + { + "epoch": 4.24, + "learning_rate": 1.1856081547414966e-05, + "loss": 0.7304, + "step": 1849 + }, + { + "epoch": 4.24, + "learning_rate": 1.1785991275575426e-05, + "loss": 0.7984, + "step": 1850 + }, + { + "epoch": 4.24, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6654, + "step": 1851 + }, + { + "epoch": 4.24, + "learning_rate": 1.1646395337804683e-05, + "loss": 0.6408, + "step": 1852 + }, + { + "epoch": 4.25, + "learning_rate": 1.157688998016564e-05, + "loss": 0.8186, + "step": 1853 + }, + { + "epoch": 4.25, + "learning_rate": 1.1507579901500909e-05, + "loss": 0.7184, + "step": 1854 + }, + { + "epoch": 4.25, + "learning_rate": 1.1438465254879115e-05, + "loss": 0.7188, + "step": 1855 + }, + { + "epoch": 4.25, + "learning_rate": 1.1369546192937264e-05, + "loss": 0.7905, + "step": 1856 + }, + { + "epoch": 4.25, + "learning_rate": 1.1300822867880378e-05, + "loss": 
0.7035, + "step": 1857 + }, + { + "epoch": 4.26, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.71, + "step": 1858 + }, + { + "epoch": 4.26, + "learning_rate": 1.1163964035079976e-05, + "loss": 0.6428, + "step": 1859 + }, + { + "epoch": 4.26, + "learning_rate": 1.1095828829583843e-05, + "loss": 0.9633, + "step": 1860 + }, + { + "epoch": 4.26, + "learning_rate": 1.102788996546672e-05, + "loss": 0.7724, + "step": 1861 + }, + { + "epoch": 4.27, + "learning_rate": 1.0960147592768988e-05, + "loss": 0.7484, + "step": 1862 + }, + { + "epoch": 4.27, + "learning_rate": 1.0892601861096985e-05, + "loss": 0.5761, + "step": 1863 + }, + { + "epoch": 4.27, + "learning_rate": 1.082525291962283e-05, + "loss": 0.8085, + "step": 1864 + }, + { + "epoch": 4.27, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7404, + "step": 1865 + }, + { + "epoch": 4.27, + "learning_rate": 1.0691146001783081e-05, + "loss": 0.7665, + "step": 1866 + }, + { + "epoch": 4.28, + "learning_rate": 1.0624388321587387e-05, + "loss": 0.7864, + "step": 1867 + }, + { + "epoch": 4.28, + "learning_rate": 1.0557828023928607e-05, + "loss": 0.7268, + "step": 1868 + }, + { + "epoch": 4.28, + "learning_rate": 1.0491465255802602e-05, + "loss": 0.8272, + "step": 1869 + }, + { + "epoch": 4.28, + "learning_rate": 1.0425300163768902e-05, + "loss": 0.7142, + "step": 1870 + }, + { + "epoch": 4.29, + "learning_rate": 1.0359332893950512e-05, + "loss": 0.8686, + "step": 1871 + }, + { + "epoch": 4.29, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6337, + "step": 1872 + }, + { + "epoch": 4.29, + "learning_rate": 1.0227992403267073e-05, + "loss": 0.6917, + "step": 1873 + }, + { + "epoch": 4.29, + "learning_rate": 1.0162619472462354e-05, + "loss": 0.8307, + "step": 1874 + }, + { + "epoch": 4.3, + "learning_rate": 1.009744494399295e-05, + "loss": 0.7187, + "step": 1875 + }, + { + "epoch": 4.3, + "learning_rate": 1.0032468961794317e-05, + "loss": 0.7681, + "step": 1876 + }, + { + "epoch": 4.3, + "learning_rate": 
9.967691669363333e-06, + "loss": 0.8099, + "step": 1877 + }, + { + "epoch": 4.3, + "learning_rate": 9.903113209758096e-06, + "loss": 0.7846, + "step": 1878 + }, + { + "epoch": 4.3, + "learning_rate": 9.838733725597615e-06, + "loss": 0.822, + "step": 1879 + }, + { + "epoch": 4.31, + "learning_rate": 9.774553359061489e-06, + "loss": 0.6103, + "step": 1880 + }, + { + "epoch": 4.31, + "learning_rate": 9.710572251889504e-06, + "loss": 0.9409, + "step": 1881 + }, + { + "epoch": 4.31, + "learning_rate": 9.646790545381445e-06, + "loss": 0.6709, + "step": 1882 + }, + { + "epoch": 4.31, + "learning_rate": 9.583208380396713e-06, + "loss": 0.7539, + "step": 1883 + }, + { + "epoch": 4.32, + "learning_rate": 9.51982589735403e-06, + "loss": 0.7002, + "step": 1884 + }, + { + "epoch": 4.32, + "learning_rate": 9.45664323623111e-06, + "loss": 0.8161, + "step": 1885 + }, + { + "epoch": 4.32, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6838, + "step": 1886 + }, + { + "epoch": 4.32, + "learning_rate": 9.330877937448723e-06, + "loss": 0.7319, + "step": 1887 + }, + { + "epoch": 4.33, + "learning_rate": 9.268295577536978e-06, + "loss": 0.6035, + "step": 1888 + }, + { + "epoch": 4.33, + "learning_rate": 9.205913595039883e-06, + "loss": 0.7876, + "step": 1889 + }, + { + "epoch": 4.33, + "learning_rate": 9.14373212772559e-06, + "loss": 0.8669, + "step": 1890 + }, + { + "epoch": 4.33, + "learning_rate": 9.081751312919406e-06, + "loss": 0.7629, + "step": 1891 + }, + { + "epoch": 4.33, + "learning_rate": 9.019971287503571e-06, + "loss": 0.7332, + "step": 1892 + }, + { + "epoch": 4.34, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7497, + "step": 1893 + }, + { + "epoch": 4.34, + "learning_rate": 8.897014150154237e-06, + "loss": 0.7535, + "step": 1894 + }, + { + "epoch": 4.34, + "learning_rate": 8.835837309766726e-06, + "loss": 0.727, + "step": 1895 + }, + { + "epoch": 4.34, + "learning_rate": 8.774861801861e-06, + "loss": 0.8342, + "step": 1896 + }, + { + "epoch": 4.35, + 
"learning_rate": 8.714087761099078e-06, + "loss": 0.7453, + "step": 1897 + }, + { + "epoch": 4.35, + "learning_rate": 8.653515321698025e-06, + "loss": 0.766, + "step": 1898 + }, + { + "epoch": 4.35, + "learning_rate": 8.593144617429726e-06, + "loss": 0.7517, + "step": 1899 + }, + { + "epoch": 4.35, + "learning_rate": 8.532975781620512e-06, + "loss": 0.8128, + "step": 1900 + }, + { + "epoch": 4.36, + "learning_rate": 8.473008947150873e-06, + "loss": 0.6856, + "step": 1901 + }, + { + "epoch": 4.36, + "learning_rate": 8.413244246455254e-06, + "loss": 0.8751, + "step": 1902 + }, + { + "epoch": 4.36, + "learning_rate": 8.35368181152163e-06, + "loss": 0.8064, + "step": 1903 + }, + { + "epoch": 4.36, + "learning_rate": 8.29432177389129e-06, + "loss": 0.7193, + "step": 1904 + }, + { + "epoch": 4.36, + "learning_rate": 8.235164264658568e-06, + "loss": 0.7799, + "step": 1905 + }, + { + "epoch": 4.37, + "learning_rate": 8.176209414470525e-06, + "loss": 0.8164, + "step": 1906 + }, + { + "epoch": 4.37, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7392, + "step": 1907 + }, + { + "epoch": 4.37, + "learning_rate": 8.058908211578475e-06, + "loss": 0.8726, + "step": 1908 + }, + { + "epoch": 4.37, + "learning_rate": 8.000562117929589e-06, + "loss": 0.7835, + "step": 1909 + }, + { + "epoch": 4.38, + "learning_rate": 7.942419201435013e-06, + "loss": 0.7701, + "step": 1910 + }, + { + "epoch": 4.38, + "learning_rate": 7.884479590501092e-06, + "loss": 0.7495, + "step": 1911 + }, + { + "epoch": 4.38, + "learning_rate": 7.826743413085192e-06, + "loss": 0.6697, + "step": 1912 + }, + { + "epoch": 4.38, + "learning_rate": 7.769210796695415e-06, + "loss": 0.8891, + "step": 1913 + }, + { + "epoch": 4.38, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8632, + "step": 1914 + }, + { + "epoch": 4.39, + "learning_rate": 7.65475675477848e-06, + "loss": 0.8605, + "step": 1915 + }, + { + "epoch": 4.39, + "learning_rate": 7.5978355820185865e-06, + "loss": 0.6544, + "step": 1916 + }, + { + 
"epoch": 4.39, + "learning_rate": 7.541118475818787e-06, + "loss": 0.6335, + "step": 1917 + }, + { + "epoch": 4.39, + "learning_rate": 7.484605561436575e-06, + "loss": 0.7718, + "step": 1918 + }, + { + "epoch": 4.4, + "learning_rate": 7.428296963678527e-06, + "loss": 0.7532, + "step": 1919 + }, + { + "epoch": 4.4, + "learning_rate": 7.372192806899947e-06, + "loss": 0.8253, + "step": 1920 + }, + { + "epoch": 4.4, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8023, + "step": 1921 + }, + { + "epoch": 4.4, + "learning_rate": 7.260598311444822e-06, + "loss": 0.8046, + "step": 1922 + }, + { + "epoch": 4.41, + "learning_rate": 7.205108219220335e-06, + "loss": 0.8704, + "step": 1923 + }, + { + "epoch": 4.41, + "learning_rate": 7.1498230608789465e-06, + "loss": 0.7408, + "step": 1924 + }, + { + "epoch": 4.41, + "learning_rate": 7.094742958515721e-06, + "loss": 0.7736, + "step": 1925 + }, + { + "epoch": 4.41, + "learning_rate": 7.039868033772956e-06, + "loss": 0.7339, + "step": 1926 + }, + { + "epoch": 4.41, + "learning_rate": 6.985198407839755e-06, + "loss": 0.7555, + "step": 1927 + }, + { + "epoch": 4.42, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7876, + "step": 1928 + }, + { + "epoch": 4.42, + "learning_rate": 6.876475534891236e-06, + "loss": 0.8501, + "step": 1929 + }, + { + "epoch": 4.42, + "learning_rate": 6.822422527986161e-06, + "loss": 0.72, + "step": 1930 + }, + { + "epoch": 4.42, + "learning_rate": 6.768575300110514e-06, + "loss": 0.9524, + "step": 1931 + }, + { + "epoch": 4.43, + "learning_rate": 6.714933970183812e-06, + "loss": 0.7231, + "step": 1932 + }, + { + "epoch": 4.43, + "learning_rate": 6.661498656670828e-06, + "loss": 0.7651, + "step": 1933 + }, + { + "epoch": 4.43, + "learning_rate": 6.60826947758132e-06, + "loss": 0.911, + "step": 1934 + }, + { + "epoch": 4.43, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7647, + "step": 1935 + }, + { + "epoch": 4.44, + "learning_rate": 6.5024299924355996e-06, + "loss": 0.8634, + "step": 
1936 + }, + { + "epoch": 4.44, + "learning_rate": 6.4498199201217024e-06, + "loss": 0.8882, + "step": 1937 + }, + { + "epoch": 4.44, + "learning_rate": 6.39741644971551e-06, + "loss": 0.7741, + "step": 1938 + }, + { + "epoch": 4.44, + "learning_rate": 6.345219696948046e-06, + "loss": 0.7915, + "step": 1939 + }, + { + "epoch": 4.44, + "learning_rate": 6.2932297770937785e-06, + "loss": 0.8229, + "step": 1940 + }, + { + "epoch": 4.45, + "learning_rate": 6.2414468049703965e-06, + "loss": 0.7476, + "step": 1941 + }, + { + "epoch": 4.45, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6, + "step": 1942 + }, + { + "epoch": 4.45, + "learning_rate": 6.138502160901727e-06, + "loss": 0.7207, + "step": 1943 + }, + { + "epoch": 4.45, + "learning_rate": 6.087340716305623e-06, + "loss": 0.8019, + "step": 1944 + }, + { + "epoch": 4.46, + "learning_rate": 6.036386674138339e-06, + "loss": 0.7223, + "step": 1945 + }, + { + "epoch": 4.46, + "learning_rate": 5.9856401469299054e-06, + "loss": 0.602, + "step": 1946 + }, + { + "epoch": 4.46, + "learning_rate": 5.935101246752029e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 4.46, + "learning_rate": 5.88477008521785e-06, + "loss": 0.6789, + "step": 1948 + }, + { + "epoch": 4.47, + "learning_rate": 5.834646773481811e-06, + "loss": 0.738, + "step": 1949 + }, + { + "epoch": 4.47, + "learning_rate": 5.7847314222392755e-06, + "loss": 0.8363, + "step": 1950 + }, + { + "epoch": 4.47, + "learning_rate": 5.735024141726319e-06, + "loss": 0.8111, + "step": 1951 + }, + { + "epoch": 4.47, + "learning_rate": 5.6855250417195525e-06, + "loss": 0.6808, + "step": 1952 + }, + { + "epoch": 4.47, + "learning_rate": 5.636234231535775e-06, + "loss": 0.8633, + "step": 1953 + }, + { + "epoch": 4.48, + "learning_rate": 5.587151820031811e-06, + "loss": 0.7437, + "step": 1954 + }, + { + "epoch": 4.48, + "learning_rate": 5.538277915604273e-06, + "loss": 0.7094, + "step": 1955 + }, + { + "epoch": 4.48, + "learning_rate": 5.489612626189245e-06, + 
"loss": 0.6237, + "step": 1956 + }, + { + "epoch": 4.48, + "learning_rate": 5.441156059262109e-06, + "loss": 0.7038, + "step": 1957 + }, + { + "epoch": 4.49, + "learning_rate": 5.392908321837276e-06, + "loss": 0.7563, + "step": 1958 + }, + { + "epoch": 4.49, + "learning_rate": 5.344869520468021e-06, + "loss": 0.88, + "step": 1959 + }, + { + "epoch": 4.49, + "learning_rate": 5.297039761246137e-06, + "loss": 0.7696, + "step": 1960 + }, + { + "epoch": 4.49, + "learning_rate": 5.24941914980176e-06, + "loss": 0.7964, + "step": 1961 + }, + { + "epoch": 4.49, + "learning_rate": 5.202007791303165e-06, + "loss": 0.6229, + "step": 1962 + }, + { + "epoch": 4.5, + "learning_rate": 5.154805790456485e-06, + "loss": 0.8556, + "step": 1963 + }, + { + "epoch": 4.5, + "learning_rate": 5.107813251505455e-06, + "loss": 0.7431, + "step": 1964 + }, + { + "epoch": 4.5, + "learning_rate": 5.061030278231305e-06, + "loss": 0.8423, + "step": 1965 + }, + { + "epoch": 4.5, + "learning_rate": 5.014456973952375e-06, + "loss": 0.6179, + "step": 1966 + }, + { + "epoch": 4.51, + "learning_rate": 4.968093441523958e-06, + "loss": 0.7416, + "step": 1967 + }, + { + "epoch": 4.51, + "learning_rate": 4.921939783338137e-06, + "loss": 0.5964, + "step": 1968 + }, + { + "epoch": 4.51, + "learning_rate": 4.8759961013234545e-06, + "loss": 0.7621, + "step": 1969 + }, + { + "epoch": 4.51, + "learning_rate": 4.830262496944693e-06, + "loss": 0.8042, + "step": 1970 + }, + { + "epoch": 4.52, + "learning_rate": 4.7847390712027594e-06, + "loss": 0.8731, + "step": 1971 + }, + { + "epoch": 4.52, + "learning_rate": 4.739425924634366e-06, + "loss": 0.8133, + "step": 1972 + }, + { + "epoch": 4.52, + "learning_rate": 4.694323157311808e-06, + "loss": 0.6752, + "step": 1973 + }, + { + "epoch": 4.52, + "learning_rate": 4.6494308688427635e-06, + "loss": 0.7556, + "step": 1974 + }, + { + "epoch": 4.52, + "learning_rate": 4.604749158370125e-06, + "loss": 0.7755, + "step": 1975 + }, + { + "epoch": 4.53, + "learning_rate": 
4.560278124571671e-06, + "loss": 0.8052, + "step": 1976 + }, + { + "epoch": 4.53, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7026, + "step": 1977 + }, + { + "epoch": 4.53, + "learning_rate": 4.4719684793819935e-06, + "loss": 0.7411, + "step": 1978 + }, + { + "epoch": 4.53, + "learning_rate": 4.4281300630191865e-06, + "loss": 0.7057, + "step": 1979 + }, + { + "epoch": 4.54, + "learning_rate": 4.384502713386918e-06, + "loss": 0.8068, + "step": 1980 + }, + { + "epoch": 4.54, + "learning_rate": 4.341086526834492e-06, + "loss": 0.6784, + "step": 1981 + }, + { + "epoch": 4.54, + "learning_rate": 4.2978815992448576e-06, + "loss": 0.756, + "step": 1982 + }, + { + "epoch": 4.54, + "learning_rate": 4.254888026034398e-06, + "loss": 0.5434, + "step": 1983 + }, + { + "epoch": 4.55, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6606, + "step": 1984 + }, + { + "epoch": 4.55, + "learning_rate": 4.1695353220825184e-06, + "loss": 0.6787, + "step": 1985 + }, + { + "epoch": 4.55, + "learning_rate": 4.127176379839193e-06, + "loss": 0.7113, + "step": 1986 + }, + { + "epoch": 4.55, + "learning_rate": 4.085029168970855e-06, + "loss": 0.843, + "step": 1987 + }, + { + "epoch": 4.55, + "learning_rate": 4.043093782557939e-06, + "loss": 0.761, + "step": 1988 + }, + { + "epoch": 4.56, + "learning_rate": 4.001370313213115e-06, + "loss": 0.7123, + "step": 1989 + }, + { + "epoch": 4.56, + "learning_rate": 3.959858853081033e-06, + "loss": 0.8301, + "step": 1990 + }, + { + "epoch": 4.56, + "learning_rate": 3.918559493838114e-06, + "loss": 0.6627, + "step": 1991 + }, + { + "epoch": 4.56, + "learning_rate": 3.877472326692389e-06, + "loss": 0.928, + "step": 1992 + }, + { + "epoch": 4.57, + "learning_rate": 3.83659744238325e-06, + "loss": 0.7167, + "step": 1993 + }, + { + "epoch": 4.57, + "learning_rate": 3.7959349311812665e-06, + "loss": 0.8577, + "step": 1994 + }, + { + "epoch": 4.57, + "learning_rate": 3.755484882888005e-06, + "loss": 0.888, + "step": 1995 + }, + { + "epoch": 4.57, 
+ "learning_rate": 3.7152473868358404e-06, + "loss": 0.7254, + "step": 1996 + }, + { + "epoch": 4.58, + "learning_rate": 3.6752225318876787e-06, + "loss": 0.673, + "step": 1997 + }, + { + "epoch": 4.58, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7485, + "step": 1998 + }, + { + "epoch": 4.58, + "learning_rate": 3.5958110984068873e-06, + "loss": 0.8042, + "step": 1999 + }, + { + "epoch": 4.58, + "learning_rate": 3.5564246952512815e-06, + "loss": 0.8469, + "step": 2000 + }, + { + "epoch": 4.58, + "learning_rate": 3.5172512839533934e-06, + "loss": 0.8005, + "step": 2001 + }, + { + "epoch": 4.59, + "learning_rate": 3.478290951026153e-06, + "loss": 0.8436, + "step": 2002 + }, + { + "epoch": 4.59, + "learning_rate": 3.4395437825119116e-06, + "loss": 0.7417, + "step": 2003 + }, + { + "epoch": 4.59, + "learning_rate": 3.401009863982296e-06, + "loss": 0.8275, + "step": 2004 + }, + { + "epoch": 4.59, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6207, + "step": 2005 + }, + { + "epoch": 4.6, + "learning_rate": 3.3245821168083945e-06, + "loss": 0.8, + "step": 2006 + }, + { + "epoch": 4.6, + "learning_rate": 3.2866884569517807e-06, + "loss": 0.7947, + "step": 2007 + }, + { + "epoch": 4.6, + "learning_rate": 3.2490083846547836e-06, + "loss": 0.6588, + "step": 2008 + }, + { + "epoch": 4.6, + "learning_rate": 3.2115419831323714e-06, + "loss": 0.7956, + "step": 2009 + }, + { + "epoch": 4.6, + "learning_rate": 3.1742893351276117e-06, + "loss": 0.8266, + "step": 2010 + }, + { + "epoch": 4.61, + "learning_rate": 3.137250522911528e-06, + "loss": 0.7252, + "step": 2011 + }, + { + "epoch": 4.61, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8708, + "step": 2012 + }, + { + "epoch": 4.61, + "learning_rate": 3.063814732568038e-06, + "loss": 0.742, + "step": 2013 + }, + { + "epoch": 4.61, + "learning_rate": 3.0274179166206784e-06, + "loss": 0.7555, + "step": 2014 + }, + { + "epoch": 4.62, + "learning_rate": 2.9912352608217785e-06, + "loss": 0.7412, + "step": 2015 
+ }, + { + "epoch": 4.62, + "learning_rate": 2.9552668450792965e-06, + "loss": 0.8129, + "step": 2016 + }, + { + "epoch": 4.62, + "learning_rate": 2.919512748828079e-06, + "loss": 0.7253, + "step": 2017 + }, + { + "epoch": 4.62, + "learning_rate": 2.883973051029654e-06, + "loss": 0.6802, + "step": 2018 + }, + { + "epoch": 4.63, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7207, + "step": 2019 + }, + { + "epoch": 4.63, + "learning_rate": 2.8135371642695864e-06, + "loss": 0.7983, + "step": 2020 + }, + { + "epoch": 4.63, + "learning_rate": 2.778641130862858e-06, + "loss": 0.8213, + "step": 2021 + }, + { + "epoch": 4.63, + "learning_rate": 2.7439598070183703e-06, + "loss": 0.8352, + "step": 2022 + }, + { + "epoch": 4.63, + "learning_rate": 2.7094932693284558e-06, + "loss": 0.6755, + "step": 2023 + }, + { + "epoch": 4.64, + "learning_rate": 2.6752415939111155e-06, + "loss": 0.7513, + "step": 2024 + }, + { + "epoch": 4.64, + "learning_rate": 2.6412048564098403e-06, + "loss": 0.7148, + "step": 2025 + }, + { + "epoch": 4.64, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7626, + "step": 2026 + }, + { + "epoch": 4.64, + "learning_rate": 2.5737764953558176e-06, + "loss": 0.8691, + "step": 2027 + }, + { + "epoch": 4.65, + "learning_rate": 2.540385020715963e-06, + "loss": 0.6038, + "step": 2028 + }, + { + "epoch": 4.65, + "learning_rate": 2.5072087818176382e-06, + "loss": 0.8152, + "step": 2029 + }, + { + "epoch": 4.65, + "learning_rate": 2.4742478519292455e-06, + "loss": 0.6593, + "step": 2030 + }, + { + "epoch": 4.65, + "learning_rate": 2.4415023038437344e-06, + "loss": 0.6944, + "step": 2031 + }, + { + "epoch": 4.66, + "learning_rate": 2.408972209878335e-06, + "loss": 0.8039, + "step": 2032 + }, + { + "epoch": 4.66, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.8279, + "step": 2033 + }, + { + "epoch": 4.66, + "learning_rate": 2.344558671197694e-06, + "loss": 0.7275, + "step": 2034 + }, + { + "epoch": 4.66, + "learning_rate": 2.3126753687372273e-06, 
+ "loss": 0.6573, + "step": 2035 + }, + { + "epoch": 4.66, + "learning_rate": 2.2810078049061103e-06, + "loss": 0.7047, + "step": 2036 + }, + { + "epoch": 4.67, + "learning_rate": 2.249556049640933e-06, + "loss": 0.7538, + "step": 2037 + }, + { + "epoch": 4.67, + "learning_rate": 2.2183201724016667e-06, + "loss": 0.7396, + "step": 2038 + }, + { + "epoch": 4.67, + "learning_rate": 2.1873002421715306e-06, + "loss": 0.848, + "step": 2039 + }, + { + "epoch": 4.67, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.8045, + "step": 2040 + }, + { + "epoch": 4.68, + "learning_rate": 2.1259084962867327e-06, + "loss": 0.8433, + "step": 2041 + }, + { + "epoch": 4.68, + "learning_rate": 2.0955368162133505e-06, + "loss": 0.8063, + "step": 2042 + }, + { + "epoch": 4.68, + "learning_rate": 2.065381354311313e-06, + "loss": 0.8202, + "step": 2043 + }, + { + "epoch": 4.68, + "learning_rate": 2.035442177177782e-06, + "loss": 0.8419, + "step": 2044 + }, + { + "epoch": 4.68, + "learning_rate": 2.005719350932267e-06, + "loss": 0.8671, + "step": 2045 + }, + { + "epoch": 4.69, + "learning_rate": 1.9762129412164598e-06, + "loss": 0.7617, + "step": 2046 + }, + { + "epoch": 4.69, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6608, + "step": 2047 + }, + { + "epoch": 4.69, + "learning_rate": 1.91784963155085e-06, + "loss": 0.821, + "step": 2048 + }, + { + "epoch": 4.69, + "learning_rate": 1.8889928604941431e-06, + "loss": 0.7216, + "step": 2049 + }, + { + "epoch": 4.7, + "learning_rate": 1.860352763753004e-06, + "loss": 0.8276, + "step": 2050 + }, + { + "epoch": 4.7, + "learning_rate": 1.8319294045779921e-06, + "loss": 0.8474, + "step": 2051 + }, + { + "epoch": 4.7, + "learning_rate": 1.803722845740985e-06, + "loss": 0.8173, + "step": 2052 + }, + { + "epoch": 4.7, + "learning_rate": 1.7757331495350306e-06, + "loss": 0.6578, + "step": 2053 + }, + { + "epoch": 4.71, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.8437, + "step": 2054 + }, + { + "epoch": 4.71, + 
"learning_rate": 1.7204045917938671e-06, + "loss": 0.8754, + "step": 2055 + }, + { + "epoch": 4.71, + "learning_rate": 1.6930658524496156e-06, + "loss": 0.7193, + "step": 2056 + }, + { + "epoch": 4.71, + "learning_rate": 1.6659442201180543e-06, + "loss": 0.7511, + "step": 2057 + }, + { + "epoch": 4.71, + "learning_rate": 1.639039754696281e-06, + "loss": 0.7002, + "step": 2058 + }, + { + "epoch": 4.72, + "learning_rate": 1.6123525156017227e-06, + "loss": 0.7741, + "step": 2059 + }, + { + "epoch": 4.72, + "learning_rate": 1.585882561772112e-06, + "loss": 0.8037, + "step": 2060 + }, + { + "epoch": 4.72, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7227, + "step": 2061 + }, + { + "epoch": 4.72, + "learning_rate": 1.5335947432591702e-06, + "loss": 0.7362, + "step": 2062 + }, + { + "epoch": 4.73, + "learning_rate": 1.5077769940514242e-06, + "loss": 0.8061, + "step": 2063 + }, + { + "epoch": 4.73, + "learning_rate": 1.4821767610595837e-06, + "loss": 0.7704, + "step": 2064 + }, + { + "epoch": 4.73, + "learning_rate": 1.4567941008207465e-06, + "loss": 0.6577, + "step": 2065 + }, + { + "epoch": 4.73, + "learning_rate": 1.4316290693915156e-06, + "loss": 0.7877, + "step": 2066 + }, + { + "epoch": 4.74, + "learning_rate": 1.4066817223478912e-06, + "loss": 0.7932, + "step": 2067 + }, + { + "epoch": 4.74, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.8258, + "step": 2068 + }, + { + "epoch": 4.74, + "learning_rate": 1.3574403013175252e-06, + "loss": 0.7161, + "step": 2069 + }, + { + "epoch": 4.74, + "learning_rate": 1.333146336078528e-06, + "loss": 0.7057, + "step": 2070 + }, + { + "epoch": 4.74, + "learning_rate": 1.3090702727203918e-06, + "loss": 0.6727, + "step": 2071 + }, + { + "epoch": 4.75, + "learning_rate": 1.2852121644141623e-06, + "loss": 0.7107, + "step": 2072 + }, + { + "epoch": 4.75, + "learning_rate": 1.2615720638495142e-06, + "loss": 0.6526, + "step": 2073 + }, + { + "epoch": 4.75, + "learning_rate": 1.2381500232347187e-06, + "loss": 0.811, + 
"step": 2074 + }, + { + "epoch": 4.75, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.8009, + "step": 2075 + }, + { + "epoch": 4.76, + "learning_rate": 1.191960328279551e-06, + "loss": 0.9445, + "step": 2076 + }, + { + "epoch": 4.76, + "learning_rate": 1.1691927759473254e-06, + "loss": 0.7614, + "step": 2077 + }, + { + "epoch": 4.76, + "learning_rate": 1.1466434875809785e-06, + "loss": 0.927, + "step": 2078 + }, + { + "epoch": 4.76, + "learning_rate": 1.1243125129797194e-06, + "loss": 0.7276, + "step": 2079 + }, + { + "epoch": 4.77, + "learning_rate": 1.1021999014606322e-06, + "loss": 0.6772, + "step": 2080 + }, + { + "epoch": 4.77, + "learning_rate": 1.0803057018585527e-06, + "loss": 0.7344, + "step": 2081 + }, + { + "epoch": 4.77, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7197, + "step": 2082 + }, + { + "epoch": 4.77, + "learning_rate": 1.0371727313329027e-06, + "loss": 0.7858, + "step": 2083 + }, + { + "epoch": 4.77, + "learning_rate": 1.0159340556668007e-06, + "loss": 0.8079, + "step": 2084 + }, + { + "epoch": 4.78, + "learning_rate": 9.949139824324661e-07, + "loss": 0.8665, + "step": 2085 + }, + { + "epoch": 4.78, + "learning_rate": 9.74112558051865e-07, + "loss": 0.6908, + "step": 2086 + }, + { + "epoch": 4.78, + "learning_rate": 9.535298284641725e-07, + "loss": 0.7742, + "step": 2087 + }, + { + "epoch": 4.78, + "learning_rate": 9.331658391255161e-07, + "loss": 0.7953, + "step": 2088 + }, + { + "epoch": 4.79, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7418, + "step": 2089 + }, + { + "epoch": 4.79, + "learning_rate": 8.930942606044435e-07, + "loss": 0.8128, + "step": 2090 + }, + { + "epoch": 4.79, + "learning_rate": 8.733867599185486e-07, + "loss": 0.8256, + "step": 2091 + }, + { + "epoch": 4.79, + "learning_rate": 8.538981764745102e-07, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 4.79, + "learning_rate": 8.346285533121223e-07, + "loss": 0.7894, + "step": 2093 + }, + { + "epoch": 4.8, + "learning_rate": 
8.155779329875767e-07, + "loss": 0.7239, + "step": 2094 + }, + { + "epoch": 4.8, + "learning_rate": 7.967463575734413e-07, + "loss": 0.8374, + "step": 2095 + }, + { + "epoch": 4.8, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7095, + "step": 2096 + }, + { + "epoch": 4.8, + "learning_rate": 7.597405073476948e-07, + "loss": 0.7145, + "step": 2097 + }, + { + "epoch": 4.81, + "learning_rate": 7.415663142620654e-07, + "loss": 0.8099, + "step": 2098 + }, + { + "epoch": 4.81, + "learning_rate": 7.236113295385982e-07, + "loss": 0.7572, + "step": 2099 + }, + { + "epoch": 4.81, + "learning_rate": 7.05875592830163e-07, + "loss": 0.6466, + "step": 2100 + }, + { + "epoch": 4.81, + "learning_rate": 6.883591433054614e-07, + "loss": 0.7514, + "step": 2101 + }, + { + "epoch": 4.82, + "learning_rate": 6.710620196488604e-07, + "loss": 0.8009, + "step": 2102 + }, + { + "epoch": 4.82, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8409, + "step": 2103 + }, + { + "epoch": 4.82, + "learning_rate": 6.371259022556198e-07, + "loss": 0.7622, + "step": 2104 + }, + { + "epoch": 4.82, + "learning_rate": 6.204869834655625e-07, + "loss": 0.674, + "step": 2105 + }, + { + "epoch": 4.82, + "learning_rate": 6.040675404366258e-07, + "loss": 0.7014, + "step": 2106 + }, + { + "epoch": 4.83, + "learning_rate": 5.878676094305147e-07, + "loss": 0.7956, + "step": 2107 + }, + { + "epoch": 4.83, + "learning_rate": 5.718872262241215e-07, + "loss": 0.7478, + "step": 2108 + }, + { + "epoch": 4.83, + "learning_rate": 5.561264261095045e-07, + "loss": 0.7502, + "step": 2109 + }, + { + "epoch": 4.83, + "learning_rate": 5.405852438937764e-07, + "loss": 0.6969, + "step": 2110 + }, + { + "epoch": 4.84, + "learning_rate": 5.252637138990268e-07, + "loss": 0.9293, + "step": 2111 + }, + { + "epoch": 4.84, + "learning_rate": 5.101618699622668e-07, + "loss": 0.7855, + "step": 2112 + }, + { + "epoch": 4.84, + "learning_rate": 4.952797454353064e-07, + "loss": 0.6301, + "step": 2113 + }, + { + "epoch": 4.84, + 
"learning_rate": 4.80617373184744e-07, + "loss": 0.8037, + "step": 2114 + }, + { + "epoch": 4.85, + "learning_rate": 4.6617478559186633e-07, + "loss": 0.6548, + "step": 2115 + }, + { + "epoch": 4.85, + "learning_rate": 4.5195201455253687e-07, + "loss": 0.8039, + "step": 2116 + }, + { + "epoch": 4.85, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.6482, + "step": 2117 + }, + { + "epoch": 4.85, + "learning_rate": 4.241660472907749e-07, + "loss": 0.6428, + "step": 2118 + }, + { + "epoch": 4.85, + "learning_rate": 4.1060291243255613e-07, + "loss": 0.8192, + "step": 2119 + }, + { + "epoch": 4.86, + "learning_rate": 3.972597168562131e-07, + "loss": 0.7162, + "step": 2120 + }, + { + "epoch": 4.86, + "learning_rate": 3.841364900296518e-07, + "loss": 0.8107, + "step": 2121 + }, + { + "epoch": 4.86, + "learning_rate": 3.712332609350222e-07, + "loss": 0.9086, + "step": 2122 + }, + { + "epoch": 4.86, + "learning_rate": 3.585500580685852e-07, + "loss": 0.7639, + "step": 2123 + }, + { + "epoch": 4.87, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.8498, + "step": 2124 + }, + { + "epoch": 4.87, + "learning_rate": 3.338438425757651e-07, + "loss": 0.8228, + "step": 2125 + }, + { + "epoch": 4.87, + "learning_rate": 3.218208845120807e-07, + "loss": 0.707, + "step": 2126 + }, + { + "epoch": 4.87, + "learning_rate": 3.1001806180189776e-07, + "loss": 0.772, + "step": 2127 + }, + { + "epoch": 4.88, + "learning_rate": 2.984354005112766e-07, + "loss": 0.787, + "step": 2128 + }, + { + "epoch": 4.88, + "learning_rate": 2.870729262200889e-07, + "loss": 0.7106, + "step": 2129 + }, + { + "epoch": 4.88, + "learning_rate": 2.7593066402189504e-07, + "loss": 0.6522, + "step": 2130 + }, + { + "epoch": 4.88, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7751, + "step": 2131 + }, + { + "epoch": 4.88, + "learning_rate": 2.5430687384713215e-07, + "loss": 0.857, + "step": 2132 + }, + { + "epoch": 4.89, + "learning_rate": 2.438253936258517e-07, + "loss": 0.7069, + "step": 2133 + 
}, + { + "epoch": 4.89, + "learning_rate": 2.3356422100805353e-07, + "loss": 0.638, + "step": 2134 + }, + { + "epoch": 4.89, + "learning_rate": 2.2352337865514383e-07, + "loss": 0.7217, + "step": 2135 + }, + { + "epoch": 4.89, + "learning_rate": 2.137028887419068e-07, + "loss": 0.8347, + "step": 2136 + }, + { + "epoch": 4.9, + "learning_rate": 2.041027729565381e-07, + "loss": 0.6577, + "step": 2137 + }, + { + "epoch": 4.9, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8272, + "step": 2138 + }, + { + "epoch": 4.9, + "learning_rate": 1.8556374808853526e-07, + "loss": 0.7341, + "step": 2139 + }, + { + "epoch": 4.9, + "learning_rate": 1.7662487994862808e-07, + "loss": 0.7144, + "step": 2140 + }, + { + "epoch": 4.9, + "learning_rate": 1.679064678218989e-07, + "loss": 0.9138, + "step": 2141 + }, + { + "epoch": 4.91, + "learning_rate": 1.5940853096262365e-07, + "loss": 0.7085, + "step": 2142 + }, + { + "epoch": 4.91, + "learning_rate": 1.5113108813816778e-07, + "loss": 0.8218, + "step": 2143 + }, + { + "epoch": 4.91, + "learning_rate": 1.4307415762893074e-07, + "loss": 0.6645, + "step": 2144 + }, + { + "epoch": 4.91, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7422, + "step": 2145 + }, + { + "epoch": 4.92, + "learning_rate": 1.2762190424278063e-07, + "loss": 0.7858, + "step": 2146 + }, + { + "epoch": 4.92, + "learning_rate": 1.2022661549154769e-07, + "loss": 0.885, + "step": 2147 + }, + { + "epoch": 4.92, + "learning_rate": 1.1305190730686032e-07, + "loss": 0.7081, + "step": 2148 + }, + { + "epoch": 4.92, + "learning_rate": 1.0609779553377719e-07, + "loss": 0.864, + "step": 2149 + }, + { + "epoch": 4.93, + "learning_rate": 9.936429553017989e-08, + "loss": 0.7852, + "step": 2150 + }, + { + "epoch": 4.93, + "learning_rate": 9.285142216675091e-08, + "loss": 0.7688, + "step": 2151 + }, + { + "epoch": 4.93, + "learning_rate": 8.655918982689581e-08, + "loss": 0.8124, + "step": 2152 + }, + { + "epoch": 4.93, + "learning_rate": 8.048761240677661e-08, + "loss": 
0.8, + "step": 2153 + }, + { + "epoch": 4.93, + "learning_rate": 7.463670331523398e-08, + "loss": 0.948, + "step": 2154 + }, + { + "epoch": 4.94, + "learning_rate": 6.900647547376515e-08, + "loss": 0.9025, + "step": 2155 + }, + { + "epoch": 4.94, + "learning_rate": 6.359694131650162e-08, + "loss": 0.7678, + "step": 2156 + }, + { + "epoch": 4.94, + "learning_rate": 5.840811279020919e-08, + "loss": 0.7312, + "step": 2157 + }, + { + "epoch": 4.94, + "learning_rate": 5.344000135419913e-08, + "loss": 0.833, + "step": 2158 + }, + { + "epoch": 4.95, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7492, + "step": 2159 + }, + { + "epoch": 4.95, + "learning_rate": 4.416597315307636e-08, + "loss": 0.7587, + "step": 2160 + }, + { + "epoch": 4.95, + "learning_rate": 3.9860076869291384e-08, + "loss": 0.7138, + "step": 2161 + }, + { + "epoch": 4.95, + "learning_rate": 3.577493863841097e-08, + "loss": 0.5712, + "step": 2162 + }, + { + "epoch": 4.96, + "learning_rate": 3.191056748228505e-08, + "loss": 0.8738, + "step": 2163 + }, + { + "epoch": 4.96, + "learning_rate": 2.8266971935231313e-08, + "loss": 0.6495, + "step": 2164 + }, + { + "epoch": 4.96, + "learning_rate": 2.4844160043990817e-08, + "loss": 0.621, + "step": 2165 + }, + { + "epoch": 4.96, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8455, + "step": 2166 + }, + { + "epoch": 4.96, + "learning_rate": 1.8660916977919496e-08, + "loss": 0.7205, + "step": 2167 + }, + { + "epoch": 4.97, + "learning_rate": 1.5900499458543215e-08, + "loss": 0.8107, + "step": 2168 + }, + { + "epoch": 4.97, + "learning_rate": 1.3360892905844857e-08, + "loss": 0.827, + "step": 2169 + }, + { + "epoch": 4.97, + "learning_rate": 1.1042102928460196e-08, + "loss": 0.7905, + "step": 2170 + }, + { + "epoch": 4.97, + "learning_rate": 8.944134647326241e-09, + "loss": 0.6544, + "step": 2171 + }, + { + "epoch": 4.98, + "learning_rate": 7.066992695736741e-09, + "loss": 0.7825, + "step": 2172 + }, + { + "epoch": 4.98, + "learning_rate": 
5.410681219286673e-09, + "loss": 0.8077, + "step": 2173 + }, + { + "epoch": 4.98, + "learning_rate": 3.975203875861144e-09, + "loss": 0.7348, + "step": 2174 + }, + { + "epoch": 4.98, + "learning_rate": 2.760563835679797e-09, + "loss": 0.7083, + "step": 2175 + }, + { + "epoch": 4.99, + "learning_rate": 1.7667637812079917e-09, + "loss": 0.8476, + "step": 2176 + }, + { + "epoch": 4.99, + "learning_rate": 9.938059072123195e-10, + "loss": 0.7302, + "step": 2177 + }, + { + "epoch": 4.99, + "learning_rate": 4.416919207606007e-10, + "loss": 0.6682, + "step": 2178 + }, + { + "epoch": 4.99, + "learning_rate": 1.1042304115527202e-10, + "loss": 0.7609, + "step": 2179 + }, + { + "epoch": 4.99, + "learning_rate": 0.0, + "loss": 0.7523, + "step": 2180 + } + ], + "max_steps": 2180, + "num_train_epochs": 5, + "total_flos": 612326401605632.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2180/training_args.bin b/checkpoint-2180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4aa0907a784d65549a9c45257c4d455176479607 --- /dev/null +++ b/checkpoint-2180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adff180a74f6fc1e6a420417eadde6ef8ff75561e442f481bfe772c93f46e2ae +size 6011 diff --git a/checkpoint-2180/zero_to_fp32.py b/checkpoint-2180/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-2180/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) 
+ + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = 
torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def 
_zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + 
[full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + 
print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in 
param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + 
state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-436/README.md b/checkpoint-436/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/checkpoint-436/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-436/adapter_config.json b/checkpoint-436/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a764b53e4dc8b17af932aa1de32ced6a340469f0 --- /dev/null +++ b/checkpoint-436/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-436/adapter_model.bin b/checkpoint-436/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..635cba20c5f0cfe9d0d57e4012bd3878a40b9e25 --- /dev/null +++ b/checkpoint-436/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970aa638e9c93c61fcdcde9ff4bb46c99e4ac3c0a3202d7234e03fa7411d3503 +size 639786637 diff --git a/checkpoint-436/global_step436/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-436/global_step436/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..799ed87c98b15433d8acf3aaf3afade5fe82c772 --- /dev/null +++ b/checkpoint-436/global_step436/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d96c50c9da2c2ff3d5e752a494135c62f8c7190d24ef9d68f84139065f5ae8e5 +size 1022391865 diff --git a/checkpoint-436/global_step436/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-436/global_step436/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31d52e61fd8ebca5336a570307ae8dd1a781fb17 --- /dev/null +++ b/checkpoint-436/global_step436/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c76001008238664f532951b25dcf3714bfaa24531aa5a3d1c9b390e66d432f09 +size 1022391865 diff --git a/checkpoint-436/global_step436/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-436/global_step436/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5f109e13edc4f7f57336038021ee714868a6711 --- /dev/null +++ b/checkpoint-436/global_step436/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269f2cfb3a08d9d95731c7c27bfe694a8ffe21ecfdc3db73edc4439a615f4b9d +size 1022391865 diff --git a/checkpoint-436/global_step436/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-436/global_step436/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13b78a3535ff6769b36ae19ebe19b4e53c8b924f --- /dev/null +++ b/checkpoint-436/global_step436/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00bf09e278f097b207adefa842185c2eed4f3f29f2e1a6bcd36980a9eeb0f88 +size 1022391865 diff --git 
a/checkpoint-436/global_step436/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-436/global_step436/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2397875a80d5f90c5520f78ae96534416812440b --- /dev/null +++ b/checkpoint-436/global_step436/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5600359786d80c3968768f656c6b40f0470a46631113f7e86b68d2be6dfe6a29 +size 3521982567 diff --git a/checkpoint-436/global_step436/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-436/global_step436/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb1fb60668a3c07ae5ab20926ff76d2d6ff49ddc --- /dev/null +++ b/checkpoint-436/global_step436/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acfcb80f7597ec3cb1ea3dc68f29068863cfb82f5561a4a287af217c8220f217 +size 3521982567 diff --git a/checkpoint-436/global_step436/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-436/global_step436/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23c087f405c9809c9d2822839de9770f14545492 --- /dev/null +++ b/checkpoint-436/global_step436/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2731564945b6cf184abdcdeb2d7af1a6e59f390d522e1b9c97d47bfd401786a6 +size 3521982567 diff --git a/checkpoint-436/global_step436/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-436/global_step436/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42342ab462b8023ac553f11987199048518207f7 --- /dev/null +++ b/checkpoint-436/global_step436/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a4403d832f9aa4f022fd2ea60a12b0120532ee8922ad5603d76e913848e23f4e +size 3521982567 diff --git a/checkpoint-436/latest b/checkpoint-436/latest new file mode 100644 index 0000000000000000000000000000000000000000..1d7b384c9e8423d824aa564819f6a5e23fa060c3 --- /dev/null +++ b/checkpoint-436/latest @@ -0,0 +1 @@ +global_step436 \ No newline at end of file diff --git a/checkpoint-436/rng_state_0.pth b/checkpoint-436/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf088ddabdd541117b860667517628b5060c6fb8 --- /dev/null +++ b/checkpoint-436/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be3dd4b881c4e0e791a0b434719b360a1293ca625fc6b885f61d81b3106002cb +size 17655 diff --git a/checkpoint-436/rng_state_1.pth b/checkpoint-436/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbe46678ab7cc2bc98ef4715f5f29afa05bcae17 --- /dev/null +++ b/checkpoint-436/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39b0225bb93d65f5a46c8d8fada8da0256f618b20e445604bec78f894d259a57 +size 17655 diff --git a/checkpoint-436/rng_state_2.pth b/checkpoint-436/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..752e37a9ce22f451cf86562a3da3f31c76fb7f5e --- /dev/null +++ b/checkpoint-436/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d51c84a839e8c16117b96ffecdb4d4560a9256838c64408e1856f08c010db0d +size 17655 diff --git a/checkpoint-436/rng_state_3.pth b/checkpoint-436/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..293fd79f9acd1ef94a073ac939582bb14edea55e --- /dev/null +++ b/checkpoint-436/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92d96e84121ecb4d05582bf67e3314b107b2f6c4495f55c9c43949b88f2fdb4 +size 17655 diff --git a/checkpoint-436/special_tokens_map.json b/checkpoint-436/special_tokens_map.json 
new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-436/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-436/tokenizer.model b/checkpoint-436/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-436/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-436/tokenizer_config.json b/checkpoint-436/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..740756b4bef305e27d0bb4d2e1a40dd8847797f7 --- /dev/null +++ b/checkpoint-436/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 2048, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-436/trainer_state.json 
b/checkpoint-436/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..feec0503ce452216884aa83e2610646d15c5f744 --- /dev/null +++ b/checkpoint-436/trainer_state.json @@ -0,0 +1,2632 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9988545246277205, + "global_step": 436, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.946, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.060606060606061e-06, + "loss": 1.908, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1083, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.2121212121212122e-05, + "loss": 2.3218, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.8338, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.1212121212121215e-05, + "loss": 2.1332, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.8593, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.5359, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7252, + "step": 11 + }, + { + "epoch": 0.03, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.4351, + "step": 12 + }, + { + "epoch": 0.03, + "learning_rate": 3.939393939393939e-05, + "loss": 1.2774, + "step": 13 + }, + { + "epoch": 0.03, + "learning_rate": 4.242424242424243e-05, + "loss": 1.5145, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1529, + "step": 15 + }, + { + "epoch": 0.04, + "learning_rate": 4.848484848484849e-05, + 
"loss": 1.0047, + "step": 16 + }, + { + "epoch": 0.04, + "learning_rate": 5.151515151515152e-05, + "loss": 1.3872, + "step": 17 + }, + { + "epoch": 0.04, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1229, + "step": 18 + }, + { + "epoch": 0.04, + "learning_rate": 5.757575757575758e-05, + "loss": 1.3386, + "step": 19 + }, + { + "epoch": 0.05, + "learning_rate": 6.060606060606061e-05, + "loss": 1.2493, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1427, + "step": 21 + }, + { + "epoch": 0.05, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0895, + "step": 22 + }, + { + "epoch": 0.05, + "learning_rate": 6.96969696969697e-05, + "loss": 1.1989, + "step": 23 + }, + { + "epoch": 0.05, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0438, + "step": 24 + }, + { + "epoch": 0.06, + "learning_rate": 7.575757575757576e-05, + "loss": 1.176, + "step": 25 + }, + { + "epoch": 0.06, + "learning_rate": 7.878787878787879e-05, + "loss": 1.1372, + "step": 26 + }, + { + "epoch": 0.06, + "learning_rate": 8.181818181818183e-05, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.06, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9371, + "step": 28 + }, + { + "epoch": 0.07, + "learning_rate": 8.787878787878789e-05, + "loss": 1.2299, + "step": 29 + }, + { + "epoch": 0.07, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9441, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0011, + "step": 31 + }, + { + "epoch": 0.07, + "learning_rate": 9.696969696969698e-05, + "loss": 1.1704, + "step": 32 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 1.1193, + "step": 33 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010303030303030303, + "loss": 1.1559, + "step": 34 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010606060606060606, + "loss": 0.8677, + "step": 35 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010909090909090909, + "loss": 1.0865, + "step": 36 + 
}, + { + "epoch": 0.08, + "learning_rate": 0.00011212121212121212, + "loss": 1.0922, + "step": 37 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011515151515151516, + "loss": 0.9434, + "step": 38 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001181818181818182, + "loss": 0.9144, + "step": 39 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012121212121212122, + "loss": 0.9546, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012424242424242425, + "loss": 1.0654, + "step": 41 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012727272727272728, + "loss": 0.8077, + "step": 42 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001303030303030303, + "loss": 1.0758, + "step": 43 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013333333333333334, + "loss": 1.1512, + "step": 44 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013636363636363637, + "loss": 0.84, + "step": 45 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001393939393939394, + "loss": 1.0567, + "step": 46 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014242424242424243, + "loss": 1.0165, + "step": 47 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014545454545454546, + "loss": 0.8678, + "step": 48 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014848484848484849, + "loss": 1.055, + "step": 49 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015151515151515152, + "loss": 1.0669, + "step": 50 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015454545454545454, + "loss": 0.9915, + "step": 51 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015757575757575757, + "loss": 0.993, + "step": 52 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001606060606060606, + "loss": 1.1085, + "step": 53 + }, + { + "epoch": 0.12, + "learning_rate": 0.00016363636363636366, + "loss": 0.9391, + "step": 54 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001666666666666667, + "loss": 0.975, + "step": 55 + }, + { + "epoch": 0.13, + "learning_rate": 0.00016969696969696972, + "loss": 1.0697, + "step": 56 + }, + { + 
"epoch": 0.13, + "learning_rate": 0.00017272727272727275, + "loss": 0.9462, + "step": 57 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017575757575757578, + "loss": 1.1209, + "step": 58 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001787878787878788, + "loss": 1.0648, + "step": 59 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018181818181818183, + "loss": 0.9964, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018484848484848484, + "loss": 0.8451, + "step": 61 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001878787878787879, + "loss": 0.8437, + "step": 62 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019090909090909092, + "loss": 1.1271, + "step": 63 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019393939393939395, + "loss": 1.161, + "step": 64 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019696969696969698, + "loss": 1.0032, + "step": 65 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999988957695886, + "loss": 0.9543, + "step": 67 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999955830807923, + "loss": 1.0274, + "step": 68 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999900619409279, + "loss": 0.9334, + "step": 69 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999982332362188, + "loss": 1.0398, + "step": 70 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999723943616433, + "loss": 0.9049, + "step": 71 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999602479612417, + "loss": 0.7452, + "step": 72 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999458931878073, + "loss": 0.8762, + "step": 73 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999293300730427, + "loss": 1.0941, + "step": 74 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999105586535268, + "loss": 0.7713, + "step": 75 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998895789707154, + "loss": 0.9233, + "step": 76 + }, + { + "epoch": 0.18, + 
"learning_rate": 0.00019998663910709416, + "loss": 0.8634, + "step": 77 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998409950054146, + "loss": 0.9697, + "step": 78 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998133908302209, + "loss": 1.0816, + "step": 79 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001999783578606323, + "loss": 0.9659, + "step": 80 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997515583995603, + "loss": 0.9644, + "step": 81 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997173302806478, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996808943251773, + "loss": 1.0016, + "step": 83 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999642250613616, + "loss": 0.8951, + "step": 84 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996013992313073, + "loss": 1.0157, + "step": 85 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995583402684694, + "loss": 0.9414, + "step": 86 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995130738201966, + "loss": 0.8097, + "step": 87 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019994655999864582, + "loss": 0.8606, + "step": 88 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001999415918872098, + "loss": 1.0427, + "step": 89 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993640305868352, + "loss": 0.9578, + "step": 90 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993099352452623, + "loss": 1.1097, + "step": 91 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019992536329668478, + "loss": 0.8119, + "step": 92 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019991951238759325, + "loss": 0.9915, + "step": 93 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001999134408101731, + "loss": 0.838, + "step": 94 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990714857783326, + "loss": 0.8935, + "step": 95 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990063570446984, + "loss": 0.7914, + "step": 96 + }, + { + "epoch": 0.22, + 
"learning_rate": 0.00019989390220446622, + "loss": 0.8724, + "step": 97 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019988694809269314, + "loss": 1.0374, + "step": 98 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987977338450845, + "loss": 0.9028, + "step": 99 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987237809575723, + "loss": 0.9986, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019986476224277165, + "loss": 1.113, + "step": 101 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985692584237108, + "loss": 0.8395, + "step": 102 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019984886891186184, + "loss": 1.0134, + "step": 103 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001998405914690374, + "loss": 0.8845, + "step": 104 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019983209353217812, + "loss": 0.7507, + "step": 105 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019982337512005138, + "loss": 0.9073, + "step": 106 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019981443625191148, + "loss": 0.9973, + "step": 107 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019980527694749952, + "loss": 1.0733, + "step": 108 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019979589722704346, + "loss": 0.9148, + "step": 109 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019978629711125812, + "loss": 0.8385, + "step": 110 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019977647662134488, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019976643577899195, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019975617460637416, + "loss": 0.8754, + "step": 113 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001997456931261529, + "loss": 0.8886, + "step": 114 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019973499136147606, + "loss": 1.0058, + "step": 115 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019972406933597812, + "loss": 0.9276, + "step": 116 + }, + { + 
"epoch": 0.27, + "learning_rate": 0.00019971292707377991, + "loss": 0.9922, + "step": 117 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019970156459948873, + "loss": 0.9507, + "step": 118 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996899819381981, + "loss": 0.9619, + "step": 119 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019967817911548794, + "loss": 0.8163, + "step": 120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019966615615742424, + "loss": 1.0647, + "step": 121 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996539130905593, + "loss": 0.9348, + "step": 122 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019964144994193142, + "loss": 1.0523, + "step": 123 + }, + { + "epoch": 0.28, + "learning_rate": 0.000199628766739065, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019961586350997033, + "loss": 1.0227, + "step": 125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001996027402831438, + "loss": 1.006, + "step": 126 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958939708756746, + "loss": 0.9082, + "step": 127 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957583395270923, + "loss": 0.8756, + "step": 128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995620509085228, + "loss": 0.8311, + "step": 129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954804798544745, + "loss": 1.0332, + "step": 130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019953382521440815, + "loss": 0.9427, + "step": 131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019951938262681527, + "loss": 0.838, + "step": 132 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995047202545647, + "loss": 0.8509, + "step": 133 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019948983813003774, + "loss": 0.8944, + "step": 134 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019947473628610099, + "loss": 0.9569, + "step": 135 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019945941475610623, + "loss": 0.7805, + "step": 136 + }, 
+ { + "epoch": 0.31, + "learning_rate": 0.00019944387357389052, + "loss": 0.9337, + "step": 137 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994281127737759, + "loss": 0.8712, + "step": 138 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994121323905695, + "loss": 0.9264, + "step": 139 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001993959324595634, + "loss": 0.9323, + "step": 140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019937951301653444, + "loss": 0.8331, + "step": 141 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993628740977444, + "loss": 0.902, + "step": 142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993460157399396, + "loss": 0.8676, + "step": 143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019932893798035116, + "loss": 0.8525, + "step": 144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019931164085669456, + "loss": 0.8571, + "step": 145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019929412440716985, + "loss": 1.0006, + "step": 146 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019927638867046142, + "loss": 0.9849, + "step": 147 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019925843368573794, + "loss": 0.9064, + "step": 148 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992402594926523, + "loss": 0.9716, + "step": 149 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992218661313415, + "loss": 0.7553, + "step": 150 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019920325364242654, + "loss": 0.7921, + "step": 151 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019918442206701245, + "loss": 0.7994, + "step": 152 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001991653714466879, + "loss": 0.8296, + "step": 153 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019914610182352548, + "loss": 0.8116, + "step": 154 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019912661324008148, + "loss": 0.9844, + "step": 155 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019910690573939557, + "loss": 0.865, + 
"step": 156 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019908697936499103, + "loss": 0.959, + "step": 157 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019906683416087448, + "loss": 0.7727, + "step": 158 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019904647017153582, + "loss": 0.707, + "step": 159 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019902588744194813, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019900508601756756, + "loss": 0.9146, + "step": 161 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989840659443332, + "loss": 0.9571, + "step": 162 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989628272686671, + "loss": 0.8537, + "step": 163 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019894137003747403, + "loss": 0.828, + "step": 164 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019891969429814145, + "loss": 0.8055, + "step": 165 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988978000985394, + "loss": 0.8432, + "step": 166 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988756874870203, + "loss": 0.8101, + "step": 167 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019885335651241903, + "loss": 0.9072, + "step": 168 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001988308072240527, + "loss": 0.7862, + "step": 169 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019880803967172047, + "loss": 0.8303, + "step": 170 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019878505390570362, + "loss": 0.9489, + "step": 171 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001987618499767653, + "loss": 1.0125, + "step": 172 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001987384279361505, + "loss": 0.809, + "step": 173 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019871478783558587, + "loss": 0.9488, + "step": 174 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986909297272796, + "loss": 0.9664, + "step": 175 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986668536639215, + "loss": 
0.9657, + "step": 176 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001986425596986825, + "loss": 0.8123, + "step": 177 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019861804788521493, + "loss": 0.9482, + "step": 178 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019859331827765212, + "loss": 0.879, + "step": 179 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019856837093060848, + "loss": 0.896, + "step": 180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019854320589917927, + "loss": 1.0729, + "step": 181 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019851782323894042, + "loss": 0.9844, + "step": 182 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001984922230059486, + "loss": 0.9131, + "step": 183 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019846640525674082, + "loss": 0.9417, + "step": 184 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019844037004833473, + "loss": 0.9633, + "step": 185 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001984141174382279, + "loss": 0.968, + "step": 186 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019838764748439827, + "loss": 0.8447, + "step": 187 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019836096024530373, + "loss": 0.8638, + "step": 188 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019833405577988195, + "loss": 0.9346, + "step": 189 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001983069341475504, + "loss": 0.8969, + "step": 190 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019827959540820613, + "loss": 0.8499, + "step": 191 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019825203962222572, + "loss": 0.8041, + "step": 192 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019822426685046497, + "loss": 0.9216, + "step": 193 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019819627715425903, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198168070595422, + "loss": 0.8969, + "step": 195 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198139647236247, + 
"loss": 0.7949, + "step": 196 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019811100713950587, + "loss": 0.8996, + "step": 197 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019808215036844917, + "loss": 0.9118, + "step": 198 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001980530769868059, + "loss": 0.7355, + "step": 199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019802378705878354, + "loss": 0.8344, + "step": 200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019799428064906774, + "loss": 0.9639, + "step": 201 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001979645578228222, + "loss": 0.852, + "step": 202 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001979346186456887, + "loss": 0.8493, + "step": 203 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019790446318378665, + "loss": 0.851, + "step": 204 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019787409150371328, + "loss": 0.7161, + "step": 205 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019784350367254322, + "loss": 0.9846, + "step": 206 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001978126997578285, + "loss": 0.7883, + "step": 207 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019778167982759833, + "loss": 0.8691, + "step": 208 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019775044395035907, + "loss": 0.928, + "step": 209 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001977189921950939, + "loss": 0.8244, + "step": 210 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001976873246312628, + "loss": 1.0413, + "step": 211 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976554413288023, + "loss": 0.8261, + "step": 212 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976233423581255, + "loss": 0.823, + "step": 213 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019759102779012166, + "loss": 0.9386, + "step": 214 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019755849769615628, + "loss": 0.8156, + "step": 215 + }, + { + "epoch": 0.49, + "learning_rate": 
0.00019752575214807076, + "loss": 0.8556, + "step": 216 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019749279121818235, + "loss": 0.7769, + "step": 217 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019745961497928406, + "loss": 1.0772, + "step": 218 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019742622350464418, + "loss": 0.8147, + "step": 219 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001973926168680066, + "loss": 0.9529, + "step": 220 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019735879514359018, + "loss": 0.8688, + "step": 221 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019732475840608888, + "loss": 0.9647, + "step": 222 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019729050673067156, + "loss": 0.837, + "step": 223 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019725604019298163, + "loss": 0.9211, + "step": 224 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019722135886913715, + "loss": 0.9434, + "step": 225 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001971864628357304, + "loss": 0.6506, + "step": 226 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019715135216982798, + "loss": 0.8052, + "step": 227 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019711602694897037, + "loss": 0.7852, + "step": 228 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019708048725117192, + "loss": 0.9283, + "step": 229 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001970447331549207, + "loss": 0.9081, + "step": 230 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019700876473917824, + "loss": 0.9036, + "step": 231 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019697258208337934, + "loss": 0.716, + "step": 232 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019693618526743197, + "loss": 0.8192, + "step": 233 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968995743717171, + "loss": 0.9773, + "step": 234 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019686274947708848, + "loss": 0.8698, + "step": 235 + }, + { + "epoch": 0.54, + 
"learning_rate": 0.0001968257106648724, + "loss": 0.9062, + "step": 236 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019678845801686764, + "loss": 0.8984, + "step": 237 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019675099161534521, + "loss": 0.8087, + "step": 238 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019671331154304822, + "loss": 0.8272, + "step": 239 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019667541788319162, + "loss": 0.784, + "step": 240 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019663731071946206, + "loss": 0.8777, + "step": 241 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019659899013601772, + "loss": 0.8534, + "step": 242 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019656045621748808, + "loss": 0.9645, + "step": 243 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019652170904897387, + "loss": 0.9692, + "step": 244 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019648274871604662, + "loss": 0.838, + "step": 245 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019644357530474872, + "loss": 0.7445, + "step": 246 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001964041889015931, + "loss": 0.9065, + "step": 247 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019636458959356316, + "loss": 0.7806, + "step": 248 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019632477746811232, + "loss": 0.7971, + "step": 249 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019628475261316417, + "loss": 0.8409, + "step": 250 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019624451511711198, + "loss": 0.7432, + "step": 251 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019620406506881875, + "loss": 0.9096, + "step": 252 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019616340255761676, + "loss": 0.8004, + "step": 253 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019612252767330763, + "loss": 0.7978, + "step": 254 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001960814405061619, + "loss": 0.9535, + "step": 255 + }, + { + 
"epoch": 0.59, + "learning_rate": 0.000196040141146919, + "loss": 0.9945, + "step": 256 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001959986296867869, + "loss": 0.9703, + "step": 257 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019595690621744208, + "loss": 0.9639, + "step": 258 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019591497083102914, + "loss": 0.9312, + "step": 259 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019587282362016083, + "loss": 0.7709, + "step": 260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001958304646779175, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019578789409784727, + "loss": 0.8081, + "step": 262 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019574511197396563, + "loss": 0.8476, + "step": 263 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019570211840075517, + "loss": 0.9658, + "step": 264 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019565891347316552, + "loss": 0.7778, + "step": 265 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001956154972866131, + "loss": 0.9926, + "step": 266 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001955718699369808, + "loss": 0.957, + "step": 267 + }, + { + "epoch": 0.61, + "learning_rate": 0.000195528031520618, + "loss": 0.9396, + "step": 268 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019548398213434007, + "loss": 0.9049, + "step": 269 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019543972187542833, + "loss": 0.9683, + "step": 270 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019539525084162992, + "loss": 0.8555, + "step": 271 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019535056913115725, + "loss": 0.8489, + "step": 272 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001953056768426882, + "loss": 0.8728, + "step": 273 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019526057407536564, + "loss": 0.9443, + "step": 274 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019521526092879725, + "loss": 0.8161, + "step": 275 + }, 
+ { + "epoch": 0.63, + "learning_rate": 0.00019516973750305532, + "loss": 0.8936, + "step": 276 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019512400389867657, + "loss": 0.8315, + "step": 277 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019507806021666188, + "loss": 0.9298, + "step": 278 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019503190655847604, + "loss": 0.8235, + "step": 279 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019498554302604766, + "loss": 0.9245, + "step": 280 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001949389697217687, + "loss": 0.8302, + "step": 281 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019489218674849455, + "loss": 0.8488, + "step": 282 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019484519420954354, + "loss": 0.8177, + "step": 283 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019479799220869682, + "loss": 1.0039, + "step": 284 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019475058085019825, + "loss": 0.7685, + "step": 285 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019470296023875387, + "loss": 0.9174, + "step": 286 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194655130479532, + "loss": 1.0997, + "step": 287 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019460709167816274, + "loss": 0.9759, + "step": 288 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001945588439407379, + "loss": 0.9397, + "step": 289 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019451038737381077, + "loss": 1.0367, + "step": 290 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019446172208439574, + "loss": 0.8298, + "step": 291 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001944128481799682, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019436376576846423, + "loss": 1.1234, + "step": 293 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431447495828045, + "loss": 0.9103, + "step": 294 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001942649758582737, + "loss": 0.7841, + 
"step": 295 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019421526857776072, + "loss": 0.8817, + "step": 296 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416535322651818, + "loss": 1.0682, + "step": 297 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019411522991478214, + "loss": 0.9201, + "step": 298 + }, + { + "epoch": 0.68, + "learning_rate": 0.000194064898753248, + "loss": 4.1834, + "step": 299 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019401435985307012, + "loss": 1.0391, + "step": 300 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019396361332586166, + "loss": 2.5015, + "step": 301 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001939126592836944, + "loss": 0.7927, + "step": 302 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001938614978390983, + "loss": 2.2345, + "step": 303 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019381012910506146, + "loss": 0.9311, + "step": 304 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019375855319502962, + "loss": 0.9713, + "step": 305 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019370677022290624, + "loss": 0.8967, + "step": 306 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019365478030305196, + "loss": 3.095, + "step": 307 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001936025835502845, + "loss": 1.1008, + "step": 308 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001935501800798783, + "loss": 1.5409, + "step": 309 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019349757000756444, + "loss": 1.02, + "step": 310 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019344475344953012, + "loss": 1.0101, + "step": 311 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001933917305224187, + "loss": 0.7686, + "step": 312 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001933385013433292, + "loss": 1.1061, + "step": 313 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932850660298162, + "loss": 0.8083, + "step": 314 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932314246998895, + "loss": 
1.1942, + "step": 315 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019317757747201384, + "loss": 0.8551, + "step": 316 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019312352446510878, + "loss": 0.9049, + "step": 317 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019306926579854821, + "loss": 0.7072, + "step": 318 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019301480159216028, + "loss": 0.8552, + "step": 319 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019296013196622706, + "loss": 0.8414, + "step": 320 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001929052570414843, + "loss": 0.9198, + "step": 321 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019285017693912107, + "loss": 2.1953, + "step": 322 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019279489178077969, + "loss": 0.851, + "step": 323 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019273940168855518, + "loss": 1.0239, + "step": 324 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019268370678499533, + "loss": 1.5125, + "step": 325 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019262780719310008, + "loss": 0.9171, + "step": 326 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019257170303632148, + "loss": 0.9794, + "step": 327 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019251539443856344, + "loss": 0.9023, + "step": 328 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019245888152418124, + "loss": 1.058, + "step": 329 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019240216441798142, + "loss": 0.9411, + "step": 330 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001923452432452215, + "loss": 1.197, + "step": 331 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922881181316097, + "loss": 0.9253, + "step": 332 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922307892033046, + "loss": 1.156, + "step": 333 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019217325658691482, + "loss": 0.9424, + "step": 334 + }, + { + "epoch": 0.77, + "learning_rate": 
0.00019211552040949891, + "loss": 1.1147, + "step": 335 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019205758079856498, + "loss": 0.8528, + "step": 336 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001919994378820704, + "loss": 0.8105, + "step": 337 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019194109178842153, + "loss": 0.9279, + "step": 338 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019188254264647337, + "loss": 0.9231, + "step": 339 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019182379058552948, + "loss": 1.0425, + "step": 340 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019176483573534142, + "loss": 0.8794, + "step": 341 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019170567822610873, + "loss": 0.9873, + "step": 342 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001916463181884784, + "loss": 0.8146, + "step": 343 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019158675575354478, + "loss": 1.027, + "step": 344 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019152699105284913, + "loss": 0.8093, + "step": 345 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001914670242183795, + "loss": 0.951, + "step": 346 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019140685538257028, + "loss": 0.9268, + "step": 347 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019134648467830198, + "loss": 1.0205, + "step": 348 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019128591223890092, + "loss": 0.9043, + "step": 349 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019122513819813902, + "loss": 0.7387, + "step": 350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001911641626902333, + "loss": 0.9422, + "step": 351 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019110298584984578, + "loss": 0.9015, + "step": 352 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001910416078120832, + "loss": 0.7522, + "step": 353 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019098002871249646, + "loss": 0.9722, + "step": 354 + }, + { + "epoch": 0.81, + 
"learning_rate": 0.0001909182486870806, + "loss": 0.8358, + "step": 355 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019085626787227443, + "loss": 0.9859, + "step": 356 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019079408640496013, + "loss": 0.7796, + "step": 357 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019073170442246302, + "loss": 0.8617, + "step": 358 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906691220625513, + "loss": 0.7727, + "step": 359 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906063394634356, + "loss": 0.8786, + "step": 360 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001905433567637689, + "loss": 0.9117, + "step": 361 + }, + { + "epoch": 0.83, + "learning_rate": 0.000190480174102646, + "loss": 0.9182, + "step": 362 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001904167916196033, + "loss": 0.9706, + "step": 363 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001903532094546186, + "loss": 0.8036, + "step": 364 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001902894277481105, + "loss": 0.902, + "step": 365 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019022544664093854, + "loss": 0.9231, + "step": 366 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019016126627440237, + "loss": 0.9751, + "step": 367 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001900968867902419, + "loss": 0.8373, + "step": 368 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001900323083306367, + "loss": 0.8695, + "step": 369 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001899675310382057, + "loss": 0.8654, + "step": 370 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018990255505600706, + "loss": 0.98, + "step": 371 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018983738052753767, + "loss": 0.7454, + "step": 372 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018977200759673295, + "loss": 0.829, + "step": 373 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018970643640796642, + "loss": 0.8262, + "step": 374 + }, + { + "epoch": 
0.86, + "learning_rate": 0.0001896406671060495, + "loss": 1.0659, + "step": 375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018957469983623112, + "loss": 0.8551, + "step": 376 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018950853474419742, + "loss": 0.7991, + "step": 377 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001894421719760714, + "loss": 0.8662, + "step": 378 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018937561167841263, + "loss": 0.8817, + "step": 379 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018930885399821693, + "loss": 1.0894, + "step": 380 + }, + { + "epoch": 0.87, + "learning_rate": 0.000189241899082916, + "loss": 0.8225, + "step": 381 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018917474708037718, + "loss": 0.9065, + "step": 382 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018910739813890302, + "loss": 0.8779, + "step": 383 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018903985240723104, + "loss": 0.7909, + "step": 384 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018897211003453328, + "loss": 0.7649, + "step": 385 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018890417117041619, + "loss": 0.9788, + "step": 386 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018883603596492004, + "loss": 0.938, + "step": 387 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018876770456851877, + "loss": 0.9032, + "step": 388 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018869917713211964, + "loss": 0.9059, + "step": 389 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018863045380706274, + "loss": 0.8896, + "step": 390 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001885615347451209, + "loss": 0.7614, + "step": 391 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884924200984991, + "loss": 0.978, + "step": 392 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884231100198344, + "loss": 0.9406, + "step": 393 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018835360466219533, + "loss": 0.7555, + "step": 394 + }, + { + 
"epoch": 0.9, + "learning_rate": 0.0001882839041790818, + "loss": 0.9049, + "step": 395 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018821400872442458, + "loss": 0.7041, + "step": 396 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018814391845258505, + "loss": 0.8995, + "step": 397 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001880736335183548, + "loss": 0.7461, + "step": 398 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018800315407695539, + "loss": 0.9954, + "step": 399 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018793248028403788, + "loss": 0.9035, + "step": 400 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001878616122956826, + "loss": 0.9083, + "step": 401 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018779055026839868, + "loss": 0.7286, + "step": 402 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001877192943591239, + "loss": 0.8001, + "step": 403 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018764784472522403, + "loss": 0.8795, + "step": 404 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001875762015244929, + "loss": 0.8912, + "step": 405 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018750436491515163, + "loss": 0.8848, + "step": 406 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018743233505584862, + "loss": 0.8512, + "step": 407 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018736011210565898, + "loss": 0.8537, + "step": 408 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018728769622408423, + "loss": 0.8777, + "step": 409 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018721508757105202, + "loss": 0.7849, + "step": 410 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018714228630691576, + "loss": 0.9669, + "step": 411 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001870692925924541, + "loss": 0.9299, + "step": 412 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018699610658887088, + "loss": 1.0188, + "step": 413 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018692272845779448, + "loss": 0.8388, + "step": 
414 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018684915836127765, + "loss": 0.7904, + "step": 415 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018677539646179707, + "loss": 0.9689, + "step": 416 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018670144292225297, + "loss": 0.7339, + "step": 417 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018662729790596888, + "loss": 0.7894, + "step": 418 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018655296157669117, + "loss": 0.7163, + "step": 419 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018647843409858869, + "loss": 0.8642, + "step": 420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018640371563625246, + "loss": 0.9281, + "step": 421 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018632880635469526, + "loss": 0.834, + "step": 422 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018625370641935129, + "loss": 0.7316, + "step": 423 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018617841599607586, + "loss": 0.8504, + "step": 424 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018610293525114492, + "loss": 0.8731, + "step": 425 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018602726435125474, + "loss": 0.8803, + "step": 426 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001859514034635215, + "loss": 0.8417, + "step": 427 + }, + { + "epoch": 0.98, + "learning_rate": 0.000185875352755481, + "loss": 0.8947, + "step": 428 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018579911239508827, + "loss": 0.8368, + "step": 429 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018572268255071718, + "loss": 0.8231, + "step": 430 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018564606339116, + "loss": 0.8576, + "step": 431 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001855692550856272, + "loss": 0.8753, + "step": 432 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018549225780374685, + "loss": 0.7778, + "step": 433 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018541507171556445, + "loss": 
0.7516, + "step": 434 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001853376969915425, + "loss": 0.7466, + "step": 435 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018526013380255999, + "loss": 0.917, + "step": 436 + } + ], + "max_steps": 2180, + "num_train_epochs": 5, + "total_flos": 140002887892992.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-436/training_args.bin b/checkpoint-436/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4aa0907a784d65549a9c45257c4d455176479607 --- /dev/null +++ b/checkpoint-436/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adff180a74f6fc1e6a420417eadde6ef8ff75561e442f481bfe772c93f46e2ae +size 6011 diff --git a/checkpoint-436/zero_to_fp32.py b/checkpoint-436/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-436/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint 
folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} 
params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 
2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if 
debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + 
print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, 
fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-873/README.md b/checkpoint-873/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/checkpoint-873/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/checkpoint-873/adapter_config.json b/checkpoint-873/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a764b53e4dc8b17af932aa1de32ced6a340469f0 --- /dev/null +++ b/checkpoint-873/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-873/adapter_model.bin b/checkpoint-873/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..47996048ae242a456b0319f67081e8b4154cbf57 --- /dev/null +++ b/checkpoint-873/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1bdc6f01cc6e2ce347abd4d25e4c2ba0c85c280a9e5314513ef171d7814780 +size 639786637 diff --git a/checkpoint-873/global_step873/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-873/global_step873/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8aab01e2c33f9aab307e900ede9872646d8fd1b6 --- /dev/null +++ b/checkpoint-873/global_step873/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18afc248ff9cfe4f883b27b040dc7b0c81c0f7f5a4729913240f03b345c0dec +size 1022391865 diff --git a/checkpoint-873/global_step873/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-873/global_step873/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27a202e0eaf4c34d6bcfabc9bbb18ff46c0d29d7 --- /dev/null +++ b/checkpoint-873/global_step873/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84dfcd1939805d65f095c4224f8c693fc69396b7270f16ededed44129590157a +size 1022391865 diff --git a/checkpoint-873/global_step873/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-873/global_step873/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42dd1584a73071006cff861886291fb795a0f4f1 --- /dev/null +++ b/checkpoint-873/global_step873/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff0e0215ba960a378cf4595bff6f48490a532994fdf616dfed72ccc26f659ed +size 1022391865 diff --git a/checkpoint-873/global_step873/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-873/global_step873/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9c76aaf167eb23d4efead9991c98419f2e75ff --- /dev/null +++ b/checkpoint-873/global_step873/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a9cafd478f2a7d69c2f4d773c7b75e27864c9c869d73d4ca05397b1c3ead24a +size 1022391865 diff --git 
a/checkpoint-873/global_step873/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-873/global_step873/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18b2f3a3dcf89c5ea5493e1debc4260f4628a605 --- /dev/null +++ b/checkpoint-873/global_step873/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3449d3a11b23fb119eb53d0bf21427d327c8ac0cbdcbe265afdd054dc11ec4 +size 3521982567 diff --git a/checkpoint-873/global_step873/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-873/global_step873/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e68185970446d6b9bdc4663d9bb6aa4ed5ccacf --- /dev/null +++ b/checkpoint-873/global_step873/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c205e0755be564d54a751e858661ddd9edc0c9c771695041f3624393f17314db +size 3521982567 diff --git a/checkpoint-873/global_step873/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-873/global_step873/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71290c1fba591407fe6f7c9d6aea1e7752823c7b --- /dev/null +++ b/checkpoint-873/global_step873/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe2577d4a157efb5f992cad761bb2e7bb41d1c8b9548e154a2f91e2bcb2241d +size 3521982567 diff --git a/checkpoint-873/global_step873/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-873/global_step873/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ca68b0160e4c61d8f185c9da78d26a472b26dd3 --- /dev/null +++ b/checkpoint-873/global_step873/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fc2a189fd302cdaa278cc717151ca0ca8518c8fe5eb98b488f1b3fd0807f9504 +size 3521982567 diff --git a/checkpoint-873/latest b/checkpoint-873/latest new file mode 100644 index 0000000000000000000000000000000000000000..5d08f082c236d25860948ca0f119fafb9d409975 --- /dev/null +++ b/checkpoint-873/latest @@ -0,0 +1 @@ +global_step873 \ No newline at end of file diff --git a/checkpoint-873/rng_state_0.pth b/checkpoint-873/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..81b3bc7ef96d9896d5856f9dd44c0bdb7185a41d --- /dev/null +++ b/checkpoint-873/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a82e4035a6e4776d8afccc6a91285aa29289b92f7e0f5d62bff3033053ba4bc +size 17655 diff --git a/checkpoint-873/rng_state_1.pth b/checkpoint-873/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f5932bd1299448996bfc06416d1865f610ca40af --- /dev/null +++ b/checkpoint-873/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464639b5ea6bdd5a49eefc704152c76c70cda636434218c145d87ad620c1e8a5 +size 17655 diff --git a/checkpoint-873/rng_state_2.pth b/checkpoint-873/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..47fe0cbd6cbea1f2dbc28ed688d32683140c518a --- /dev/null +++ b/checkpoint-873/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1d50037b5754526a72bbe35f335fcfab1d2d663329ee92bbe92ef17bb2974a +size 17655 diff --git a/checkpoint-873/rng_state_3.pth b/checkpoint-873/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c53bf7580923c06845441729b49e5507be91bb03 --- /dev/null +++ b/checkpoint-873/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdaaf8272311352021464ae04e89c42ccfebc5134a39113f8e69a5112324620c +size 17655 diff --git a/checkpoint-873/special_tokens_map.json b/checkpoint-873/special_tokens_map.json 
new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-873/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-873/tokenizer.model b/checkpoint-873/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-873/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-873/tokenizer_config.json b/checkpoint-873/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..740756b4bef305e27d0bb4d2e1a40dd8847797f7 --- /dev/null +++ b/checkpoint-873/tokenizer_config.json @@ -0,0 +1,35 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "legacy": false, + "model_max_length": 2048, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-873/trainer_state.json 
b/checkpoint-873/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c0657dc0131af2b0b20375e7d226e99c47584af6 --- /dev/null +++ b/checkpoint-873/trainer_state.json @@ -0,0 +1,5254 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "global_step": 873, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.946, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.060606060606061e-06, + "loss": 1.908, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1083, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.2121212121212122e-05, + "loss": 2.3218, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.8338, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.1212121212121215e-05, + "loss": 2.1332, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.8593, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.5359, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7252, + "step": 11 + }, + { + "epoch": 0.03, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.4351, + "step": 12 + }, + { + "epoch": 0.03, + "learning_rate": 3.939393939393939e-05, + "loss": 1.2774, + "step": 13 + }, + { + "epoch": 0.03, + "learning_rate": 4.242424242424243e-05, + "loss": 1.5145, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1529, + "step": 15 + }, + { + "epoch": 0.04, + "learning_rate": 4.848484848484849e-05, + "loss": 1.0047, + 
"step": 16 + }, + { + "epoch": 0.04, + "learning_rate": 5.151515151515152e-05, + "loss": 1.3872, + "step": 17 + }, + { + "epoch": 0.04, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1229, + "step": 18 + }, + { + "epoch": 0.04, + "learning_rate": 5.757575757575758e-05, + "loss": 1.3386, + "step": 19 + }, + { + "epoch": 0.05, + "learning_rate": 6.060606060606061e-05, + "loss": 1.2493, + "step": 20 + }, + { + "epoch": 0.05, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1427, + "step": 21 + }, + { + "epoch": 0.05, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0895, + "step": 22 + }, + { + "epoch": 0.05, + "learning_rate": 6.96969696969697e-05, + "loss": 1.1989, + "step": 23 + }, + { + "epoch": 0.05, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0438, + "step": 24 + }, + { + "epoch": 0.06, + "learning_rate": 7.575757575757576e-05, + "loss": 1.176, + "step": 25 + }, + { + "epoch": 0.06, + "learning_rate": 7.878787878787879e-05, + "loss": 1.1372, + "step": 26 + }, + { + "epoch": 0.06, + "learning_rate": 8.181818181818183e-05, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.06, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9371, + "step": 28 + }, + { + "epoch": 0.07, + "learning_rate": 8.787878787878789e-05, + "loss": 1.2299, + "step": 29 + }, + { + "epoch": 0.07, + "learning_rate": 9.090909090909092e-05, + "loss": 0.9441, + "step": 30 + }, + { + "epoch": 0.07, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0011, + "step": 31 + }, + { + "epoch": 0.07, + "learning_rate": 9.696969696969698e-05, + "loss": 1.1704, + "step": 32 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001, + "loss": 1.1193, + "step": 33 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010303030303030303, + "loss": 1.1559, + "step": 34 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010606060606060606, + "loss": 0.8677, + "step": 35 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010909090909090909, + "loss": 1.0865, + "step": 36 + }, + { + "epoch": 
0.08, + "learning_rate": 0.00011212121212121212, + "loss": 1.0922, + "step": 37 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011515151515151516, + "loss": 0.9434, + "step": 38 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001181818181818182, + "loss": 0.9144, + "step": 39 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012121212121212122, + "loss": 0.9546, + "step": 40 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012424242424242425, + "loss": 1.0654, + "step": 41 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012727272727272728, + "loss": 0.8077, + "step": 42 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001303030303030303, + "loss": 1.0758, + "step": 43 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013333333333333334, + "loss": 1.1512, + "step": 44 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013636363636363637, + "loss": 0.84, + "step": 45 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001393939393939394, + "loss": 1.0567, + "step": 46 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014242424242424243, + "loss": 1.0165, + "step": 47 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014545454545454546, + "loss": 0.8678, + "step": 48 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014848484848484849, + "loss": 1.055, + "step": 49 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015151515151515152, + "loss": 1.0669, + "step": 50 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015454545454545454, + "loss": 0.9915, + "step": 51 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015757575757575757, + "loss": 0.993, + "step": 52 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001606060606060606, + "loss": 1.1085, + "step": 53 + }, + { + "epoch": 0.12, + "learning_rate": 0.00016363636363636366, + "loss": 0.9391, + "step": 54 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001666666666666667, + "loss": 0.975, + "step": 55 + }, + { + "epoch": 0.13, + "learning_rate": 0.00016969696969696972, + "loss": 1.0697, + "step": 56 + }, + { + "epoch": 0.13, + 
"learning_rate": 0.00017272727272727275, + "loss": 0.9462, + "step": 57 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017575757575757578, + "loss": 1.1209, + "step": 58 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001787878787878788, + "loss": 1.0648, + "step": 59 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018181818181818183, + "loss": 0.9964, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018484848484848484, + "loss": 0.8451, + "step": 61 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001878787878787879, + "loss": 0.8437, + "step": 62 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019090909090909092, + "loss": 1.1271, + "step": 63 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019393939393939395, + "loss": 1.161, + "step": 64 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019696969696969698, + "loss": 1.0032, + "step": 65 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 66 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999988957695886, + "loss": 0.9543, + "step": 67 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999955830807923, + "loss": 1.0274, + "step": 68 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999900619409279, + "loss": 0.9334, + "step": 69 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999982332362188, + "loss": 1.0398, + "step": 70 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999723943616433, + "loss": 0.9049, + "step": 71 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999602479612417, + "loss": 0.7452, + "step": 72 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999458931878073, + "loss": 0.8762, + "step": 73 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999293300730427, + "loss": 1.0941, + "step": 74 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019999105586535268, + "loss": 0.7713, + "step": 75 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998895789707154, + "loss": 0.9233, + "step": 76 + }, + { + "epoch": 0.18, + "learning_rate": 
0.00019998663910709416, + "loss": 0.8634, + "step": 77 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998409950054146, + "loss": 0.9697, + "step": 78 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019998133908302209, + "loss": 1.0816, + "step": 79 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001999783578606323, + "loss": 0.9659, + "step": 80 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997515583995603, + "loss": 0.9644, + "step": 81 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019997173302806478, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996808943251773, + "loss": 1.0016, + "step": 83 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999642250613616, + "loss": 0.8951, + "step": 84 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019996013992313073, + "loss": 1.0157, + "step": 85 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995583402684694, + "loss": 0.9414, + "step": 86 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019995130738201966, + "loss": 0.8097, + "step": 87 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019994655999864582, + "loss": 0.8606, + "step": 88 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001999415918872098, + "loss": 1.0427, + "step": 89 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993640305868352, + "loss": 0.9578, + "step": 90 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019993099352452623, + "loss": 1.1097, + "step": 91 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019992536329668478, + "loss": 0.8119, + "step": 92 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019991951238759325, + "loss": 0.9915, + "step": 93 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001999134408101731, + "loss": 0.838, + "step": 94 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990714857783326, + "loss": 0.8935, + "step": 95 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019990063570446984, + "loss": 0.7914, + "step": 96 + }, + { + "epoch": 0.22, + "learning_rate": 
0.00019989390220446622, + "loss": 0.8724, + "step": 97 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019988694809269314, + "loss": 1.0374, + "step": 98 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987977338450845, + "loss": 0.9028, + "step": 99 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019987237809575723, + "loss": 0.9986, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019986476224277165, + "loss": 1.113, + "step": 101 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985692584237108, + "loss": 0.8395, + "step": 102 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019984886891186184, + "loss": 1.0134, + "step": 103 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001998405914690374, + "loss": 0.8845, + "step": 104 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019983209353217812, + "loss": 0.7507, + "step": 105 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019982337512005138, + "loss": 0.9073, + "step": 106 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019981443625191148, + "loss": 0.9973, + "step": 107 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019980527694749952, + "loss": 1.0733, + "step": 108 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019979589722704346, + "loss": 0.9148, + "step": 109 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019978629711125812, + "loss": 0.8385, + "step": 110 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019977647662134488, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019976643577899195, + "loss": 0.9002, + "step": 112 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019975617460637416, + "loss": 0.8754, + "step": 113 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001997456931261529, + "loss": 0.8886, + "step": 114 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019973499136147606, + "loss": 1.0058, + "step": 115 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019972406933597812, + "loss": 0.9276, + "step": 116 + }, + { + "epoch": 0.27, + 
"learning_rate": 0.00019971292707377991, + "loss": 0.9922, + "step": 117 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019970156459948873, + "loss": 0.9507, + "step": 118 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996899819381981, + "loss": 0.9619, + "step": 119 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019967817911548794, + "loss": 0.8163, + "step": 120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019966615615742424, + "loss": 1.0647, + "step": 121 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996539130905593, + "loss": 0.9348, + "step": 122 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019964144994193142, + "loss": 1.0523, + "step": 123 + }, + { + "epoch": 0.28, + "learning_rate": 0.000199628766739065, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019961586350997033, + "loss": 1.0227, + "step": 125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001996027402831438, + "loss": 1.006, + "step": 126 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958939708756746, + "loss": 0.9082, + "step": 127 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957583395270923, + "loss": 0.8756, + "step": 128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995620509085228, + "loss": 0.8311, + "step": 129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954804798544745, + "loss": 1.0332, + "step": 130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019953382521440815, + "loss": 0.9427, + "step": 131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019951938262681527, + "loss": 0.838, + "step": 132 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995047202545647, + "loss": 0.8509, + "step": 133 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019948983813003774, + "loss": 0.8944, + "step": 134 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019947473628610099, + "loss": 0.9569, + "step": 135 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019945941475610623, + "loss": 0.7805, + "step": 136 + }, + { + "epoch": 
0.31, + "learning_rate": 0.00019944387357389052, + "loss": 0.9337, + "step": 137 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994281127737759, + "loss": 0.8712, + "step": 138 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994121323905695, + "loss": 0.9264, + "step": 139 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001993959324595634, + "loss": 0.9323, + "step": 140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019937951301653444, + "loss": 0.8331, + "step": 141 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993628740977444, + "loss": 0.902, + "step": 142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001993460157399396, + "loss": 0.8676, + "step": 143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019932893798035116, + "loss": 0.8525, + "step": 144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019931164085669456, + "loss": 0.8571, + "step": 145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019929412440716985, + "loss": 1.0006, + "step": 146 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019927638867046142, + "loss": 0.9849, + "step": 147 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019925843368573794, + "loss": 0.9064, + "step": 148 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992402594926523, + "loss": 0.9716, + "step": 149 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001992218661313415, + "loss": 0.7553, + "step": 150 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019920325364242654, + "loss": 0.7921, + "step": 151 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019918442206701245, + "loss": 0.7994, + "step": 152 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001991653714466879, + "loss": 0.8296, + "step": 153 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019914610182352548, + "loss": 0.8116, + "step": 154 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019912661324008148, + "loss": 0.9844, + "step": 155 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019910690573939557, + "loss": 0.865, + "step": 156 + }, + { 
+ "epoch": 0.36, + "learning_rate": 0.00019908697936499103, + "loss": 0.959, + "step": 157 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019906683416087448, + "loss": 0.7727, + "step": 158 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019904647017153582, + "loss": 0.707, + "step": 159 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019902588744194813, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019900508601756756, + "loss": 0.9146, + "step": 161 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989840659443332, + "loss": 0.9571, + "step": 162 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001989628272686671, + "loss": 0.8537, + "step": 163 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019894137003747403, + "loss": 0.828, + "step": 164 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019891969429814145, + "loss": 0.8055, + "step": 165 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988978000985394, + "loss": 0.8432, + "step": 166 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001988756874870203, + "loss": 0.8101, + "step": 167 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019885335651241903, + "loss": 0.9072, + "step": 168 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001988308072240527, + "loss": 0.7862, + "step": 169 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019880803967172047, + "loss": 0.8303, + "step": 170 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019878505390570362, + "loss": 0.9489, + "step": 171 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001987618499767653, + "loss": 1.0125, + "step": 172 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001987384279361505, + "loss": 0.809, + "step": 173 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019871478783558587, + "loss": 0.9488, + "step": 174 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986909297272796, + "loss": 0.9664, + "step": 175 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001986668536639215, + "loss": 0.9657, + "step": 176 + }, 
+ { + "epoch": 0.41, + "learning_rate": 0.0001986425596986825, + "loss": 0.8123, + "step": 177 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019861804788521493, + "loss": 0.9482, + "step": 178 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019859331827765212, + "loss": 0.879, + "step": 179 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019856837093060848, + "loss": 0.896, + "step": 180 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019854320589917927, + "loss": 1.0729, + "step": 181 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019851782323894042, + "loss": 0.9844, + "step": 182 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001984922230059486, + "loss": 0.9131, + "step": 183 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019846640525674082, + "loss": 0.9417, + "step": 184 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019844037004833473, + "loss": 0.9633, + "step": 185 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001984141174382279, + "loss": 0.968, + "step": 186 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019838764748439827, + "loss": 0.8447, + "step": 187 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019836096024530373, + "loss": 0.8638, + "step": 188 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019833405577988195, + "loss": 0.9346, + "step": 189 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001983069341475504, + "loss": 0.8969, + "step": 190 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019827959540820613, + "loss": 0.8499, + "step": 191 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019825203962222572, + "loss": 0.8041, + "step": 192 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019822426685046497, + "loss": 0.9216, + "step": 193 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019819627715425903, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198168070595422, + "loss": 0.8969, + "step": 195 + }, + { + "epoch": 0.45, + "learning_rate": 0.000198139647236247, + "loss": 0.7949, + "step": 
196 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019811100713950587, + "loss": 0.8996, + "step": 197 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019808215036844917, + "loss": 0.9118, + "step": 198 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001980530769868059, + "loss": 0.7355, + "step": 199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019802378705878354, + "loss": 0.8344, + "step": 200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019799428064906774, + "loss": 0.9639, + "step": 201 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001979645578228222, + "loss": 0.852, + "step": 202 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001979346186456887, + "loss": 0.8493, + "step": 203 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019790446318378665, + "loss": 0.851, + "step": 204 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019787409150371328, + "loss": 0.7161, + "step": 205 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019784350367254322, + "loss": 0.9846, + "step": 206 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001978126997578285, + "loss": 0.7883, + "step": 207 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019778167982759833, + "loss": 0.8691, + "step": 208 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019775044395035907, + "loss": 0.928, + "step": 209 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001977189921950939, + "loss": 0.8244, + "step": 210 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001976873246312628, + "loss": 1.0413, + "step": 211 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976554413288023, + "loss": 0.8261, + "step": 212 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001976233423581255, + "loss": 0.823, + "step": 213 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019759102779012166, + "loss": 0.9386, + "step": 214 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019755849769615628, + "loss": 0.8156, + "step": 215 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019752575214807076, + "loss": 0.8556, + 
"step": 216 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019749279121818235, + "loss": 0.7769, + "step": 217 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019745961497928406, + "loss": 1.0772, + "step": 218 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019742622350464418, + "loss": 0.8147, + "step": 219 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001973926168680066, + "loss": 0.9529, + "step": 220 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019735879514359018, + "loss": 0.8688, + "step": 221 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019732475840608888, + "loss": 0.9647, + "step": 222 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019729050673067156, + "loss": 0.837, + "step": 223 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019725604019298163, + "loss": 0.9211, + "step": 224 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019722135886913715, + "loss": 0.9434, + "step": 225 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001971864628357304, + "loss": 0.6506, + "step": 226 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019715135216982798, + "loss": 0.8052, + "step": 227 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019711602694897037, + "loss": 0.7852, + "step": 228 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019708048725117192, + "loss": 0.9283, + "step": 229 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001970447331549207, + "loss": 0.9081, + "step": 230 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019700876473917824, + "loss": 0.9036, + "step": 231 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019697258208337934, + "loss": 0.716, + "step": 232 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019693618526743197, + "loss": 0.8192, + "step": 233 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968995743717171, + "loss": 0.9773, + "step": 234 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019686274947708848, + "loss": 0.8698, + "step": 235 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001968257106648724, + "loss": 
0.9062, + "step": 236 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019678845801686764, + "loss": 0.8984, + "step": 237 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019675099161534521, + "loss": 0.8087, + "step": 238 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019671331154304822, + "loss": 0.8272, + "step": 239 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019667541788319162, + "loss": 0.784, + "step": 240 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019663731071946206, + "loss": 0.8777, + "step": 241 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019659899013601772, + "loss": 0.8534, + "step": 242 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019656045621748808, + "loss": 0.9645, + "step": 243 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019652170904897387, + "loss": 0.9692, + "step": 244 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019648274871604662, + "loss": 0.838, + "step": 245 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019644357530474872, + "loss": 0.7445, + "step": 246 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001964041889015931, + "loss": 0.9065, + "step": 247 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019636458959356316, + "loss": 0.7806, + "step": 248 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019632477746811232, + "loss": 0.7971, + "step": 249 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019628475261316417, + "loss": 0.8409, + "step": 250 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019624451511711198, + "loss": 0.7432, + "step": 251 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019620406506881875, + "loss": 0.9096, + "step": 252 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019616340255761676, + "loss": 0.8004, + "step": 253 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019612252767330763, + "loss": 0.7978, + "step": 254 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001960814405061619, + "loss": 0.9535, + "step": 255 + }, + { + "epoch": 0.59, + "learning_rate": 
0.000196040141146919, + "loss": 0.9945, + "step": 256 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001959986296867869, + "loss": 0.9703, + "step": 257 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019595690621744208, + "loss": 0.9639, + "step": 258 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019591497083102914, + "loss": 0.9312, + "step": 259 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019587282362016083, + "loss": 0.7709, + "step": 260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001958304646779175, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019578789409784727, + "loss": 0.8081, + "step": 262 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019574511197396563, + "loss": 0.8476, + "step": 263 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019570211840075517, + "loss": 0.9658, + "step": 264 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019565891347316552, + "loss": 0.7778, + "step": 265 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001956154972866131, + "loss": 0.9926, + "step": 266 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001955718699369808, + "loss": 0.957, + "step": 267 + }, + { + "epoch": 0.61, + "learning_rate": 0.000195528031520618, + "loss": 0.9396, + "step": 268 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019548398213434007, + "loss": 0.9049, + "step": 269 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019543972187542833, + "loss": 0.9683, + "step": 270 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019539525084162992, + "loss": 0.8555, + "step": 271 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019535056913115725, + "loss": 0.8489, + "step": 272 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001953056768426882, + "loss": 0.8728, + "step": 273 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019526057407536564, + "loss": 0.9443, + "step": 274 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019521526092879725, + "loss": 0.8161, + "step": 275 + }, + { + "epoch": 0.63, + 
"learning_rate": 0.00019516973750305532, + "loss": 0.8936, + "step": 276 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019512400389867657, + "loss": 0.8315, + "step": 277 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019507806021666188, + "loss": 0.9298, + "step": 278 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019503190655847604, + "loss": 0.8235, + "step": 279 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019498554302604766, + "loss": 0.9245, + "step": 280 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001949389697217687, + "loss": 0.8302, + "step": 281 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019489218674849455, + "loss": 0.8488, + "step": 282 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019484519420954354, + "loss": 0.8177, + "step": 283 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019479799220869682, + "loss": 1.0039, + "step": 284 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019475058085019825, + "loss": 0.7685, + "step": 285 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019470296023875387, + "loss": 0.9174, + "step": 286 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194655130479532, + "loss": 1.0997, + "step": 287 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019460709167816274, + "loss": 0.9759, + "step": 288 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001945588439407379, + "loss": 0.9397, + "step": 289 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019451038737381077, + "loss": 1.0367, + "step": 290 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019446172208439574, + "loss": 0.8298, + "step": 291 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001944128481799682, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019436376576846423, + "loss": 1.1234, + "step": 293 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431447495828045, + "loss": 0.9103, + "step": 294 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001942649758582737, + "loss": 0.7841, + "step": 295 + }, + { + 
"epoch": 0.68, + "learning_rate": 0.00019421526857776072, + "loss": 0.8817, + "step": 296 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416535322651818, + "loss": 1.0682, + "step": 297 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019411522991478214, + "loss": 0.9201, + "step": 298 + }, + { + "epoch": 0.68, + "learning_rate": 0.000194064898753248, + "loss": 4.1834, + "step": 299 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019401435985307012, + "loss": 1.0391, + "step": 300 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019396361332586166, + "loss": 2.5015, + "step": 301 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001939126592836944, + "loss": 0.7927, + "step": 302 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001938614978390983, + "loss": 2.2345, + "step": 303 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019381012910506146, + "loss": 0.9311, + "step": 304 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019375855319502962, + "loss": 0.9713, + "step": 305 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019370677022290624, + "loss": 0.8967, + "step": 306 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019365478030305196, + "loss": 3.095, + "step": 307 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001936025835502845, + "loss": 1.1008, + "step": 308 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001935501800798783, + "loss": 1.5409, + "step": 309 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019349757000756444, + "loss": 1.02, + "step": 310 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019344475344953012, + "loss": 1.0101, + "step": 311 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001933917305224187, + "loss": 0.7686, + "step": 312 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001933385013433292, + "loss": 1.1061, + "step": 313 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932850660298162, + "loss": 0.8083, + "step": 314 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001932314246998895, + "loss": 1.1942, + "step": 315 + }, + 
{ + "epoch": 0.72, + "learning_rate": 0.00019317757747201384, + "loss": 0.8551, + "step": 316 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019312352446510878, + "loss": 0.9049, + "step": 317 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019306926579854821, + "loss": 0.7072, + "step": 318 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019301480159216028, + "loss": 0.8552, + "step": 319 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019296013196622706, + "loss": 0.8414, + "step": 320 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001929052570414843, + "loss": 0.9198, + "step": 321 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019285017693912107, + "loss": 2.1953, + "step": 322 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019279489178077969, + "loss": 0.851, + "step": 323 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019273940168855518, + "loss": 1.0239, + "step": 324 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019268370678499533, + "loss": 1.5125, + "step": 325 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019262780719310008, + "loss": 0.9171, + "step": 326 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019257170303632148, + "loss": 0.9794, + "step": 327 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019251539443856344, + "loss": 0.9023, + "step": 328 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019245888152418124, + "loss": 1.058, + "step": 329 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019240216441798142, + "loss": 0.9411, + "step": 330 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001923452432452215, + "loss": 1.197, + "step": 331 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922881181316097, + "loss": 0.9253, + "step": 332 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001922307892033046, + "loss": 1.156, + "step": 333 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019217325658691482, + "loss": 0.9424, + "step": 334 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019211552040949891, + "loss": 1.1147, + 
"step": 335 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019205758079856498, + "loss": 0.8528, + "step": 336 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001919994378820704, + "loss": 0.8105, + "step": 337 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019194109178842153, + "loss": 0.9279, + "step": 338 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019188254264647337, + "loss": 0.9231, + "step": 339 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019182379058552948, + "loss": 1.0425, + "step": 340 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019176483573534142, + "loss": 0.8794, + "step": 341 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019170567822610873, + "loss": 0.9873, + "step": 342 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001916463181884784, + "loss": 0.8146, + "step": 343 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019158675575354478, + "loss": 1.027, + "step": 344 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019152699105284913, + "loss": 0.8093, + "step": 345 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001914670242183795, + "loss": 0.951, + "step": 346 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019140685538257028, + "loss": 0.9268, + "step": 347 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019134648467830198, + "loss": 1.0205, + "step": 348 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019128591223890092, + "loss": 0.9043, + "step": 349 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019122513819813902, + "loss": 0.7387, + "step": 350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001911641626902333, + "loss": 0.9422, + "step": 351 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019110298584984578, + "loss": 0.9015, + "step": 352 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001910416078120832, + "loss": 0.7522, + "step": 353 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019098002871249646, + "loss": 0.9722, + "step": 354 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001909182486870806, + "loss": 
0.8358, + "step": 355 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019085626787227443, + "loss": 0.9859, + "step": 356 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019079408640496013, + "loss": 0.7796, + "step": 357 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019073170442246302, + "loss": 0.8617, + "step": 358 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906691220625513, + "loss": 0.7727, + "step": 359 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001906063394634356, + "loss": 0.8786, + "step": 360 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001905433567637689, + "loss": 0.9117, + "step": 361 + }, + { + "epoch": 0.83, + "learning_rate": 0.000190480174102646, + "loss": 0.9182, + "step": 362 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001904167916196033, + "loss": 0.9706, + "step": 363 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001903532094546186, + "loss": 0.8036, + "step": 364 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001902894277481105, + "loss": 0.902, + "step": 365 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019022544664093854, + "loss": 0.9231, + "step": 366 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019016126627440237, + "loss": 0.9751, + "step": 367 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001900968867902419, + "loss": 0.8373, + "step": 368 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001900323083306367, + "loss": 0.8695, + "step": 369 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001899675310382057, + "loss": 0.8654, + "step": 370 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018990255505600706, + "loss": 0.98, + "step": 371 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018983738052753767, + "loss": 0.7454, + "step": 372 + }, + { + "epoch": 0.85, + "learning_rate": 0.00018977200759673295, + "loss": 0.829, + "step": 373 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018970643640796642, + "loss": 0.8262, + "step": 374 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001896406671060495, + 
"loss": 1.0659, + "step": 375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018957469983623112, + "loss": 0.8551, + "step": 376 + }, + { + "epoch": 0.86, + "learning_rate": 0.00018950853474419742, + "loss": 0.7991, + "step": 377 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001894421719760714, + "loss": 0.8662, + "step": 378 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018937561167841263, + "loss": 0.8817, + "step": 379 + }, + { + "epoch": 0.87, + "learning_rate": 0.00018930885399821693, + "loss": 1.0894, + "step": 380 + }, + { + "epoch": 0.87, + "learning_rate": 0.000189241899082916, + "loss": 0.8225, + "step": 381 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018917474708037718, + "loss": 0.9065, + "step": 382 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018910739813890302, + "loss": 0.8779, + "step": 383 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018903985240723104, + "loss": 0.7909, + "step": 384 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018897211003453328, + "loss": 0.7649, + "step": 385 + }, + { + "epoch": 0.88, + "learning_rate": 0.00018890417117041619, + "loss": 0.9788, + "step": 386 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018883603596492004, + "loss": 0.938, + "step": 387 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018876770456851877, + "loss": 0.9032, + "step": 388 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018869917713211964, + "loss": 0.9059, + "step": 389 + }, + { + "epoch": 0.89, + "learning_rate": 0.00018863045380706274, + "loss": 0.8896, + "step": 390 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001885615347451209, + "loss": 0.7614, + "step": 391 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884924200984991, + "loss": 0.978, + "step": 392 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001884231100198344, + "loss": 0.9406, + "step": 393 + }, + { + "epoch": 0.9, + "learning_rate": 0.00018835360466219533, + "loss": 0.7555, + "step": 394 + }, + { + "epoch": 0.9, + "learning_rate": 
0.0001882839041790818, + "loss": 0.9049, + "step": 395 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018821400872442458, + "loss": 0.7041, + "step": 396 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018814391845258505, + "loss": 0.8995, + "step": 397 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001880736335183548, + "loss": 0.7461, + "step": 398 + }, + { + "epoch": 0.91, + "learning_rate": 0.00018800315407695539, + "loss": 0.9954, + "step": 399 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018793248028403788, + "loss": 0.9035, + "step": 400 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001878616122956826, + "loss": 0.9083, + "step": 401 + }, + { + "epoch": 0.92, + "learning_rate": 0.00018779055026839868, + "loss": 0.7286, + "step": 402 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001877192943591239, + "loss": 0.8001, + "step": 403 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018764784472522403, + "loss": 0.8795, + "step": 404 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001875762015244929, + "loss": 0.8912, + "step": 405 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018750436491515163, + "loss": 0.8848, + "step": 406 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018743233505584862, + "loss": 0.8512, + "step": 407 + }, + { + "epoch": 0.93, + "learning_rate": 0.00018736011210565898, + "loss": 0.8537, + "step": 408 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018728769622408423, + "loss": 0.8777, + "step": 409 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018721508757105202, + "loss": 0.7849, + "step": 410 + }, + { + "epoch": 0.94, + "learning_rate": 0.00018714228630691576, + "loss": 0.9669, + "step": 411 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001870692925924541, + "loss": 0.9299, + "step": 412 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018699610658887088, + "loss": 1.0188, + "step": 413 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018692272845779448, + "loss": 0.8388, + "step": 414 + }, + { + "epoch": 0.95, + 
"learning_rate": 0.00018684915836127765, + "loss": 0.7904, + "step": 415 + }, + { + "epoch": 0.95, + "learning_rate": 0.00018677539646179707, + "loss": 0.9689, + "step": 416 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018670144292225297, + "loss": 0.7339, + "step": 417 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018662729790596888, + "loss": 0.7894, + "step": 418 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018655296157669117, + "loss": 0.7163, + "step": 419 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018647843409858869, + "loss": 0.8642, + "step": 420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00018640371563625246, + "loss": 0.9281, + "step": 421 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018632880635469526, + "loss": 0.834, + "step": 422 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018625370641935129, + "loss": 0.7316, + "step": 423 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018617841599607586, + "loss": 0.8504, + "step": 424 + }, + { + "epoch": 0.97, + "learning_rate": 0.00018610293525114492, + "loss": 0.8731, + "step": 425 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018602726435125474, + "loss": 0.8803, + "step": 426 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001859514034635215, + "loss": 0.8417, + "step": 427 + }, + { + "epoch": 0.98, + "learning_rate": 0.000185875352755481, + "loss": 0.8947, + "step": 428 + }, + { + "epoch": 0.98, + "learning_rate": 0.00018579911239508827, + "loss": 0.8368, + "step": 429 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018572268255071718, + "loss": 0.8231, + "step": 430 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018564606339116, + "loss": 0.8576, + "step": 431 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001855692550856272, + "loss": 0.8753, + "step": 432 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018549225780374685, + "loss": 0.7778, + "step": 433 + }, + { + "epoch": 0.99, + "learning_rate": 0.00018541507171556445, + "loss": 0.7516, + "step": 434 + }, + { + 
"epoch": 1.0, + "learning_rate": 0.0001853376969915425, + "loss": 0.7466, + "step": 435 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018526013380255999, + "loss": 0.917, + "step": 436 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018518238231991218, + "loss": 0.9042, + "step": 437 + }, + { + "epoch": 1.0, + "learning_rate": 0.00018510444271531022, + "loss": 0.8587, + "step": 438 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018502631516088066, + "loss": 0.9001, + "step": 439 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001849479998291651, + "loss": 0.7977, + "step": 440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018486949689311993, + "loss": 0.8711, + "step": 441 + }, + { + "epoch": 1.01, + "learning_rate": 0.00018479080652611583, + "loss": 0.7192, + "step": 442 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001847119289019373, + "loss": 0.9608, + "step": 443 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018463286419478255, + "loss": 0.7097, + "step": 444 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001845536125792629, + "loss": 0.7354, + "step": 445 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001844741742304024, + "loss": 0.8711, + "step": 446 + }, + { + "epoch": 1.02, + "learning_rate": 0.00018439454932363755, + "loss": 0.8832, + "step": 447 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018431473803481684, + "loss": 0.932, + "step": 448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018423474054020034, + "loss": 0.8394, + "step": 449 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018415455701645942, + "loss": 0.7698, + "step": 450 + }, + { + "epoch": 1.03, + "learning_rate": 0.00018407418764067627, + "loss": 0.8856, + "step": 451 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018399363259034347, + "loss": 0.8529, + "step": 452 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018391289204336368, + "loss": 0.9898, + "step": 453 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018383196617804926, + "loss": 0.8312, + "step": 454 + 
}, + { + "epoch": 1.04, + "learning_rate": 0.00018375085517312182, + "loss": 0.8234, + "step": 455 + }, + { + "epoch": 1.04, + "learning_rate": 0.00018366955920771184, + "loss": 0.7871, + "step": 456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018358807846135825, + "loss": 0.9814, + "step": 457 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018350641311400812, + "loss": 0.8183, + "step": 458 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001834245633460161, + "loss": 0.8961, + "step": 459 + }, + { + "epoch": 1.05, + "learning_rate": 0.00018334252933814427, + "loss": 0.9166, + "step": 460 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018326031127156148, + "loss": 1.0031, + "step": 461 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018317790932784317, + "loss": 0.8171, + "step": 462 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001830953236889707, + "loss": 0.83, + "step": 463 + }, + { + "epoch": 1.06, + "learning_rate": 0.00018301255453733134, + "loss": 0.8134, + "step": 464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001829296020557174, + "loss": 0.8561, + "step": 465 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001828464664273263, + "loss": 0.8669, + "step": 466 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001827631478357597, + "loss": 1.003, + "step": 467 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018267964646502357, + "loss": 0.8715, + "step": 468 + }, + { + "epoch": 1.07, + "learning_rate": 0.00018259596249952731, + "loss": 0.7434, + "step": 469 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018251209612408373, + "loss": 0.9163, + "step": 470 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018242804752390844, + "loss": 1.0639, + "step": 471 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018234381688461942, + "loss": 0.8266, + "step": 472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00018225940439223684, + "loss": 0.7582, + "step": 473 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001821748102331823, + "loss": 0.8547, + 
"step": 474 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001820900345942787, + "loss": 0.7908, + "step": 475 + }, + { + "epoch": 1.09, + "learning_rate": 0.00018200507766274977, + "loss": 0.6203, + "step": 476 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001819199396262195, + "loss": 0.806, + "step": 477 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001818346206727119, + "loss": 0.8016, + "step": 478 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001817491209906506, + "loss": 0.8548, + "step": 479 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018166344076885827, + "loss": 0.9194, + "step": 480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018157758019655634, + "loss": 0.8704, + "step": 481 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018149153946336446, + "loss": 0.8373, + "step": 482 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001814053187593003, + "loss": 0.8229, + "step": 483 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018131891827477884, + "loss": 0.8289, + "step": 484 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018123233820061218, + "loss": 0.7753, + "step": 485 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018114557872800905, + "loss": 1.029, + "step": 486 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001810586400485743, + "loss": 0.6198, + "step": 487 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001809715223543087, + "loss": 0.8418, + "step": 488 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018088422583760813, + "loss": 0.7421, + "step": 489 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001807967506912636, + "loss": 0.8032, + "step": 490 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018070909710846052, + "loss": 0.7956, + "step": 491 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018062126528277844, + "loss": 0.9013, + "step": 492 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018053325540819045, + "loss": 0.9582, + "step": 493 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018044506767906295, + "loss": 
0.6845, + "step": 494 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018035670229015507, + "loss": 0.8731, + "step": 495 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001802681594366183, + "loss": 0.8369, + "step": 496 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018017943931399603, + "loss": 0.6557, + "step": 497 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018009054211822324, + "loss": 0.7997, + "step": 498 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001800014680456259, + "loss": 0.8348, + "step": 499 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001799122172929206, + "loss": 0.9043, + "step": 500 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017982279005721407, + "loss": 0.8499, + "step": 501 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017973318653600293, + "loss": 0.8595, + "step": 502 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017964340692717303, + "loss": 0.9468, + "step": 503 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001795534514289991, + "loss": 0.9848, + "step": 504 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017946332024014434, + "loss": 0.7326, + "step": 505 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017937301355965996, + "loss": 0.8479, + "step": 506 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017928253158698473, + "loss": 0.8669, + "step": 507 + }, + { + "epoch": 1.16, + "learning_rate": 0.00017919187452194454, + "loss": 0.8163, + "step": 508 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017910104256475194, + "loss": 0.926, + "step": 509 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017901003591600575, + "loss": 0.7956, + "step": 510 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017891885477669064, + "loss": 0.9002, + "step": 511 + }, + { + "epoch": 1.17, + "learning_rate": 0.00017882749934817652, + "loss": 0.787, + "step": 512 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017873596983221832, + "loss": 0.7519, + "step": 513 + }, + { + "epoch": 1.18, + "learning_rate": 
0.0001786442664309554, + "loss": 0.8067, + "step": 514 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017855238934691108, + "loss": 0.8824, + "step": 515 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001784603387829923, + "loss": 0.8014, + "step": 516 + }, + { + "epoch": 1.18, + "learning_rate": 0.00017836811494248919, + "loss": 0.6672, + "step": 517 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017827571802907444, + "loss": 0.8516, + "step": 518 + }, + { + "epoch": 1.19, + "learning_rate": 0.000178183148246803, + "loss": 0.8476, + "step": 519 + }, + { + "epoch": 1.19, + "learning_rate": 0.00017809040580011164, + "loss": 0.8493, + "step": 520 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001779974908938184, + "loss": 0.7288, + "step": 521 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017790440373312223, + "loss": 0.7443, + "step": 522 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017781114452360245, + "loss": 0.8767, + "step": 523 + }, + { + "epoch": 1.2, + "learning_rate": 0.00017771771347121842, + "loss": 0.8025, + "step": 524 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001776241107823089, + "loss": 0.8842, + "step": 525 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017753033666359177, + "loss": 0.9648, + "step": 526 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017743639132216353, + "loss": 0.7872, + "step": 527 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001773422749654988, + "loss": 0.9122, + "step": 528 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017724798780144983, + "loss": 0.7688, + "step": 529 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001771535300382461, + "loss": 0.8938, + "step": 530 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017705890188449394, + "loss": 0.7152, + "step": 531 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001769641035491759, + "loss": 0.7077, + "step": 532 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017686913524165036, + "loss": 0.8872, + "step": 533 + }, + { + "epoch": 1.22, + 
"learning_rate": 0.00017677399717165116, + "loss": 0.8775, + "step": 534 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017667868954928694, + "loss": 0.8508, + "step": 535 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017658321258504092, + "loss": 0.8589, + "step": 536 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017648756648977018, + "loss": 0.6499, + "step": 537 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017639175147470538, + "loss": 0.8927, + "step": 538 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017629576775145026, + "loss": 0.8702, + "step": 539 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017619961553198108, + "loss": 0.7958, + "step": 540 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017610329502864625, + "loss": 0.8582, + "step": 541 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017600680645416583, + "loss": 0.7905, + "step": 542 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001759101500216311, + "loss": 0.7574, + "step": 543 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017581332594450392, + "loss": 0.861, + "step": 544 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017571633443661658, + "loss": 0.7682, + "step": 545 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017561917571217093, + "loss": 0.7547, + "step": 546 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017552184998573825, + "loss": 0.7852, + "step": 547 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001754243574722586, + "loss": 0.7635, + "step": 548 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017532669838704035, + "loss": 0.8714, + "step": 549 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017522887294575977, + "loss": 0.7839, + "step": 550 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017513088136446054, + "loss": 0.8551, + "step": 551 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503272385955318, + "loss": 0.7367, + "step": 552 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017493440064781475, + "loss": 0.9257, + "step": 553 + }, + { + 
"epoch": 1.27, + "learning_rate": 0.00017483591194638817, + "loss": 0.8246, + "step": 554 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017473725797278192, + "loss": 0.8319, + "step": 555 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463843894486937, + "loss": 0.8304, + "step": 556 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017453945508088853, + "loss": 0.6536, + "step": 557 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017444030659944138, + "loss": 0.7606, + "step": 558 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017434099371949345, + "loss": 0.7084, + "step": 559 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017424151666037329, + "loss": 0.8891, + "step": 560 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017414187564177217, + "loss": 0.6199, + "step": 561 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017404207088374333, + "loss": 0.8676, + "step": 562 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001739421026067017, + "loss": 0.8477, + "step": 563 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017384197103142328, + "loss": 0.9234, + "step": 564 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001737416763790447, + "loss": 0.9103, + "step": 565 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017364121887106286, + "loss": 0.7859, + "step": 566 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017354059872933415, + "loss": 0.8623, + "step": 567 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017343981617607424, + "loss": 0.6266, + "step": 568 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017333887143385743, + "loss": 0.8105, + "step": 569 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017323776472561627, + "loss": 0.7752, + "step": 570 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001731364962746409, + "loss": 0.7873, + "step": 571 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001730350663045788, + "loss": 0.8425, + "step": 572 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017293347503943406, + "loss": 0.777, + "step": 573 
+ }, + { + "epoch": 1.32, + "learning_rate": 0.000172831722703567, + "loss": 0.7348, + "step": 574 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017272980952169365, + "loss": 0.7797, + "step": 575 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001726277357188853, + "loss": 0.8328, + "step": 576 + }, + { + "epoch": 1.32, + "learning_rate": 0.00017252550152056795, + "loss": 0.7109, + "step": 577 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001724231071525218, + "loss": 0.7905, + "step": 578 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017232055284088085, + "loss": 0.7541, + "step": 579 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001722178388121322, + "loss": 0.8954, + "step": 580 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017211496529311582, + "loss": 0.8362, + "step": 581 + }, + { + "epoch": 1.33, + "learning_rate": 0.00017201193251102382, + "loss": 0.8436, + "step": 582 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017190874069340014, + "loss": 0.7594, + "step": 583 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001718053900681397, + "loss": 0.9342, + "step": 584 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017170188086348848, + "loss": 0.8934, + "step": 585 + }, + { + "epoch": 1.34, + "learning_rate": 0.00017159821330804236, + "loss": 0.831, + "step": 586 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001714943876307472, + "loss": 0.8053, + "step": 587 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017139040406089786, + "loss": 0.81, + "step": 588 + }, + { + "epoch": 1.35, + "learning_rate": 0.000171286262828138, + "loss": 0.8245, + "step": 589 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017118196416245947, + "loss": 0.8232, + "step": 590 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017107750829420176, + "loss": 0.8244, + "step": 591 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001709728954540516, + "loss": 0.7863, + "step": 592 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017086812587304234, + "loss": 0.8274, + 
"step": 593 + }, + { + "epoch": 1.36, + "learning_rate": 0.00017076319978255345, + "loss": 0.6595, + "step": 594 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001706581174143101, + "loss": 0.8582, + "step": 595 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017055287900038263, + "loss": 0.6873, + "step": 596 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017044748477318593, + "loss": 0.8673, + "step": 597 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017034193496547902, + "loss": 0.8055, + "step": 598 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017023622981036455, + "loss": 0.8232, + "step": 599 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001701303695412881, + "loss": 0.8745, + "step": 600 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017002435439203808, + "loss": 0.8034, + "step": 601 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016991818459674468, + "loss": 0.9006, + "step": 602 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001698118603898798, + "loss": 0.7828, + "step": 603 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016970538200625622, + "loss": 0.8413, + "step": 604 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016959874968102735, + "loss": 0.8669, + "step": 605 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016949196364968646, + "loss": 0.9277, + "step": 606 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016938502414806634, + "loss": 0.9256, + "step": 607 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016927793141233868, + "loss": 0.8613, + "step": 608 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016917068567901358, + "loss": 0.9439, + "step": 609 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016906328718493906, + "loss": 0.8606, + "step": 610 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016895573616730044, + "loss": 0.7483, + "step": 611 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016884803286362, + "loss": 0.8359, + "step": 612 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001687401775117562, + "loss": 
0.7764, + "step": 613 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016863217034990342, + "loss": 0.9857, + "step": 614 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001685240116165912, + "loss": 0.8706, + "step": 615 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001684157015506839, + "loss": 0.867, + "step": 616 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016830724039138003, + "loss": 0.7974, + "step": 617 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016819862837821181, + "loss": 0.7835, + "step": 618 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016808986575104465, + "loss": 0.7987, + "step": 619 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001679809527500765, + "loss": 0.7383, + "step": 620 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001678718896158375, + "loss": 0.9224, + "step": 621 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016776267658918928, + "loss": 0.8959, + "step": 622 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016765331391132456, + "loss": 0.6702, + "step": 623 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001675438018237665, + "loss": 0.6911, + "step": 624 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016743414056836825, + "loss": 0.9364, + "step": 625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00016732433038731242, + "loss": 0.7902, + "step": 626 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016721437152311054, + "loss": 0.8473, + "step": 627 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016710426421860235, + "loss": 0.8765, + "step": 628 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016699400871695555, + "loss": 0.7705, + "step": 629 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016688360526166514, + "loss": 0.8653, + "step": 630 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001667730540965528, + "loss": 0.9137, + "step": 631 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016666235546576648, + "loss": 0.9772, + "step": 632 + }, + { + "epoch": 1.45, + "learning_rate": 
0.0001665515096137797, + "loss": 0.6433, + "step": 633 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001664405167853912, + "loss": 0.8096, + "step": 634 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016632937722572434, + "loss": 0.7298, + "step": 635 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016621809118022647, + "loss": 0.6841, + "step": 636 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016610665889466838, + "loss": 0.9471, + "step": 637 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016599508061514404, + "loss": 0.8396, + "step": 638 + }, + { + "epoch": 1.46, + "learning_rate": 0.00016588335658806962, + "loss": 0.8769, + "step": 639 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016577148706018328, + "loss": 0.8328, + "step": 640 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001656594722785445, + "loss": 0.8932, + "step": 641 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001655473124905335, + "loss": 0.8203, + "step": 642 + }, + { + "epoch": 1.47, + "learning_rate": 0.00016543500794385084, + "loss": 0.8514, + "step": 643 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016532255888651666, + "loss": 0.7396, + "step": 644 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016520996556687028, + "loss": 0.9178, + "step": 645 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001650972282335697, + "loss": 0.6308, + "step": 646 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016498434713559088, + "loss": 0.9018, + "step": 647 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016487132252222727, + "loss": 0.8658, + "step": 648 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016475815464308933, + "loss": 0.8228, + "step": 649 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001646448437481039, + "loss": 0.8944, + "step": 650 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001645313900875136, + "loss": 0.8617, + "step": 651 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016441779391187646, + "loss": 0.9726, + "step": 652 + }, + { + "epoch": 1.5, + 
"learning_rate": 0.00016430405547206516, + "loss": 0.693, + "step": 653 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016419017501926656, + "loss": 0.8272, + "step": 654 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016407615280498124, + "loss": 0.8523, + "step": 655 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016396198908102272, + "loss": 0.7444, + "step": 656 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016384768409951714, + "loss": 0.8366, + "step": 657 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001637332381129026, + "loss": 0.7441, + "step": 658 + }, + { + "epoch": 1.51, + "learning_rate": 0.00016361865137392854, + "loss": 0.6694, + "step": 659 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001635039241356553, + "loss": 0.8103, + "step": 660 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001633890566514535, + "loss": 0.9135, + "step": 661 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016327404917500346, + "loss": 0.7327, + "step": 662 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016315890196029467, + "loss": 0.8425, + "step": 663 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016304361526162534, + "loss": 0.8812, + "step": 664 + }, + { + "epoch": 1.52, + "learning_rate": 0.00016292818933360151, + "loss": 0.777, + "step": 665 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001628126244311369, + "loss": 0.8864, + "step": 666 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016269692080945198, + "loss": 0.9333, + "step": 667 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016258107872407375, + "loss": 0.906, + "step": 668 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016246509843083492, + "loss": 0.7346, + "step": 669 + }, + { + "epoch": 1.53, + "learning_rate": 0.00016234898018587337, + "loss": 0.8555, + "step": 670 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016223272424563173, + "loss": 0.8449, + "step": 671 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016211633086685664, + "loss": 0.8559, + "step": 672 + }, + { + 
"epoch": 1.54, + "learning_rate": 0.00016199980030659838, + "loss": 0.7468, + "step": 673 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016188313282221008, + "loss": 0.7986, + "step": 674 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001617663286713474, + "loss": 0.7757, + "step": 675 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016164938811196757, + "loss": 0.8789, + "step": 676 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016153231140232936, + "loss": 0.5499, + "step": 677 + }, + { + "epoch": 1.55, + "learning_rate": 0.00016141509880099206, + "loss": 0.9319, + "step": 678 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016129775056681513, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001611802669589575, + "loss": 0.8506, + "step": 680 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016106264823687716, + "loss": 0.7242, + "step": 681 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016094489466033043, + "loss": 0.6808, + "step": 682 + }, + { + "epoch": 1.56, + "learning_rate": 0.00016082700648937146, + "loss": 0.8017, + "step": 683 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016070898398435167, + "loss": 0.9109, + "step": 684 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016059082740591915, + "loss": 0.7277, + "step": 685 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016047253701501808, + "loss": 0.8601, + "step": 686 + }, + { + "epoch": 1.57, + "learning_rate": 0.00016035411307288813, + "loss": 0.9118, + "step": 687 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001602355558410639, + "loss": 0.8049, + "step": 688 + }, + { + "epoch": 1.58, + "learning_rate": 0.00016011686558137448, + "loss": 0.8174, + "step": 689 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015999804255594258, + "loss": 0.8481, + "step": 690 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001598790870271843, + "loss": 0.7052, + "step": 691 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015975999925780813, + "loss": 0.8208, + 
"step": 692 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015964077951081485, + "loss": 0.7257, + "step": 693 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015952142804949652, + "loss": 0.858, + "step": 694 + }, + { + "epoch": 1.59, + "learning_rate": 0.00015940194513743624, + "loss": 0.9242, + "step": 695 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001592823310385073, + "loss": 0.7924, + "step": 696 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015916258601687274, + "loss": 0.8788, + "step": 697 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001590427103369848, + "loss": 0.7946, + "step": 698 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015892270426358414, + "loss": 0.8318, + "step": 699 + }, + { + "epoch": 1.6, + "learning_rate": 0.00015880256806169953, + "loss": 0.8983, + "step": 700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015868230199664711, + "loss": 0.8889, + "step": 701 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015856190633402968, + "loss": 0.9692, + "step": 702 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001584413813397364, + "loss": 0.7787, + "step": 703 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015832072727994193, + "loss": 0.6455, + "step": 704 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015819994442110616, + "loss": 1.0006, + "step": 705 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015807903302997317, + "loss": 0.7384, + "step": 706 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015795799337357114, + "loss": 0.8517, + "step": 707 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015783682571921133, + "loss": 0.8446, + "step": 708 + }, + { + "epoch": 1.62, + "learning_rate": 0.00015771553033448775, + "loss": 0.8227, + "step": 709 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015759410748727662, + "loss": 0.8374, + "step": 710 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001574725574457354, + "loss": 0.7274, + "step": 711 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015735088047830268, + 
"loss": 0.8728, + "step": 712 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015722907685369723, + "loss": 1.0569, + "step": 713 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015710714684091762, + "loss": 0.9775, + "step": 714 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001569850907092415, + "loss": 0.6832, + "step": 715 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015686290872822504, + "loss": 0.7358, + "step": 716 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015674060116770236, + "loss": 0.9015, + "step": 717 + }, + { + "epoch": 1.64, + "learning_rate": 0.00015661816829778494, + "loss": 0.8516, + "step": 718 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015649561038886094, + "loss": 0.8911, + "step": 719 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015637292771159472, + "loss": 0.7098, + "step": 720 + }, + { + "epoch": 1.65, + "learning_rate": 0.00015625012053692615, + "loss": 0.955, + "step": 721 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001561271891360701, + "loss": 0.6421, + "step": 722 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001560041337805157, + "loss": 0.8807, + "step": 723 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015588095474202595, + "loss": 0.722, + "step": 724 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015575765229263686, + "loss": 0.8055, + "step": 725 + }, + { + "epoch": 1.66, + "learning_rate": 0.00015563422670465712, + "loss": 0.7822, + "step": 726 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015551067825066728, + "loss": 0.8311, + "step": 727 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015538700720351924, + "loss": 0.8519, + "step": 728 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015526321383633568, + "loss": 0.7506, + "step": 729 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001551392984225094, + "loss": 0.8056, + "step": 730 + }, + { + "epoch": 1.67, + "learning_rate": 0.00015501526123570277, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 1.68, + "learning_rate": 
0.000154891102549847, + "loss": 0.829, + "step": 732 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001547668226391417, + "loss": 0.6682, + "step": 733 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015464242177805422, + "loss": 0.8295, + "step": 734 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015451790024131895, + "loss": 0.6911, + "step": 735 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015439325830393687, + "loss": 0.6785, + "step": 736 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015426849624117472, + "loss": 0.81, + "step": 737 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015414361432856475, + "loss": 0.9955, + "step": 738 + }, + { + "epoch": 1.69, + "learning_rate": 0.00015401861284190368, + "loss": 0.8433, + "step": 739 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015389349205725242, + "loss": 0.618, + "step": 740 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015376825225093537, + "loss": 0.7747, + "step": 741 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015364289369953967, + "loss": 0.7673, + "step": 742 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001535174166799148, + "loss": 0.8066, + "step": 743 + }, + { + "epoch": 1.7, + "learning_rate": 0.00015339182146917183, + "loss": 0.8392, + "step": 744 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001532661083446829, + "loss": 0.7949, + "step": 745 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015314027758408044, + "loss": 0.8698, + "step": 746 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015301432946525684, + "loss": 0.7715, + "step": 747 + }, + { + "epoch": 1.71, + "learning_rate": 0.00015288826426636354, + "loss": 0.7583, + "step": 748 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015276208226581064, + "loss": 0.8544, + "step": 749 + }, + { + "epoch": 1.72, + "learning_rate": 0.00015263578374226605, + "loss": 0.8272, + "step": 750 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001525093689746552, + "loss": 0.857, + "step": 751 + }, + { + "epoch": 1.72, + 
"learning_rate": 0.00015238283824216015, + "loss": 0.9208, + "step": 752 + }, + { + "epoch": 1.73, + "learning_rate": 0.000152256191824219, + "loss": 0.8626, + "step": 753 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015212943000052545, + "loss": 0.9418, + "step": 754 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015200255305102803, + "loss": 0.8087, + "step": 755 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015187556125592945, + "loss": 0.7913, + "step": 756 + }, + { + "epoch": 1.73, + "learning_rate": 0.00015174845489568622, + "loss": 0.8973, + "step": 757 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015162123425100762, + "loss": 0.701, + "step": 758 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015149389960285558, + "loss": 0.898, + "step": 759 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015136645123244366, + "loss": 0.8809, + "step": 760 + }, + { + "epoch": 1.74, + "learning_rate": 0.00015123888942123652, + "loss": 0.7334, + "step": 761 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001511112144509495, + "loss": 0.8506, + "step": 762 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015098342660354775, + "loss": 0.8469, + "step": 763 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001508555261612457, + "loss": 1.0353, + "step": 764 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001507275134065065, + "loss": 0.6269, + "step": 765 + }, + { + "epoch": 1.75, + "learning_rate": 0.00015059938862204127, + "loss": 0.7825, + "step": 766 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001504711520908086, + "loss": 0.8388, + "step": 767 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015034280409601385, + "loss": 0.7383, + "step": 768 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015021434492110852, + "loss": 0.8029, + "step": 769 + }, + { + "epoch": 1.76, + "learning_rate": 0.00015008577484978966, + "loss": 0.6527, + "step": 770 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014995709416599926, + "loss": 0.9434, + "step": 771 + }, + { + 
"epoch": 1.77, + "learning_rate": 0.00014982830315392358, + "loss": 0.753, + "step": 772 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014969940209799248, + "loss": 0.8143, + "step": 773 + }, + { + "epoch": 1.77, + "learning_rate": 0.00014957039128287892, + "loss": 0.8939, + "step": 774 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001494412709934982, + "loss": 0.9265, + "step": 775 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014931204151500747, + "loss": 0.8261, + "step": 776 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014918270313280495, + "loss": 0.8555, + "step": 777 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014905325613252937, + "loss": 0.8191, + "step": 778 + }, + { + "epoch": 1.78, + "learning_rate": 0.00014892370080005936, + "loss": 0.9159, + "step": 779 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014879403742151283, + "loss": 0.7936, + "step": 780 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014866426628324625, + "loss": 0.8782, + "step": 781 + }, + { + "epoch": 1.79, + "learning_rate": 0.00014853438767185412, + "loss": 0.6078, + "step": 782 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001484044018741682, + "loss": 0.7182, + "step": 783 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014827430917725712, + "loss": 0.7528, + "step": 784 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014814410986842543, + "loss": 0.902, + "step": 785 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014801380423521324, + "loss": 0.8765, + "step": 786 + }, + { + "epoch": 1.8, + "learning_rate": 0.00014788339256539544, + "loss": 0.6332, + "step": 787 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014775287514698105, + "loss": 0.7258, + "step": 788 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014762225226821273, + "loss": 0.7754, + "step": 789 + }, + { + "epoch": 1.81, + "learning_rate": 0.00014749152421756595, + "loss": 0.7039, + "step": 790 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001473606912837485, + "loss": 0.8563, + "step": 791 
+ }, + { + "epoch": 1.81, + "learning_rate": 0.00014722975375569978, + "loss": 0.8956, + "step": 792 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014709871192259026, + "loss": 0.8724, + "step": 793 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001469675660738206, + "loss": 0.8885, + "step": 794 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014683631649902132, + "loss": 0.7637, + "step": 795 + }, + { + "epoch": 1.82, + "learning_rate": 0.00014670496348805195, + "loss": 0.7596, + "step": 796 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014657350733100047, + "loss": 0.8221, + "step": 797 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014644194831818266, + "loss": 0.8475, + "step": 798 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014631028674014142, + "loss": 0.7966, + "step": 799 + }, + { + "epoch": 1.83, + "learning_rate": 0.00014617852288764625, + "loss": 0.9186, + "step": 800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014604665705169237, + "loss": 0.9027, + "step": 801 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001459146895235004, + "loss": 0.9357, + "step": 802 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014578262059451537, + "loss": 0.9202, + "step": 803 + }, + { + "epoch": 1.84, + "learning_rate": 0.00014565045055640638, + "loss": 0.9226, + "step": 804 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001455181797010658, + "loss": 0.8416, + "step": 805 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001453858083206086, + "loss": 0.8192, + "step": 806 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001452533367073718, + "loss": 0.8309, + "step": 807 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014512076515391375, + "loss": 0.7646, + "step": 808 + }, + { + "epoch": 1.85, + "learning_rate": 0.00014498809395301356, + "loss": 0.9335, + "step": 809 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014485532339767037, + "loss": 0.9696, + "step": 810 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014472245378110277, + "loss": 0.7, + 
"step": 811 + }, + { + "epoch": 1.86, + "learning_rate": 0.000144589485396748, + "loss": 0.8206, + "step": 812 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001444564185382617, + "loss": 0.7417, + "step": 813 + }, + { + "epoch": 1.86, + "learning_rate": 0.00014432325349951667, + "loss": 0.6384, + "step": 814 + }, + { + "epoch": 1.87, + "learning_rate": 0.00014418999057460276, + "loss": 0.7801, + "step": 815 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001440566300578259, + "loss": 0.8459, + "step": 816 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001439231722437075, + "loss": 0.8863, + "step": 817 + }, + { + "epoch": 1.87, + "learning_rate": 0.000143789617426984, + "loss": 0.8502, + "step": 818 + }, + { + "epoch": 1.88, + "learning_rate": 0.000143655965902606, + "loss": 0.8522, + "step": 819 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014352221796573757, + "loss": 0.8612, + "step": 820 + }, + { + "epoch": 1.88, + "learning_rate": 0.00014338837391175582, + "loss": 0.8065, + "step": 821 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001432544340362501, + "loss": 0.8777, + "step": 822 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014312039863502145, + "loss": 0.7731, + "step": 823 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014298626800408166, + "loss": 0.8791, + "step": 824 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014285204243965306, + "loss": 0.9095, + "step": 825 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014271772223816757, + "loss": 0.8846, + "step": 826 + }, + { + "epoch": 1.89, + "learning_rate": 0.00014258330769626606, + "loss": 0.701, + "step": 827 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014244879911079779, + "loss": 0.7598, + "step": 828 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014231419677881966, + "loss": 1.0411, + "step": 829 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014217950099759569, + "loss": 0.6915, + "step": 830 + }, + { + "epoch": 1.9, + "learning_rate": 0.00014204471206459628, + "loss": 
0.8048, + "step": 831 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001419098302774974, + "loss": 0.7688, + "step": 832 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014177485593418028, + "loss": 0.7863, + "step": 833 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001416397893327304, + "loss": 0.7627, + "step": 834 + }, + { + "epoch": 1.91, + "learning_rate": 0.00014150463077143712, + "loss": 0.7423, + "step": 835 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014136938054879283, + "loss": 0.7236, + "step": 836 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014123403896349227, + "loss": 0.8978, + "step": 837 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014109860631443213, + "loss": 0.9403, + "step": 838 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014096308290071003, + "loss": 0.7267, + "step": 839 + }, + { + "epoch": 1.92, + "learning_rate": 0.00014082746902162414, + "loss": 0.7905, + "step": 840 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014069176497667242, + "loss": 0.8848, + "step": 841 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014055597106555192, + "loss": 0.9057, + "step": 842 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014042008758815818, + "loss": 0.7363, + "step": 843 + }, + { + "epoch": 1.93, + "learning_rate": 0.00014028411484458454, + "loss": 0.8193, + "step": 844 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014014805313512145, + "loss": 0.7387, + "step": 845 + }, + { + "epoch": 1.94, + "learning_rate": 0.00014001190276025593, + "loss": 0.8871, + "step": 846 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001398756640206707, + "loss": 0.7342, + "step": 847 + }, + { + "epoch": 1.94, + "learning_rate": 0.00013973933721724363, + "loss": 0.8557, + "step": 848 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001396029226510472, + "loss": 0.8778, + "step": 849 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013946642062334766, + "loss": 0.7844, + "step": 850 + }, + { + "epoch": 1.95, + "learning_rate": 
0.00013932983143560433, + "loss": 0.7941, + "step": 851 + }, + { + "epoch": 1.95, + "learning_rate": 0.00013919315538946905, + "loss": 0.7505, + "step": 852 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001390563927867856, + "loss": 0.8371, + "step": 853 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013891954392958878, + "loss": 0.8128, + "step": 854 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001387826091201039, + "loss": 0.7127, + "step": 855 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013864558866074622, + "loss": 0.8165, + "step": 856 + }, + { + "epoch": 1.96, + "learning_rate": 0.00013850848285411994, + "loss": 0.7103, + "step": 857 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013837129200301794, + "loss": 0.8373, + "step": 858 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013823401641042084, + "loss": 0.6908, + "step": 859 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013809665637949637, + "loss": 0.7358, + "step": 860 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013795921221359877, + "loss": 0.7545, + "step": 861 + }, + { + "epoch": 1.97, + "learning_rate": 0.00013782168421626816, + "loss": 0.7681, + "step": 862 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013768407269122967, + "loss": 1.026, + "step": 863 + }, + { + "epoch": 1.98, + "learning_rate": 0.000137546377942393, + "loss": 0.761, + "step": 864 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001374086002738516, + "loss": 0.8442, + "step": 865 + }, + { + "epoch": 1.98, + "learning_rate": 0.00013727073998988202, + "loss": 0.7959, + "step": 866 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013713279739494333, + "loss": 0.8061, + "step": 867 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013699477279367636, + "loss": 0.7434, + "step": 868 + }, + { + "epoch": 1.99, + "learning_rate": 0.000136856666490903, + "loss": 0.7159, + "step": 869 + }, + { + "epoch": 1.99, + "learning_rate": 0.00013671847879162562, + "loss": 0.867, + "step": 870 + }, + { + "epoch": 2.0, + 
"learning_rate": 0.00013658021000102636, + "loss": 0.9237, + "step": 871 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001364418604244664, + "loss": 0.8545, + "step": 872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00013630343036748535, + "loss": 0.893, + "step": 873 + } + ], + "max_steps": 2180, + "num_train_epochs": 5, + "total_flos": 236331181277184.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-873/training_args.bin b/checkpoint-873/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4aa0907a784d65549a9c45257c4d455176479607 --- /dev/null +++ b/checkpoint-873/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adff180a74f6fc1e6a420417eadde6ef8ff75561e442f481bfe772c93f46e2ae +size 6011 diff --git a/checkpoint-873/zero_to_fp32.py b/checkpoint-873/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-873/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint 
    folder (where the optimizer files are)

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # Dispatch on the detected ZeRO stage: stage 1/2 checkpoints share one
    # reconstruction path, stage 3 needs per-param re-partitioning.
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Add frozen (non-trainable) params to ``state_dict`` for a ZeRO-1/2 checkpoint.

    Rank 0's stored fragments are used as-is — no cross-rank merging is
    performed here. No-op when the checkpoint recorded no frozen params.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Fragment is already whole for this rank; copy it straight through.
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Add trainable fp32 params to ``state_dict`` for a ZeRO-1/2 checkpoint.

    For each param group, concatenates the per-rank flat fp32 partitions into
    one flat vector, then slices each param out of it by its recorded shape,
    tracking an ``offset`` cursor into the flat vector.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Build the consolidated fp32 ``state_dict`` for a ZeRO-1/2 checkpoint.

    Order: buffers first, then frozen params, then reconstructed trainable
    params, finally re-link shared params (aliases) recorded by rank 0.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for one ZeRO-3 param.

    ``partitioned_numel`` is the per-rank slice size (ceil division) and
    ``padding_numel`` is the tail padding that makes the param split evenly
    across ``world_size`` ranks (0 when it already divides evenly).
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Add frozen (non-trainable) params to ``state_dict`` for a ZeRO-3 checkpoint.

    Unlike ZeRO-2, frozen fragments are partitioned across ranks here: each
    param is rebuilt by concatenating every rank's fragment and trimming the
    alignment padding. No-op when no frozen params were recorded.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Concatenate all ranks' fragments, drop trailing padding, restore shape.
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Add trainable fp32 params to ``state_dict`` for a ZeRO-3 checkpoint.

    Each rank holds one flat fp32 partition. Walks the ordered param shapes,
    gathers the per-rank slice for each param from every rank's flat vector at
    a shared ``offset``, concatenates them, trims padding and restores shape.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        avail_numel = fp32_flat_groups[0].numel() * world_size
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """Build the consolidated fp32 ``state_dict`` for a ZeRO-3 checkpoint.

    Order: buffers first, then frozen params, then reconstructed trainable
    params, finally re-link shared params (aliases) recorded by rank 0.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    # Resolve the tag from the 'latest' marker file when not given explicitly.
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """

    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: the consolidated dict may omit keys the live model has
    # (and vice versa) depending on what the checkpoint recorded.
    model.load_state_dict(state_dict, strict=False)

    return model


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # Module-level flag read by the merge helpers above.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)