Uploaded checkpoint-25000

- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:58ea76fcf16a2912a570cf295dd1757cd9562cb7f7f8e74d37938855d31dc866
 size 119975656
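
The weight files in this commit are stored with Git LFS, so the diff above shows only the three-line pointer file (LFS spec version, sha256 oid of the binary blob, and its size in bytes) rather than the binary itself; the old oid is truncated in this view. A minimal sketch of verifying a downloaded copy of the file against the new pointer, assuming the file sits in the current directory:

import hashlib
import os

# Values copied from the new LFS pointer above.
EXPECTED_OID = "58ea76fcf16a2912a570cf295dd1757cd9562cb7f7f8e74d37938855d31dc866"
EXPECTED_SIZE = 119975656

path = "adapter_model.safetensors"  # assumes the file was downloaded here
assert os.path.getsize(path) == EXPECTED_SIZE, "size does not match pointer"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "sha256 does not match pointer"
print("file matches the LFS pointer")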
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:231c6b24970ef58291d1980aeb742ace763101289d628ec3f4ac808335924d18
 size 240145026
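
The optimizer state is roughly twice the size of the adapter weights, which would be consistent with an Adam-style optimizer keeping two fp32 moment tensors per fp32 adapter parameter; this is an inference from the byte counts, not something the commit states:

# Size sanity check (inference only): Adam tracks two moment tensors per
# parameter, so its state should be about 2x the fp32 weights it optimizes.
adapter_bytes = 119_975_656    # adapter_model.safetensors, from its pointer above
optimizer_bytes = 240_145_026  # optimizer.pt, from the pointer above
print(optimizer_bytes / adapter_bytes)  # ~2.0, the remainder being serialization overhead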
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c2bd031f30ceb89483d2d8b5eb187850133dcc5a689162e8975b2cc0e61b4001
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e3bdbaa37c77733a3ea9eb90a36bc290f4f5b9f56abe23cc6586cbaa459f92c6
 size 1064
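
Together, optimizer.pt, scheduler.pt and rng_state.pth are what allow training to continue from this exact point: when the Hugging Face Trainer is given resume_from_checkpoint, it reloads them along with trainer_state.json. A minimal sketch, assuming a model and datasets configured the same way as the original run (the placeholder names below are not part of this commit):

from transformers import Trainer, TrainingArguments

# Hyperparameters mirror the values recorded in trainer_state.json below.
args = TrainingArguments(
    output_dir="runs/deepseek_lora_20240422-165831",
    per_device_train_batch_size=1,   # "train_batch_size": 1
    num_train_epochs=1,              # "num_train_epochs": 1
    save_steps=2500,                 # "save_steps": 2500
    evaluation_strategy="steps",
    eval_steps=500,                  # "eval_steps": 500
    logging_steps=10,                # "logging_steps": 10
)
# model, train_ds and eval_ds are assumed placeholders for the run's setup.
trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=eval_ds)
# Restores optimizer.pt, scheduler.pt, rng_state.pth and trainer_state.json,
# then continues from global step 25000.
trainer.train(resume_from_checkpoint="runs/deepseek_lora_20240422-165831/checkpoint-25000")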
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 1.
-  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-
-  "epoch": 0.
+  "best_metric": 1.3409814834594727,
+  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-25000",
+  "epoch": 0.625,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 25000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -16117,6 +16117,1796 @@
       "eval_samples_per_second": 15.104,
       "eval_steps_per_second": 15.104,
       "step": 22500
+    },
+    {"epoch": 0.56, "grad_norm": 3.232191562652588, "learning_rate": 5.077966101694915e-06, "loss": 1.3644, "step": 22510},
+    {"epoch": 0.56, "grad_norm": 5.3384528160095215, "learning_rate": 5.071186440677967e-06, "loss": 1.3093, "step": 22520},
+    {"epoch": 0.56, "grad_norm": 6.911621570587158, "learning_rate": 5.064406779661017e-06, "loss": 1.2571, "step": 22530},
+    {"epoch": 0.56, "grad_norm": 6.84785270690918, "learning_rate": 5.057627118644069e-06, "loss": 1.324, "step": 22540},
+    {"epoch": 0.56, "grad_norm": 4.634193420410156, "learning_rate": 5.050847457627119e-06, "loss": 1.4672, "step": 22550},
+    {"epoch": 0.56, "grad_norm": 3.4189460277557373, "learning_rate": 5.0440677966101705e-06, "loss": 1.325, "step": 22560},
+    {"epoch": 0.56, "grad_norm": 7.2140793800354, "learning_rate": 5.037288135593221e-06, "loss": 1.3923, "step": 22570},
+    {"epoch": 0.56, "grad_norm": 3.916853189468384, "learning_rate": 5.030508474576271e-06, "loss": 1.3431, "step": 22580},
+    {"epoch": 0.56, "grad_norm": 6.434349536895752, "learning_rate": 5.023728813559322e-06, "loss": 1.4615, "step": 22590},
+    {"epoch": 0.56, "grad_norm": 6.851098537445068, "learning_rate": 5.016949152542373e-06, "loss": 1.3013, "step": 22600},
+    {"epoch": 0.57, "grad_norm": 7.541562080383301, "learning_rate": 5.010169491525424e-06, "loss": 1.3696, "step": 22610},
+    {"epoch": 0.57, "grad_norm": 4.100308895111084, "learning_rate": 5.003389830508475e-06, "loss": 1.3802, "step": 22620},
+    {"epoch": 0.57, "grad_norm": 12.694953918457031, "learning_rate": 4.996610169491526e-06, "loss": 1.2328, "step": 22630},
+    {"epoch": 0.57, "grad_norm": 12.468879699707031, "learning_rate": 4.989830508474577e-06, "loss": 1.3995, "step": 22640},
+    {"epoch": 0.57, "grad_norm": 3.142810821533203, "learning_rate": 4.983050847457628e-06, "loss": 1.2403, "step": 22650},
+    {"epoch": 0.57, "grad_norm": 7.71440315246582, "learning_rate": 4.976271186440678e-06, "loss": 1.2902, "step": 22660},
+    {"epoch": 0.57, "grad_norm": 4.5355634689331055, "learning_rate": 4.969491525423729e-06, "loss": 1.3243, "step": 22670},
+    {"epoch": 0.57, "grad_norm": 1.5578103065490723, "learning_rate": 4.96271186440678e-06, "loss": 1.2796, "step": 22680},
+    {"epoch": 0.57, "grad_norm": 4.997007846832275, "learning_rate": 4.955932203389831e-06, "loss": 1.1288, "step": 22690},
+    {"epoch": 0.57, "grad_norm": 4.240728855133057, "learning_rate": 4.949152542372882e-06, "loss": 1.408, "step": 22700},
+    {"epoch": 0.57, "grad_norm": 7.517406940460205, "learning_rate": 4.942372881355932e-06, "loss": 1.3879, "step": 22710},
+    {"epoch": 0.57, "grad_norm": 1.1581978797912598, "learning_rate": 4.935593220338984e-06, "loss": 1.3547, "step": 22720},
+    {"epoch": 0.57, "grad_norm": 5.073269367218018, "learning_rate": 4.928813559322034e-06, "loss": 1.3664, "step": 22730},
+    {"epoch": 0.57, "grad_norm": 3.7317769527435303, "learning_rate": 4.922033898305086e-06, "loss": 1.3942, "step": 22740},
+    {"epoch": 0.57, "grad_norm": 4.843672275543213, "learning_rate": 4.915254237288136e-06, "loss": 1.3723, "step": 22750},
+    {"epoch": 0.57, "grad_norm": 7.751224040985107, "learning_rate": 4.908474576271187e-06, "loss": 1.2057, "step": 22760},
+    {"epoch": 0.57, "grad_norm": 2.4519495964050293, "learning_rate": 4.901694915254237e-06, "loss": 1.3836, "step": 22770},
+    {"epoch": 0.57, "grad_norm": 8.7233304977417, "learning_rate": 4.894915254237289e-06, "loss": 1.3842, "step": 22780},
+    {"epoch": 0.57, "grad_norm": 2.717367172241211, "learning_rate": 4.888135593220339e-06, "loss": 1.4519, "step": 22790},
+    {"epoch": 0.57, "grad_norm": 4.797736167907715, "learning_rate": 4.881355932203391e-06, "loss": 1.1785, "step": 22800},
+    {"epoch": 0.57, "grad_norm": 11.28987979888916, "learning_rate": 4.874576271186441e-06, "loss": 1.3058, "step": 22810},
+    {"epoch": 0.57, "grad_norm": 5.097863674163818, "learning_rate": 4.867796610169492e-06, "loss": 1.232, "step": 22820},
+    {"epoch": 0.57, "grad_norm": 5.479716777801514, "learning_rate": 4.861016949152543e-06, "loss": 1.4677, "step": 22830},
+    {"epoch": 0.57, "grad_norm": 11.921456336975098, "learning_rate": 4.854237288135594e-06, "loss": 1.5442, "step": 22840},
+    {"epoch": 0.57, "grad_norm": 9.934676170349121, "learning_rate": 4.847457627118645e-06, "loss": 1.4147, "step": 22850},
+    {"epoch": 0.57, "grad_norm": 3.3140487670898438, "learning_rate": 4.840677966101695e-06, "loss": 1.2146, "step": 22860},
+    {"epoch": 0.57, "grad_norm": 2.1303317546844482, "learning_rate": 4.833898305084746e-06, "loss": 1.3021, "step": 22870},
+    {"epoch": 0.57, "grad_norm": 3.943474769592285, "learning_rate": 4.827118644067797e-06, "loss": 1.2028, "step": 22880},
+    {"epoch": 0.57, "grad_norm": 5.444009780883789, "learning_rate": 4.820338983050848e-06, "loss": 1.3873, "step": 22890},
+    {"epoch": 0.57, "grad_norm": 5.230558395385742, "learning_rate": 4.813559322033899e-06, "loss": 1.2907, "step": 22900},
+    {"epoch": 0.57, "grad_norm": 2.652158498764038, "learning_rate": 4.80677966101695e-06, "loss": 1.27, "step": 22910},
+    {"epoch": 0.57, "grad_norm": 5.423113822937012, "learning_rate": 4.800000000000001e-06, "loss": 1.3488, "step": 22920},
+    {"epoch": 0.57, "grad_norm": 6.731577396392822, "learning_rate": 4.793220338983051e-06, "loss": 1.3411, "step": 22930},
+    {"epoch": 0.57, "grad_norm": 6.142993927001953, "learning_rate": 4.786440677966102e-06, "loss": 1.4032, "step": 22940},
+    {"epoch": 0.57, "grad_norm": 7.845600605010986, "learning_rate": 4.779661016949153e-06, "loss": 1.3567, "step": 22950},
+    {"epoch": 0.57, "grad_norm": 3.123938798904419, "learning_rate": 4.772881355932204e-06, "loss": 1.2106, "step": 22960},
+    {"epoch": 0.57, "grad_norm": 4.684544086456299, "learning_rate": 4.766101694915254e-06, "loss": 1.3407, "step": 22970},
+    {"epoch": 0.57, "grad_norm": 11.232462882995605, "learning_rate": 4.759322033898306e-06, "loss": 1.136, "step": 22980},
+    {"epoch": 0.57, "grad_norm": 3.3728113174438477, "learning_rate": 4.752542372881356e-06, "loss": 1.199, "step": 22990},
+    {"epoch": 0.57, "grad_norm": 3.5851376056671143, "learning_rate": 4.745762711864408e-06, "loss": 1.463, "step": 23000},
+    {"epoch": 0.57, "eval_loss": 1.3161565065383911, "eval_runtime": 66.103, "eval_samples_per_second": 15.128, "eval_steps_per_second": 15.128, "step": 23000},
+    {"epoch": 0.58, "grad_norm": 5.720829010009766, "learning_rate": 4.738983050847458e-06, "loss": 1.4393, "step": 23010},
+    {"epoch": 0.58, "grad_norm": 11.663517951965332, "learning_rate": 4.732203389830509e-06, "loss": 1.3158, "step": 23020},
+    {"epoch": 0.58, "grad_norm": 10.158949851989746, "learning_rate": 4.725423728813559e-06, "loss": 1.4086, "step": 23030},
+    {"epoch": 0.58, "grad_norm": 6.925542831420898, "learning_rate": 4.718644067796611e-06, "loss": 1.3655, "step": 23040},
+    {"epoch": 0.58, "grad_norm": 1.4193698167800903, "learning_rate": 4.711864406779661e-06, "loss": 1.2348, "step": 23050},
+    {"epoch": 0.58, "grad_norm": 6.884500980377197, "learning_rate": 4.705084745762713e-06, "loss": 1.4053, "step": 23060},
+    {"epoch": 0.58, "grad_norm": 4.412232875823975, "learning_rate": 4.698305084745763e-06, "loss": 1.2476, "step": 23070},
+    {"epoch": 0.58, "grad_norm": 2.403428077697754, "learning_rate": 4.691525423728814e-06, "loss": 1.4168, "step": 23080},
+    {"epoch": 0.58, "grad_norm": 7.022388458251953, "learning_rate": 4.684745762711865e-06, "loss": 1.359, "step": 23090},
+    {"epoch": 0.58, "grad_norm": 5.728121280670166, "learning_rate": 4.677966101694916e-06, "loss": 1.3916, "step": 23100},
+    {"epoch": 0.58, "grad_norm": 14.995932579040527, "learning_rate": 4.671186440677967e-06, "loss": 1.4541, "step": 23110},
+    {"epoch": 0.58, "grad_norm": 12.448729515075684, "learning_rate": 4.664406779661017e-06, "loss": 1.2989, "step": 23120},
+    {"epoch": 0.58, "grad_norm": 2.5807507038116455, "learning_rate": 4.657627118644068e-06, "loss": 1.4881, "step": 23130},
+    {"epoch": 0.58, "grad_norm": 4.670041561126709, "learning_rate": 4.650847457627119e-06, "loss": 1.4595, "step": 23140},
+    {"epoch": 0.58, "grad_norm": 3.8450145721435547, "learning_rate": 4.64406779661017e-06, "loss": 1.245, "step": 23150},
+    {"epoch": 0.58, "grad_norm": 15.546969413757324, "learning_rate": 4.637288135593221e-06, "loss": 1.2199, "step": 23160},
+    {"epoch": 0.58, "grad_norm": 8.563859939575195, "learning_rate": 4.630508474576272e-06, "loss": 1.3752, "step": 23170},
+    {"epoch": 0.58, "grad_norm": 8.742653846740723, "learning_rate": 4.623728813559323e-06, "loss": 1.2802, "step": 23180},
+    {"epoch": 0.58, "grad_norm": 6.253279685974121, "learning_rate": 4.616949152542373e-06, "loss": 1.4559, "step": 23190},
+    {"epoch": 0.58, "grad_norm": 5.3615827560424805, "learning_rate": 4.610169491525424e-06, "loss": 1.1649, "step": 23200},
+    {"epoch": 0.58, "grad_norm": 9.165109634399414, "learning_rate": 4.603389830508475e-06, "loss": 1.4435, "step": 23210},
+    {"epoch": 0.58, "grad_norm": 6.625391483306885, "learning_rate": 4.596610169491526e-06, "loss": 1.2911, "step": 23220},
+    {"epoch": 0.58, "grad_norm": 3.898466110229492, "learning_rate": 4.589830508474576e-06, "loss": 1.2653, "step": 23230},
+    {"epoch": 0.58, "grad_norm": 1.5483791828155518, "learning_rate": 4.583050847457628e-06, "loss": 1.1665, "step": 23240},
+    {"epoch": 0.58, "grad_norm": 4.248947620391846, "learning_rate": 4.576271186440678e-06, "loss": 1.2275, "step": 23250},
+    {"epoch": 0.58, "grad_norm": 3.522451639175415, "learning_rate": 4.56949152542373e-06, "loss": 1.3427, "step": 23260},
+    {"epoch": 0.58, "grad_norm": 5.79518461227417, "learning_rate": 4.56271186440678e-06, "loss": 1.2545, "step": 23270},
+    {"epoch": 0.58, "grad_norm": 7.211407661437988, "learning_rate": 4.555932203389831e-06, "loss": 1.3256, "step": 23280},
+    {"epoch": 0.58, "grad_norm": 2.218186855316162, "learning_rate": 4.549152542372881e-06, "loss": 1.327, "step": 23290},
+    {"epoch": 0.58, "grad_norm": 3.0725247859954834, "learning_rate": 4.542372881355933e-06, "loss": 1.1152, "step": 23300},
+    {"epoch": 0.58, "grad_norm": 10.184927940368652, "learning_rate": 4.535593220338983e-06, "loss": 1.3153, "step": 23310},
+    {"epoch": 0.58, "grad_norm": 6.575405597686768, "learning_rate": 4.528813559322035e-06, "loss": 1.2879, "step": 23320},
+    {"epoch": 0.58, "grad_norm": 5.2145094871521, "learning_rate": 4.522033898305085e-06, "loss": 1.2836, "step": 23330},
+    {"epoch": 0.58, "grad_norm": 16.586687088012695, "learning_rate": 4.515254237288136e-06, "loss": 1.245, "step": 23340},
+    {"epoch": 0.58, "grad_norm": 5.264496803283691, "learning_rate": 4.508474576271187e-06, "loss": 1.1731, "step": 23350},
+    {"epoch": 0.58, "grad_norm": 5.267670631408691, "learning_rate": 4.501694915254238e-06, "loss": 1.3662, "step": 23360},
+    {"epoch": 0.58, "grad_norm": 7.335075855255127, "learning_rate": 4.494915254237289e-06, "loss": 1.3261, "step": 23370},
+    {"epoch": 0.58, "grad_norm": 7.068376064300537, "learning_rate": 4.488135593220339e-06, "loss": 1.2805, "step": 23380},
+    {"epoch": 0.58, "grad_norm": 15.14303207397461, "learning_rate": 4.48135593220339e-06, "loss": 1.3605, "step": 23390},
+    {"epoch": 0.58, "grad_norm": 12.552229881286621, "learning_rate": 4.474576271186441e-06, "loss": 1.4804, "step": 23400},
+    {"epoch": 0.59, "grad_norm": 6.760104656219482, "learning_rate": 4.467796610169492e-06, "loss": 1.2604, "step": 23410},
+    {"epoch": 0.59, "grad_norm": 6.444414138793945, "learning_rate": 4.461016949152543e-06, "loss": 1.3677, "step": 23420},
+    {"epoch": 0.59, "grad_norm": 2.223396062850952, "learning_rate": 4.454237288135594e-06, "loss": 1.4585, "step": 23430},
+    {"epoch": 0.59, "grad_norm": 3.0469980239868164, "learning_rate": 4.447457627118645e-06, "loss": 1.2125, "step": 23440},
+    {"epoch": 0.59, "grad_norm": 9.140281677246094, "learning_rate": 4.440677966101695e-06, "loss": 1.2847, "step": 23450},
+    {"epoch": 0.59, "grad_norm": 10.596829414367676, "learning_rate": 4.433898305084746e-06, "loss": 1.1919, "step": 23460},
+    {"epoch": 0.59, "grad_norm": 6.769688129425049, "learning_rate": 4.427118644067797e-06, "loss": 1.3911, "step": 23470},
+    {"epoch": 0.59, "grad_norm": 8.3526029586792, "learning_rate": 4.420338983050848e-06, "loss": 1.3243, "step": 23480},
+    {"epoch": 0.59, "grad_norm": 1.516774296760559, "learning_rate": 4.413559322033898e-06, "loss": 1.4328, "step": 23490},
+    {"epoch": 0.59, "grad_norm": 5.758790493011475, "learning_rate": 4.40677966101695e-06, "loss": 1.1158, "step": 23500},
+    {"epoch": 0.59, "eval_loss": 1.3372619152069092, "eval_runtime": 66.1468, "eval_samples_per_second": 15.118, "eval_steps_per_second": 15.118, "step": 23500},
+    {"epoch": 0.59, "grad_norm": 10.471700668334961, "learning_rate": 4.4e-06, "loss": 1.1029, "step": 23510},
+    {"epoch": 0.59, "grad_norm": 6.78934383392334, "learning_rate": 4.393220338983052e-06, "loss": 1.2838, "step": 23520},
+    {"epoch": 0.59, "grad_norm": 4.890566825866699, "learning_rate": 4.386440677966102e-06, "loss": 1.2861, "step": 23530},
+    {"epoch": 0.59, "grad_norm": 9.901065826416016, "learning_rate": 4.379661016949153e-06, "loss": 1.3043, "step": 23540},
+    {"epoch": 0.59, "grad_norm": 3.332019805908203, "learning_rate": 4.372881355932203e-06, "loss": 1.5098, "step": 23550},
+    {"epoch": 0.59, "grad_norm": 9.1102876663208, "learning_rate": 4.366101694915255e-06, "loss": 1.2222, "step": 23560},
+    {"epoch": 0.59, "grad_norm": 2.800964832305908, "learning_rate": 4.359322033898305e-06, "loss": 1.1916, "step": 23570},
+    {"epoch": 0.59, "grad_norm": 4.45274019241333, "learning_rate": 4.352542372881357e-06, "loss": 1.496, "step": 23580},
+    {"epoch": 0.59, "grad_norm": 7.7979350090026855, "learning_rate": 4.345762711864407e-06, "loss": 1.3483, "step": 23590},
+    {"epoch": 0.59, "grad_norm": 5.517279148101807, "learning_rate": 4.338983050847458e-06, "loss": 1.1602, "step": 23600},
+    {"epoch": 0.59, "grad_norm": 8.224603652954102, "learning_rate": 4.332203389830509e-06, "loss": 1.2216, "step": 23610},
+    {"epoch": 0.59, "grad_norm": 3.9079153537750244, "learning_rate": 4.32542372881356e-06, "loss": 1.4094, "step": 23620},
+    {"epoch": 0.59, "grad_norm": 7.209962844848633, "learning_rate": 4.318644067796611e-06, "loss": 1.4033, "step": 23630},
+    {"epoch": 0.59, "grad_norm": 6.915498733520508, "learning_rate": 4.311864406779661e-06, "loss": 1.3988, "step": 23640},
+    {"epoch": 0.59, "grad_norm": 6.8702778816223145, "learning_rate": 4.305084745762712e-06, "loss": 1.3308, "step": 23650},
+    {"epoch": 0.59, "grad_norm": 6.673946380615234, "learning_rate": 4.298305084745763e-06, "loss": 1.2756, "step": 23660},
+    {"epoch": 0.59, "grad_norm": 2.729367971420288, "learning_rate": 4.291525423728814e-06, "loss": 1.2702, "step": 23670},
+    {"epoch": 0.59, "grad_norm": 4.333055019378662, "learning_rate": 4.284745762711865e-06, "loss": 1.3365, "step": 23680},
+    {"epoch": 0.59, "grad_norm": 8.36184024810791, "learning_rate": 4.277966101694915e-06, "loss": 1.2783, "step": 23690},
+    {"epoch": 0.59, "grad_norm": 4.62699031829834, "learning_rate": 4.271186440677967e-06, "loss": 1.4385, "step": 23700},
+    {"epoch": 0.59, "grad_norm": 3.193026304244995, "learning_rate": 4.264406779661017e-06, "loss": 1.4843, "step": 23710},
+    {"epoch": 0.59, "grad_norm": 8.289533615112305, "learning_rate": 4.257627118644068e-06, "loss": 1.5263, "step": 23720},
+    {"epoch": 0.59, "grad_norm": 3.887775182723999, "learning_rate": 4.250847457627119e-06, "loss": 1.228, "step": 23730},
+    {"epoch": 0.59, "grad_norm": 10.728804588317871, "learning_rate": 4.24406779661017e-06, "loss": 1.4097, "step": 23740},
+    {"epoch": 0.59, "grad_norm": 5.405580997467041, "learning_rate": 4.23728813559322e-06, "loss": 1.227, "step": 23750},
+    {"epoch": 0.59, "grad_norm": 2.104985237121582, "learning_rate": 4.230508474576272e-06, "loss": 1.3258, "step": 23760},
+    {"epoch": 0.59, "grad_norm": 11.678805351257324, "learning_rate": 4.223728813559322e-06, "loss": 1.1797, "step": 23770},
+    {"epoch": 0.59, "grad_norm": 12.024051666259766, "learning_rate": 4.216949152542374e-06, "loss": 1.3278, "step": 23780},
+    {"epoch": 0.59, "grad_norm": 12.879485130310059, "learning_rate": 4.210169491525424e-06, "loss": 1.2552, "step": 23790},
+    {"epoch": 0.59, "grad_norm": 6.001992702484131, "learning_rate": 4.203389830508475e-06, "loss": 1.4639, "step": 23800},
+    {"epoch": 0.6, "grad_norm": 7.713657855987549, "learning_rate": 4.196610169491525e-06, "loss": 1.3542, "step": 23810},
+    {"epoch": 0.6, "grad_norm": 25.137435913085938, "learning_rate": 4.189830508474577e-06, "loss": 1.2694, "step": 23820},
+    {"epoch": 0.6, "grad_norm": 13.080780029296875, "learning_rate": 4.183050847457627e-06, "loss": 1.5512, "step": 23830},
+    {"epoch": 0.6, "grad_norm": 3.648967981338501, "learning_rate": 4.176271186440679e-06, "loss": 1.4919, "step": 23840},
+    {"epoch": 0.6, "grad_norm": 2.8366498947143555, "learning_rate": 4.169491525423729e-06, "loss": 1.3528, "step": 23850},
+    {"epoch": 0.6, "grad_norm": 2.3198916912078857, "learning_rate": 4.16271186440678e-06, "loss": 1.4445, "step": 23860},
+    {"epoch": 0.6, "grad_norm": 9.170830726623535, "learning_rate": 4.155932203389831e-06, "loss": 1.1801, "step": 23870},
+    {"epoch": 0.6, "grad_norm": 7.985089302062988, "learning_rate": 4.149152542372882e-06, "loss": 1.3195, "step": 23880},
+    {"epoch": 0.6, "grad_norm": 10.688753128051758, "learning_rate": 4.142372881355933e-06, "loss": 1.4527, "step": 23890},
+    {"epoch": 0.6, "grad_norm": 12.181285858154297, "learning_rate": 4.135593220338983e-06, "loss": 1.3875, "step": 23900},
+    {"epoch": 0.6, "grad_norm": 10.353550910949707, "learning_rate": 4.128813559322034e-06, "loss": 1.3937, "step": 23910},
+    {"epoch": 0.6, "grad_norm": 3.3962326049804688, "learning_rate": 4.122033898305085e-06, "loss": 1.2212, "step": 23920},
+    {"epoch": 0.6, "grad_norm": 9.191743850708008, "learning_rate": 4.115254237288136e-06, "loss": 1.3884, "step": 23930},
+    {"epoch": 0.6, "grad_norm": 8.74504566192627, "learning_rate": 4.108474576271187e-06, "loss": 1.369, "step": 23940},
+    {"epoch": 0.6, "grad_norm": 15.484914779663086, "learning_rate": 4.101694915254237e-06, "loss": 1.3607, "step": 23950},
+    {"epoch": 0.6, "grad_norm": 8.069631576538086, "learning_rate": 4.094915254237289e-06, "loss": 1.1674, "step": 23960},
+    {"epoch": 0.6, "grad_norm": 5.688279151916504, "learning_rate": 4.088135593220339e-06, "loss": 1.2652, "step": 23970},
+    {"epoch": 0.6, "grad_norm": 2.326960325241089, "learning_rate": 4.081355932203391e-06, "loss": 1.2149, "step": 23980},
+    {"epoch": 0.6, "grad_norm": 9.749725341796875, "learning_rate": 4.074576271186441e-06, "loss": 1.2378, "step": 23990},
+    {"epoch": 0.6, "grad_norm": 5.552289962768555, "learning_rate": 4.067796610169492e-06, "loss": 1.2699, "step": 24000},
+    {"epoch": 0.6, "eval_loss": 1.3411681652069092, "eval_runtime": 66.1396, "eval_samples_per_second": 15.12, "eval_steps_per_second": 15.12, "step": 24000},
+    {"epoch": 0.6, "grad_norm": 8.768118858337402, "learning_rate": 4.061016949152542e-06, "loss": 1.1869, "step": 24010},
+    {"epoch": 0.6, "grad_norm": 1.0193852186203003, "learning_rate": 4.054237288135594e-06, "loss": 1.1477, "step": 24020},
+    {"epoch": 0.6, "grad_norm": 11.04339599609375, "learning_rate": 4.047457627118644e-06, "loss": 1.2492, "step": 24030},
+    {"epoch": 0.6, "grad_norm": 2.5347607135772705, "learning_rate": 4.040677966101696e-06, "loss": 1.2424, "step": 24040},
+    {"epoch": 0.6, "grad_norm": 5.121871471405029, "learning_rate": 4.033898305084746e-06, "loss": 1.1773, "step": 24050},
+    {"epoch": 0.6, "grad_norm": 8.53433609008789, "learning_rate": 4.027118644067797e-06, "loss": 1.4407, "step": 24060},
+    {"epoch": 0.6, "grad_norm": 11.311376571655273, "learning_rate": 4.020338983050847e-06, "loss": 1.4095, "step": 24070},
+    {"epoch": 0.6, "grad_norm": 2.8956375122070312, "learning_rate": 4.013559322033899e-06, "loss": 1.3076, "step": 24080},
+    {"epoch": 0.6, "grad_norm": 3.6406021118164062, "learning_rate": 4.006779661016949e-06, "loss": 1.1801, "step": 24090},
+    {"epoch": 0.6, "grad_norm": 4.67333459854126, "learning_rate": 4.000000000000001e-06, "loss": 1.3627, "step": 24100},
+    {"epoch": 0.6, "grad_norm": 4.243159294128418, "learning_rate": 3.993220338983051e-06, "loss": 1.2065, "step": 24110},
+    {"epoch": 0.6, "grad_norm": 4.570652484893799, "learning_rate": 3.986440677966102e-06, "loss": 1.4576, "step": 24120},
+    {"epoch": 0.6, "grad_norm": 10.30574893951416, "learning_rate": 3.979661016949153e-06, "loss": 1.3798, "step": 24130},
+    {"epoch": 0.6, "grad_norm": 1.7883845567703247, "learning_rate": 3.972881355932204e-06, "loss": 1.1474, "step": 24140},
+    {"epoch": 0.6, "grad_norm": 2.429614305496216, "learning_rate": 3.966101694915255e-06, "loss": 1.3992, "step": 24150},
+    {"epoch": 0.6, "grad_norm": 5.791226863861084, "learning_rate": 3.959322033898305e-06, "loss": 1.4164, "step": 24160},
+    {"epoch": 0.6, "grad_norm": 6.212001800537109, "learning_rate": 3.952542372881356e-06, "loss": 1.4647, "step": 24170},
+    {"epoch": 0.6, "grad_norm": 4.9569292068481445, "learning_rate": 3.945762711864407e-06, "loss": 1.2948, "step": 24180},
+    {"epoch": 0.6, "grad_norm": 2.2119970321655273, "learning_rate": 3.938983050847458e-06, "loss": 1.3955, "step": 24190},
+    {"epoch": 0.6, "grad_norm": 10.280770301818848, "learning_rate": 3.932203389830509e-06, "loss": 1.4461, "step": 24200},
+    {"epoch": 0.61, "grad_norm": 3.701272487640381, "learning_rate": 3.925423728813559e-06, "loss": 1.348, "step": 24210},
+    {"epoch": 0.61, "grad_norm": 8.827926635742188, "learning_rate": 3.918644067796611e-06, "loss": 1.3398, "step": 24220},
+    {"epoch": 0.61, "grad_norm": 6.997286319732666, "learning_rate": 3.911864406779661e-06, "loss": 1.4724, "step": 24230},
+    {"epoch": 0.61, "grad_norm": 5.5268449783325195, "learning_rate": 3.905084745762713e-06, "loss": 1.388, "step": 24240},
+    {"epoch": 0.61, "grad_norm": 8.842992782592773, "learning_rate": 3.898305084745763e-06, "loss": 1.2382, "step": 24250},
+    {"epoch": 0.61, "grad_norm": 11.24975872039795, "learning_rate": 3.891525423728814e-06, "loss": 1.2194, "step": 24260},
+    {"epoch": 0.61, "grad_norm": 2.875722646713257, "learning_rate": 3.884745762711864e-06, "loss": 1.3792, "step": 24270},
+    {"epoch": 0.61, "grad_norm": 8.459474563598633, "learning_rate": 3.877966101694916e-06, "loss": 1.3021, "step": 24280},
+    {"epoch": 0.61, "grad_norm": 3.315873861312866, "learning_rate": 3.871186440677966e-06, "loss": 1.2976, "step": 24290},
+    {"epoch": 0.61, "grad_norm": 6.280729293823242, "learning_rate": 3.864406779661018e-06, "loss": 1.3294, "step": 24300},
+    {"epoch": 0.61, "grad_norm": 6.004711627960205, "learning_rate": 3.857627118644068e-06, "loss": 1.3178, "step": 24310},
+    {"epoch": 0.61, "grad_norm": 8.207845687866211, "learning_rate": 3.850847457627119e-06, "loss": 1.424, "step": 24320},
+    {"epoch": 0.61, "grad_norm": 8.01065444946289, "learning_rate": 3.844067796610169e-06, "loss": 1.2842, "step": 24330},
+    {"epoch": 0.61, "grad_norm": 9.126721382141113, "learning_rate": 3.837288135593221e-06, "loss": 1.368, "step": 24340},
+    {"epoch": 0.61, "grad_norm": 11.590188026428223, "learning_rate": 3.830508474576271e-06, "loss": 1.375, "step": 24350},
+    {"epoch": 0.61, "grad_norm": 7.325139045715332, "learning_rate": 3.823728813559323e-06, "loss": 1.2593, "step": 24360},
+    {"epoch": 0.61, "grad_norm": 6.3924760818481445, "learning_rate": 3.816949152542373e-06, "loss": 1.3238, "step": 24370},
+    {"epoch": 0.61, "grad_norm": 5.093543529510498, "learning_rate": 3.8101694915254238e-06, "loss": 1.3398, "step": 24380},
+    {"epoch": 0.61, "grad_norm": 4.488302707672119, "learning_rate": 3.8033898305084748e-06, "loss": 1.2532, "step": 24390},
+    {"epoch": 0.61, "grad_norm": 3.5369062423706055, "learning_rate": 3.7966101694915257e-06, "loss": 1.1723, "step": 24400},
+    {"epoch": 0.61, "grad_norm": 3.2012510299682617, "learning_rate": 3.7898305084745767e-06, "loss": 1.3348, "step": 24410},
+    {"epoch": 0.61, "grad_norm": 1.5665017366409302, "learning_rate": 3.7830508474576273e-06, "loss": 1.4159, "step": 24420},
+    {"epoch": 0.61, "grad_norm": 12.912787437438965, "learning_rate": 3.7762711864406782e-06, "loss": 1.2406, "step": 24430},
+    {"epoch": 0.61, "grad_norm": 6.572142124176025, "learning_rate": 3.7694915254237292e-06, "loss": 1.1544, "step": 24440},
+    {"epoch": 0.61, "grad_norm": 4.999161720275879, "learning_rate": 3.76271186440678e-06, "loss": 1.1744, "step": 24450},
+    {"epoch": 0.61, "grad_norm": 3.322866439819336, "learning_rate": 3.755932203389831e-06, "loss": 1.3713, "step": 24460},
+    {"epoch": 0.61, "grad_norm": 5.197652816772461, "learning_rate": 3.7491525423728813e-06, "loss": 1.1858, "step": 24470},
+    {"epoch": 0.61, "grad_norm": 6.7361369132995605, "learning_rate": 3.7423728813559323e-06, "loss": 1.2778, "step": 24480},
+    {"epoch": 0.61, "grad_norm": 1.1276848316192627, "learning_rate": 3.7355932203389833e-06, "loss": 1.3971, "step": 24490},
+    {"epoch": 0.61, "grad_norm": 4.62593412399292, "learning_rate": 3.7288135593220342e-06, "loss": 1.479, "step": 24500},
+    {"epoch": 0.61, "eval_loss": 1.3212531805038452, "eval_runtime": 66.1318, "eval_samples_per_second": 15.121, "eval_steps_per_second": 15.121, "step": 24500},
+    {"epoch": 0.61, "grad_norm": 7.978855133056641, "learning_rate": 3.7220338983050852e-06, "loss": 1.1919, "step": 24510},
+    {"epoch": 0.61, "grad_norm": 6.347212314605713, "learning_rate": 3.715254237288136e-06, "loss": 1.2493, "step": 24520},
+    {"epoch": 0.61, "grad_norm": 6.2206573486328125, "learning_rate": 3.7084745762711867e-06, "loss": 1.2258, "step": 24530},
+    {"epoch": 0.61, "grad_norm": 2.548797607421875, "learning_rate": 3.7016949152542377e-06, "loss": 1.262, "step": 24540},
+    {"epoch": 0.61, "grad_norm": 9.5992431640625, "learning_rate": 3.6949152542372883e-06, "loss": 1.4298, "step": 24550},
+    {"epoch": 0.61, "grad_norm": 2.236175298690796, "learning_rate": 3.6881355932203393e-06, "loss": 1.3466, "step": 24560},
+    {"epoch": 0.61, "grad_norm": 7.134004592895508, "learning_rate": 3.6813559322033902e-06, "loss": 1.2642, "step": 24570},
+    {"epoch": 0.61, "grad_norm": 12.453125, "learning_rate": 3.6745762711864408e-06, "loss": 1.369, "step": 24580},
+    {"epoch": 0.61, "grad_norm": 16.243106842041016, "learning_rate": 3.6677966101694918e-06, "loss": 1.3269, "step": 24590},
+    {"epoch": 0.61, "grad_norm": 11.406882286071777, "learning_rate": 3.6610169491525427e-06, "loss": 1.4339, "step": 24600},
+    {"epoch": 0.62, "grad_norm": 6.334946632385254, "learning_rate": 3.6542372881355937e-06, "loss": 1.2445, "step": 24610},
+    {"epoch": 0.62, "grad_norm": 9.451517105102539, "learning_rate": 3.6474576271186447e-06, "loss": 1.1902, "step": 24620},
+    {"epoch": 0.62, "grad_norm": 1.9071747064590454, "learning_rate": 3.640677966101695e-06, "loss": 1.3273, "step": 24630},
+    {"epoch": 0.62, "grad_norm": 4.054659843444824, "learning_rate": 3.633898305084746e-06, "loss": 1.2808, "step": 24640},
+    {"epoch": 0.62, "grad_norm": 6.314877986907959, "learning_rate": 3.6271186440677968e-06, "loss": 1.4351, "step": 24650},
+    {"epoch": 0.62, "grad_norm": 6.971933841705322, "learning_rate": 3.6203389830508478e-06, "loss": 1.2973, "step": 24660},
+    {"epoch": 0.62, "grad_norm": 2.7064402103424072, "learning_rate": 3.6135593220338987e-06, "loss": 1.4335, "step": 24670},
+    {"epoch": 0.62, "grad_norm": 5.698015213012695, "learning_rate": 3.6067796610169493e-06, "loss": 1.3042, "step": 24680},
+    {"epoch": 0.62, "grad_norm": 5.647088050842285, "learning_rate": 3.6000000000000003e-06, "loss": 1.3832, "step": 24690},
+    {"epoch": 0.62, "grad_norm": 10.945414543151855, "learning_rate": 3.5932203389830512e-06, "loss": 1.2363, "step": 24700},
+    {"epoch": 0.62, "grad_norm": 7.516660213470459, "learning_rate": 3.5864406779661022e-06, "loss": 1.2615, "step": 24710},
+    {"epoch": 0.62, "grad_norm": 3.6117703914642334, "learning_rate": 3.579661016949153e-06, "loss": 1.3533, "step": 24720},
+    {"epoch": 0.62, "grad_norm": 10.817008972167969, "learning_rate": 3.5728813559322033e-06, "loss": 1.3425, "step": 24730},
+    {"epoch": 0.62, "grad_norm": 7.7072858810424805, "learning_rate": 3.5661016949152543e-06, "loss": 1.2762, "step": 24740},
+    {"epoch": 0.62, "grad_norm": 5.9887495040893555, "learning_rate": 3.5593220338983053e-06, "loss": 1.3516, "step": 24750},
+    {"epoch": 0.62, "grad_norm": 3.4481067657470703, "learning_rate": 3.5525423728813563e-06, "loss": 1.0325, "step": 24760},
+    {"epoch": 0.62, "grad_norm": 5.2485551834106445, "learning_rate": 3.5457627118644072e-06, "loss": 1.4088, "step": 24770},
+    {"epoch": 0.62, "grad_norm": 6.970777988433838, "learning_rate": 3.538983050847458e-06, "loss": 1.4962, "step": 24780},
+    {"epoch": 0.62, "grad_norm": 6.037806034088135, "learning_rate": 3.5322033898305088e-06, "loss": 1.3806, "step": 24790},
+    {"epoch": 0.62, "grad_norm": 4.5726518630981445, "learning_rate": 3.5254237288135597e-06, "loss": 1.3086, "step": 24800},
+    {"epoch": 0.62, "grad_norm": 4.184850692749023, "learning_rate": 3.5186440677966103e-06, "loss": 1.4004, "step": 24810},
+    {"epoch": 0.62, "grad_norm": 9.181009292602539, "learning_rate": 3.5118644067796613e-06, "loss": 1.4348, "step": 24820},
+    {"epoch": 0.62, "grad_norm": 5.124319076538086, "learning_rate": 3.5050847457627122e-06, "loss": 1.2272, "step": 24830},
+    {"epoch": 0.62, "grad_norm": 7.30942440032959, "learning_rate": 3.498305084745763e-06, "loss": 1.3819, "step": 24840},
+    {"epoch": 0.62, "grad_norm": 7.727287769317627, "learning_rate": 3.4915254237288138e-06, "loss": 1.1956, "step": 24850},
+    {"epoch": 0.62, "grad_norm": 5.2934112548828125, "learning_rate": 3.4847457627118648e-06, "loss": 1.3141, "step": 24860},
+    {"epoch": 0.62, "grad_norm": 7.7370500564575195, "learning_rate": 3.4779661016949157e-06, "loss": 1.2418, "step": 24870},
+    {"epoch": 0.62, "grad_norm": 5.379338264465332, "learning_rate": 3.4711864406779667e-06, "loss": 1.435, "step": 24880},
+    {"epoch": 0.62, "grad_norm": 5.592279434204102, "learning_rate": 3.464406779661017e-06, "loss": 1.324, "step": 24890},
+    {"epoch": 0.62, "grad_norm": 4.178751468658447, "learning_rate": 3.457627118644068e-06, "loss": 1.4095, "step": 24900},
+    {"epoch": 0.62, "grad_norm": 2.7852957248687744, "learning_rate": 3.450847457627119e-06, "loss": 1.1834, "step": 24910},
+    {"epoch": 0.62, "grad_norm": 12.001542091369629, "learning_rate": 3.4440677966101698e-06, "loss": 1.3294, "step": 24920},
+    {"epoch": 0.62, "grad_norm": 4.010140895843506, "learning_rate": 3.4372881355932207e-06, "loss": 1.4615, "step": 24930},
+    {"epoch": 0.62, "grad_norm": 2.7130627632141113, "learning_rate": 3.4305084745762713e-06, "loss": 1.4789, "step": 24940},
+    {"epoch": 0.62, "grad_norm": 5.209987640380859, "learning_rate": 3.4237288135593223e-06, "loss": 1.2389, "step": 24950},
+    {"epoch": 0.62, "grad_norm": 4.1047515869140625, "learning_rate": 3.4169491525423733e-06, "loss": 1.3449, "step": 24960},
+    {"epoch": 0.62, "grad_norm": 19.238649368286133, "learning_rate": 3.4101694915254242e-06, "loss": 1.3224, "step": 24970},
+    {"epoch": 0.62, "grad_norm": 11.792010307312012, "learning_rate": 3.403389830508475e-06, "loss": 1.1061, "step": 24980},
+    {"epoch": 0.62, "grad_norm": 10.934020042419434, "learning_rate": 3.3966101694915253e-06, "loss": 1.4179, "step": 24990},
+    {"epoch": 0.62, "grad_norm": 1.5776662826538086, "learning_rate": 3.3898305084745763e-06, "loss": 1.3254, "step": 25000},
+    {
+      "epoch": 0.62,
+      "eval_loss": 1.3409814834594727,
+      "eval_runtime": 66.1173,
+      "eval_samples_per_second": 15.125,
+      "eval_steps_per_second": 15.125,
+      "step": 25000
     }
   ],
   "logging_steps": 10,
@@ -16124,7 +17914,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 2500,
-  "total_flos":
+  "total_flos": 4.025531498496e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
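
The updated state pins best_metric to the eval loss at step 25000 and marks this directory as best_model_checkpoint. A global_step of 25000 at epoch 0.625 implies 40,000 optimizer steps per epoch (25000 / 0.625), consistent with "train_batch_size": 1 over a training set of roughly 40,000 examples. A short sketch of reading these fields back, assuming the checkpoint has been downloaded locally:

import json

path = "runs/deepseek_lora_20240422-165831/checkpoint-25000/trainer_state.json"
with open(path) as f:
    state = json.load(f)

print(state["best_metric"])                  # 1.3409814834594727
print(state["best_model_checkpoint"])        # ends in checkpoint-25000
print(state["global_step"], state["epoch"])  # 25000 0.625

# Evaluation entries are the log_history records carrying "eval_loss";
# they were written every eval_steps=500 optimizer steps.
evals = [e for e in state["log_history"] if "eval_loss" in e]
for e in evals[-5:]:
    print(e["step"], e["eval_loss"])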