Uploaded checkpoint-22500
Files changed:
- adapter_model.safetensors  +1 -1
- optimizer.pt  +1 -1
- rng_state.pth  +1 -1
- scheduler.pt  +1 -1
- trainer_state.json  +1795 -5
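The checkpoint bundles the LoRA adapter weights plus everything the Trainer needs to resume. A minimal loading sketch follows; the base model id is not recorded anywhere in this commit, so the one below is purely illustrative, as is the local path:

```python
# Minimal sketch, assuming a local download of this checkpoint directory.
# The base model id is an assumption -- this commit does not name it.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-llm-7b-base")  # assumed base model
model = PeftModel.from_pretrained(base, "checkpoint-22500")  # local path to this checkpoint
```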
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5e72ed832ca82b47560da4b70a43bfb85762b41d54a4b1df89cee4b8816cb6fc
 size 119975656
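The binary files in this commit are stored with Git LFS, so each diff touches only the three-line pointer file (spec version, object sha256, blob size); here only the oid changes while the adapter stays 119,975,656 bytes. Recomputing the sha256 of a downloaded blob against the pointer is a quick integrity check; a sketch using only the standard library:

```python
import hashlib

def file_sha256(path: str) -> str:
    """Stream a file through SHA-256 in 1 MiB chunks (fine for large LFS blobs)."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# oid recorded in the pointer above:
expected = "5e72ed832ca82b47560da4b70a43bfb85762b41d54a4b1df89cee4b8816cb6fc"
assert file_sha256("adapter_model.safetensors") == expected
```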
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:19756b617f2bf91f55b9e8c9b87ec2279b8dca12dd91f8f9e92a075a7d6745b9
 size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:da5d678c8111a8bb6e7f07c6d826c3d293cb2dc841c1a7d8cdada1cef59bd3c9
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2ebf87c3777a880efd4523ce05af816d67a6a12edb3e1d54f156890382c1db41
 size 1064
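Together with the adapter weights, optimizer.pt, scheduler.pt, and rng_state.pth are what let the Hugging Face Trainer resume this run exactly (optimizer moments, learning-rate schedule position, and RNG streams included) rather than merely reloading weights. A sketch, with model and datasets omitted and the argument values mirroring the trainer_state.json diff below:

```python
from transformers import Trainer, TrainingArguments

# Sketch only: `model`, `train_ds`, and `eval_ds` must be rebuilt exactly as in
# the original run (not shown). Values mirror trainer_state.json below:
# logging_steps=10, eval_steps=500, save_steps=2500, batch size 1, 1 epoch.
args = TrainingArguments(
    output_dir="runs/deepseek_lora_20240422-165831",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=2500,
)
trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=eval_ds)
trainer.train(resume_from_checkpoint="runs/deepseek_lora_20240422-165831/checkpoint-22500")
```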
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 1.
-  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-
-  "epoch": 0.
+  "best_metric": 1.3439137935638428,
+  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-22500",
+  "epoch": 0.5625,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 22500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14327,6 +14327,1796 @@
       "eval_samples_per_second": 15.124,
       "eval_steps_per_second": 15.124,
       "step": 20000
+    },
+    {"epoch": 0.5, "grad_norm": 2.159421920776367, "learning_rate": 6.772881355932204e-06, "loss": 1.4193, "step": 20010},
+    {"epoch": 0.5, "grad_norm": 7.60541296005249, "learning_rate": 6.766101694915255e-06, "loss": 1.4747, "step": 20020},
+    {"epoch": 0.5, "grad_norm": 2.0991897583007812, "learning_rate": 6.759322033898306e-06, "loss": 1.2775, "step": 20030},
+    {"epoch": 0.5, "grad_norm": 7.756378173828125, "learning_rate": 6.7525423728813565e-06, "loss": 1.343, "step": 20040},
+    {"epoch": 0.5, "grad_norm": 7.062932014465332, "learning_rate": 6.745762711864408e-06, "loss": 1.5168, "step": 20050},
+    {"epoch": 0.5, "grad_norm": 6.689007759094238, "learning_rate": 6.7389830508474585e-06, "loss": 1.2894, "step": 20060},
+    {"epoch": 0.5, "grad_norm": 5.31237268447876, "learning_rate": 6.73220338983051e-06, "loss": 1.3382, "step": 20070},
+    {"epoch": 0.5, "grad_norm": 3.6598782539367676, "learning_rate": 6.7254237288135604e-06, "loss": 1.3839, "step": 20080},
+    {"epoch": 0.5, "grad_norm": 3.3091630935668945, "learning_rate": 6.71864406779661e-06, "loss": 1.3654, "step": 20090},
+    {"epoch": 0.5, "grad_norm": 2.1399312019348145, "learning_rate": 6.7118644067796615e-06, "loss": 1.5429, "step": 20100},
+    {"epoch": 0.5, "grad_norm": 4.380695819854736, "learning_rate": 6.705084745762712e-06, "loss": 1.4629, "step": 20110},
+    {"epoch": 0.5, "grad_norm": 9.330241203308105, "learning_rate": 6.6983050847457635e-06, "loss": 1.3547, "step": 20120},
+    {"epoch": 0.5, "grad_norm": 5.803046226501465, "learning_rate": 6.691525423728814e-06, "loss": 1.3534, "step": 20130},
+    {"epoch": 0.5, "grad_norm": 4.967430591583252, "learning_rate": 6.6847457627118655e-06, "loss": 1.4965, "step": 20140},
+    {"epoch": 0.5, "grad_norm": 8.131784439086914, "learning_rate": 6.677966101694916e-06, "loss": 1.4012, "step": 20150},
+    {"epoch": 0.5, "grad_norm": 9.220160484313965, "learning_rate": 6.6711864406779666e-06, "loss": 1.3637, "step": 20160},
+    {"epoch": 0.5, "grad_norm": 6.833899974822998, "learning_rate": 6.664406779661018e-06, "loss": 1.3748, "step": 20170},
+    {"epoch": 0.5, "grad_norm": 13.869956970214844, "learning_rate": 6.6576271186440685e-06, "loss": 1.4474, "step": 20180},
+    {"epoch": 0.5, "grad_norm": 9.462739944458008, "learning_rate": 6.650847457627119e-06, "loss": 1.3893, "step": 20190},
+    {"epoch": 0.51, "grad_norm": 2.321714162826538, "learning_rate": 6.64406779661017e-06, "loss": 1.3836, "step": 20200},
+    {"epoch": 0.51, "grad_norm": 11.65925407409668, "learning_rate": 6.637288135593221e-06, "loss": 1.493, "step": 20210},
+    {"epoch": 0.51, "grad_norm": 6.808838367462158, "learning_rate": 6.6305084745762716e-06, "loss": 1.3632, "step": 20220},
+    {"epoch": 0.51, "grad_norm": 4.534874439239502, "learning_rate": 6.623728813559322e-06, "loss": 1.4303, "step": 20230},
+    {"epoch": 0.51, "grad_norm": 1.9837802648544312, "learning_rate": 6.6169491525423735e-06, "loss": 1.446, "step": 20240},
+    {"epoch": 0.51, "grad_norm": 7.779874324798584, "learning_rate": 6.610169491525424e-06, "loss": 1.4566, "step": 20250},
+    {"epoch": 0.51, "grad_norm": 5.729377746582031, "learning_rate": 6.6033898305084755e-06, "loss": 1.401, "step": 20260},
+    {"epoch": 0.51, "grad_norm": 12.633749008178711, "learning_rate": 6.596610169491526e-06, "loss": 1.2969, "step": 20270},
+    {"epoch": 0.51, "grad_norm": 2.7961552143096924, "learning_rate": 6.5898305084745774e-06, "loss": 1.4151, "step": 20280},
+    {"epoch": 0.51, "grad_norm": 8.461606979370117, "learning_rate": 6.583050847457627e-06, "loss": 1.2665, "step": 20290},
+    {"epoch": 0.51, "grad_norm": 10.325592041015625, "learning_rate": 6.576271186440678e-06, "loss": 1.3666, "step": 20300},
+    {"epoch": 0.51, "grad_norm": 3.0818538665771484, "learning_rate": 6.569491525423729e-06, "loss": 1.4307, "step": 20310},
+    {"epoch": 0.51, "grad_norm": 7.449318885803223, "learning_rate": 6.56271186440678e-06, "loss": 1.4585, "step": 20320},
+    {"epoch": 0.51, "grad_norm": 4.6587042808532715, "learning_rate": 6.555932203389831e-06, "loss": 1.4053, "step": 20330},
+    {"epoch": 0.51, "grad_norm": 5.837299346923828, "learning_rate": 6.549152542372882e-06, "loss": 1.3456, "step": 20340},
+    {"epoch": 0.51, "grad_norm": 7.345305442810059, "learning_rate": 6.542372881355933e-06, "loss": 1.3618, "step": 20350},
+    {"epoch": 0.51, "grad_norm": 5.679592609405518, "learning_rate": 6.5355932203389836e-06, "loss": 1.3875, "step": 20360},
+    {"epoch": 0.51, "grad_norm": 1.062429428100586, "learning_rate": 6.528813559322035e-06, "loss": 1.3083, "step": 20370},
+    {"epoch": 0.51, "grad_norm": 5.2157769203186035, "learning_rate": 6.5220338983050855e-06, "loss": 1.3582, "step": 20380},
+    {"epoch": 0.51, "grad_norm": 4.890625476837158, "learning_rate": 6.515254237288137e-06, "loss": 1.3184, "step": 20390},
+    {"epoch": 0.51, "grad_norm": 1.587312936782837, "learning_rate": 6.508474576271187e-06, "loss": 1.2191, "step": 20400},
+    {"epoch": 0.51, "grad_norm": 4.013046741485596, "learning_rate": 6.501694915254237e-06, "loss": 1.3331, "step": 20410},
+    {"epoch": 0.51, "grad_norm": 4.076744556427002, "learning_rate": 6.4949152542372886e-06, "loss": 1.4351, "step": 20420},
+    {"epoch": 0.51, "grad_norm": 6.493119239807129, "learning_rate": 6.488135593220339e-06, "loss": 1.3519, "step": 20430},
+    {"epoch": 0.51, "grad_norm": 6.127922058105469, "learning_rate": 6.4813559322033905e-06, "loss": 1.4255, "step": 20440},
+    {"epoch": 0.51, "grad_norm": 4.424916744232178, "learning_rate": 6.474576271186441e-06, "loss": 1.44, "step": 20450},
+    {"epoch": 0.51, "grad_norm": 9.894845008850098, "learning_rate": 6.4677966101694925e-06, "loss": 1.2922, "step": 20460},
+    {"epoch": 0.51, "grad_norm": 5.174190521240234, "learning_rate": 6.461016949152543e-06, "loss": 1.439, "step": 20470},
+    {"epoch": 0.51, "grad_norm": 6.313706398010254, "learning_rate": 6.4542372881355944e-06, "loss": 1.3229, "step": 20480},
+    {"epoch": 0.51, "grad_norm": 3.240434169769287, "learning_rate": 6.447457627118645e-06, "loss": 1.3911, "step": 20490},
+    {"epoch": 0.51, "grad_norm": 7.466948986053467, "learning_rate": 6.440677966101695e-06, "loss": 1.4467, "step": 20500},
+    {"epoch": 0.51, "eval_loss": 1.3339221477508545, "eval_runtime": 66.152, "eval_samples_per_second": 15.117, "eval_steps_per_second": 15.117, "step": 20500},
+    {"epoch": 0.51, "grad_norm": 32.95411682128906, "learning_rate": 6.433898305084746e-06, "loss": 1.2821, "step": 20510},
+    {"epoch": 0.51, "grad_norm": 11.001514434814453, "learning_rate": 6.427118644067797e-06, "loss": 1.3523, "step": 20520},
+    {"epoch": 0.51, "grad_norm": 1.6923043727874756, "learning_rate": 6.420338983050848e-06, "loss": 1.3424, "step": 20530},
+    {"epoch": 0.51, "grad_norm": 6.267253398895264, "learning_rate": 6.413559322033899e-06, "loss": 1.2163, "step": 20540},
+    {"epoch": 0.51, "grad_norm": 15.797646522521973, "learning_rate": 6.40677966101695e-06, "loss": 1.2104, "step": 20550},
+    {"epoch": 0.51, "grad_norm": 3.1398627758026123, "learning_rate": 6.4000000000000006e-06, "loss": 1.2264, "step": 20560},
+    {"epoch": 0.51, "grad_norm": 1.7001848220825195, "learning_rate": 6.393220338983052e-06, "loss": 1.345, "step": 20570},
+    {"epoch": 0.51, "grad_norm": 6.551231861114502, "learning_rate": 6.3864406779661025e-06, "loss": 1.295, "step": 20580},
+    {"epoch": 0.51, "grad_norm": 2.501774787902832, "learning_rate": 6.379661016949154e-06, "loss": 1.2727, "step": 20590},
+    {"epoch": 0.52, "grad_norm": 5.2971906661987305, "learning_rate": 6.372881355932204e-06, "loss": 1.1617, "step": 20600},
+    {"epoch": 0.52, "grad_norm": 8.737128257751465, "learning_rate": 6.366101694915254e-06, "loss": 1.3792, "step": 20610},
+    {"epoch": 0.52, "grad_norm": 1.8445500135421753, "learning_rate": 6.3593220338983056e-06, "loss": 1.2818, "step": 20620},
+    {"epoch": 0.52, "grad_norm": 6.586302280426025, "learning_rate": 6.352542372881356e-06, "loss": 1.471, "step": 20630},
+    {"epoch": 0.52, "grad_norm": 2.3581125736236572, "learning_rate": 6.3457627118644075e-06, "loss": 1.26, "step": 20640},
+    {"epoch": 0.52, "grad_norm": 3.006883382797241, "learning_rate": 6.338983050847458e-06, "loss": 1.4312, "step": 20650},
+    {"epoch": 0.52, "grad_norm": 1.8936930894851685, "learning_rate": 6.3322033898305095e-06, "loss": 1.3705, "step": 20660},
+    {"epoch": 0.52, "grad_norm": 5.22953462600708, "learning_rate": 6.32542372881356e-06, "loss": 1.3252, "step": 20670},
+    {"epoch": 0.52, "grad_norm": 7.669064044952393, "learning_rate": 6.318644067796611e-06, "loss": 1.4673, "step": 20680},
+    {"epoch": 0.52, "grad_norm": 3.7508702278137207, "learning_rate": 6.311864406779662e-06, "loss": 1.3465, "step": 20690},
+    {"epoch": 0.52, "grad_norm": 7.959824085235596, "learning_rate": 6.3050847457627125e-06, "loss": 1.4148, "step": 20700},
+    {"epoch": 0.52, "grad_norm": 8.888174057006836, "learning_rate": 6.298305084745763e-06, "loss": 1.4874, "step": 20710},
+    {"epoch": 0.52, "grad_norm": 11.385110855102539, "learning_rate": 6.291525423728814e-06, "loss": 1.3519, "step": 20720},
+    {"epoch": 0.52, "grad_norm": 11.636566162109375, "learning_rate": 6.284745762711865e-06, "loss": 1.3237, "step": 20730},
+    {"epoch": 0.52, "grad_norm": 2.6630842685699463, "learning_rate": 6.277966101694916e-06, "loss": 1.5615, "step": 20740},
+    {"epoch": 0.52, "grad_norm": 15.339405059814453, "learning_rate": 6.271186440677966e-06, "loss": 1.0971, "step": 20750},
+    {"epoch": 0.52, "grad_norm": 5.97685432434082, "learning_rate": 6.2644067796610176e-06, "loss": 1.2582, "step": 20760},
+    {"epoch": 0.52, "grad_norm": 5.218130111694336, "learning_rate": 6.257627118644068e-06, "loss": 1.3272, "step": 20770},
+    {"epoch": 0.52, "grad_norm": 8.943811416625977, "learning_rate": 6.2508474576271195e-06, "loss": 1.4001, "step": 20780},
+    {"epoch": 0.52, "grad_norm": 3.288783550262451, "learning_rate": 6.24406779661017e-06, "loss": 1.3729, "step": 20790},
+    {"epoch": 0.52, "grad_norm": 2.7321832180023193, "learning_rate": 6.2372881355932215e-06, "loss": 1.4365, "step": 20800},
+    {"epoch": 0.52, "grad_norm": 7.656303405761719, "learning_rate": 6.230508474576271e-06, "loss": 1.3646, "step": 20810},
+    {"epoch": 0.52, "grad_norm": 3.2849278450012207, "learning_rate": 6.223728813559322e-06, "loss": 1.0755, "step": 20820},
+    {"epoch": 0.52, "grad_norm": 3.6145408153533936, "learning_rate": 6.216949152542373e-06, "loss": 1.2836, "step": 20830},
+    {"epoch": 0.52, "grad_norm": 6.668073654174805, "learning_rate": 6.210169491525424e-06, "loss": 1.3706, "step": 20840},
+    {"epoch": 0.52, "grad_norm": 10.408007621765137, "learning_rate": 6.203389830508475e-06, "loss": 1.5154, "step": 20850},
+    {"epoch": 0.52, "grad_norm": 2.708711862564087, "learning_rate": 6.196610169491526e-06, "loss": 1.3841, "step": 20860},
+    {"epoch": 0.52, "grad_norm": 1.7991631031036377, "learning_rate": 6.189830508474577e-06, "loss": 1.261, "step": 20870},
+    {"epoch": 0.52, "grad_norm": 6.492557048797607, "learning_rate": 6.183050847457628e-06, "loss": 1.469, "step": 20880},
+    {"epoch": 0.52, "grad_norm": 3.5670902729034424, "learning_rate": 6.176271186440679e-06, "loss": 1.4484, "step": 20890},
+    {"epoch": 0.52, "grad_norm": 3.4511330127716064, "learning_rate": 6.1694915254237295e-06, "loss": 1.3566, "step": 20900},
+    {"epoch": 0.52, "grad_norm": 7.958609580993652, "learning_rate": 6.162711864406781e-06, "loss": 1.3272, "step": 20910},
+    {"epoch": 0.52, "grad_norm": 6.594333648681641, "learning_rate": 6.155932203389831e-06, "loss": 1.3926, "step": 20920},
+    {"epoch": 0.52, "grad_norm": 7.329288005828857, "learning_rate": 6.149152542372881e-06, "loss": 1.4948, "step": 20930},
+    {"epoch": 0.52, "grad_norm": 10.120712280273438, "learning_rate": 6.142372881355933e-06, "loss": 1.5394, "step": 20940},
+    {"epoch": 0.52, "grad_norm": 4.3365983963012695, "learning_rate": 6.135593220338983e-06, "loss": 1.2156, "step": 20950},
+    {"epoch": 0.52, "grad_norm": 2.0522942543029785, "learning_rate": 6.1288135593220346e-06, "loss": 1.3206, "step": 20960},
+    {"epoch": 0.52, "grad_norm": 5.730597496032715, "learning_rate": 6.122033898305085e-06, "loss": 1.4396, "step": 20970},
+    {"epoch": 0.52, "grad_norm": 13.125938415527344, "learning_rate": 6.1152542372881365e-06, "loss": 1.1969, "step": 20980},
+    {"epoch": 0.52, "grad_norm": 3.8410260677337646, "learning_rate": 6.108474576271187e-06, "loss": 1.3842, "step": 20990},
+    {"epoch": 0.53, "grad_norm": 3.415696144104004, "learning_rate": 6.1016949152542385e-06, "loss": 1.3296, "step": 21000},
+    {"epoch": 0.53, "eval_loss": 1.344160556793213, "eval_runtime": 66.1512, "eval_samples_per_second": 15.117, "eval_steps_per_second": 15.117, "step": 21000},
+    {"epoch": 0.53, "grad_norm": 9.95438003540039, "learning_rate": 6.094915254237289e-06, "loss": 1.3206, "step": 21010},
+    {"epoch": 0.53, "grad_norm": 10.410721778869629, "learning_rate": 6.088135593220339e-06, "loss": 1.4014, "step": 21020},
+    {"epoch": 0.53, "grad_norm": 9.159972190856934, "learning_rate": 6.08135593220339e-06, "loss": 1.3827, "step": 21030},
+    {"epoch": 0.53, "grad_norm": 4.856491565704346, "learning_rate": 6.074576271186441e-06, "loss": 1.4561, "step": 21040},
+    {"epoch": 0.53, "grad_norm": 5.863302707672119, "learning_rate": 6.067796610169492e-06, "loss": 1.3225, "step": 21050},
+    {"epoch": 0.53, "grad_norm": 2.968809127807617, "learning_rate": 6.061016949152543e-06, "loss": 1.3713, "step": 21060},
+    {"epoch": 0.53, "grad_norm": 4.19352388381958, "learning_rate": 6.054237288135594e-06, "loss": 1.3894, "step": 21070},
+    {"epoch": 0.53, "grad_norm": 4.841989040374756, "learning_rate": 6.047457627118645e-06, "loss": 1.3609, "step": 21080},
+    {"epoch": 0.53, "grad_norm": 8.693398475646973, "learning_rate": 6.040677966101696e-06, "loss": 1.4118, "step": 21090},
+    {"epoch": 0.53, "grad_norm": 10.902780532836914, "learning_rate": 6.0338983050847465e-06, "loss": 1.4529, "step": 21100},
+    {"epoch": 0.53, "grad_norm": 5.1729607582092285, "learning_rate": 6.027118644067798e-06, "loss": 1.3302, "step": 21110},
+    {"epoch": 0.53, "grad_norm": 1.9906094074249268, "learning_rate": 6.020338983050848e-06, "loss": 1.2294, "step": 21120},
+    {"epoch": 0.53, "grad_norm": 3.285928964614868, "learning_rate": 6.013559322033898e-06, "loss": 1.5006, "step": 21130},
+    {"epoch": 0.53, "grad_norm": 13.098603248596191, "learning_rate": 6.00677966101695e-06, "loss": 1.425, "step": 21140},
+    {"epoch": 0.53, "grad_norm": 4.391754150390625, "learning_rate": 6e-06, "loss": 1.2668, "step": 21150},
+    {"epoch": 0.53, "grad_norm": 8.409893035888672, "learning_rate": 5.9932203389830516e-06, "loss": 1.2061, "step": 21160},
+    {"epoch": 0.53, "grad_norm": 10.568397521972656, "learning_rate": 5.986440677966102e-06, "loss": 1.4191, "step": 21170},
+    {"epoch": 0.53, "grad_norm": 7.371358394622803, "learning_rate": 5.9796610169491535e-06, "loss": 1.2942, "step": 21180},
+    {"epoch": 0.53, "grad_norm": 2.6429450511932373, "learning_rate": 5.972881355932204e-06, "loss": 1.4216, "step": 21190},
+    {"epoch": 0.53, "grad_norm": 5.85234260559082, "learning_rate": 5.9661016949152555e-06, "loss": 1.3541, "step": 21200},
+    {"epoch": 0.53, "grad_norm": 3.2903425693511963, "learning_rate": 5.959322033898306e-06, "loss": 1.2969, "step": 21210},
+    {"epoch": 0.53, "grad_norm": 3.9652106761932373, "learning_rate": 5.9525423728813566e-06, "loss": 1.2853, "step": 21220},
+    {"epoch": 0.53, "grad_norm": 3.8703153133392334, "learning_rate": 5.945762711864407e-06, "loss": 1.3094, "step": 21230},
+    {"epoch": 0.53, "grad_norm": 8.018983840942383, "learning_rate": 5.938983050847458e-06, "loss": 1.4798, "step": 21240},
+    {"epoch": 0.53, "grad_norm": 2.752399206161499, "learning_rate": 5.932203389830509e-06, "loss": 1.4141, "step": 21250},
+    {"epoch": 0.53, "grad_norm": 5.574487686157227, "learning_rate": 5.92542372881356e-06, "loss": 1.2121, "step": 21260},
+    {"epoch": 0.53, "grad_norm": 10.352456092834473, "learning_rate": 5.91864406779661e-06, "loss": 1.3264, "step": 21270},
+    {"epoch": 0.53, "grad_norm": 4.182732582092285, "learning_rate": 5.911864406779662e-06, "loss": 1.4334, "step": 21280},
+    {"epoch": 0.53, "grad_norm": 5.310421943664551, "learning_rate": 5.905084745762712e-06, "loss": 1.5641, "step": 21290},
+    {"epoch": 0.53, "grad_norm": 5.489622592926025, "learning_rate": 5.8983050847457635e-06, "loss": 1.4699, "step": 21300},
+    {"epoch": 0.53, "grad_norm": 6.644534111022949, "learning_rate": 5.891525423728814e-06, "loss": 1.3273, "step": 21310},
+    {"epoch": 0.53, "grad_norm": 13.480459213256836, "learning_rate": 5.8847457627118655e-06, "loss": 1.2957, "step": 21320},
+    {"epoch": 0.53, "grad_norm": 9.34183120727539, "learning_rate": 5.877966101694915e-06, "loss": 1.4145, "step": 21330},
+    {"epoch": 0.53, "grad_norm": 4.792973518371582, "learning_rate": 5.871186440677966e-06, "loss": 1.4912, "step": 21340},
+    {"epoch": 0.53, "grad_norm": 4.098564147949219, "learning_rate": 5.864406779661017e-06, "loss": 1.1895, "step": 21350},
+    {"epoch": 0.53, "grad_norm": 7.23917293548584, "learning_rate": 5.857627118644068e-06, "loss": 1.3875, "step": 21360},
+    {"epoch": 0.53, "grad_norm": 4.86613130569458, "learning_rate": 5.850847457627119e-06, "loss": 1.4827, "step": 21370},
+    {"epoch": 0.53, "grad_norm": 7.629755973815918, "learning_rate": 5.84406779661017e-06, "loss": 1.2189, "step": 21380},
+    {"epoch": 0.53, "grad_norm": 3.80531907081604, "learning_rate": 5.837288135593221e-06, "loss": 1.4064, "step": 21390},
+    {"epoch": 0.54, "grad_norm": 3.432089328765869, "learning_rate": 5.830508474576272e-06, "loss": 1.1929, "step": 21400},
+    {"epoch": 0.54, "grad_norm": 9.766077995300293, "learning_rate": 5.823728813559323e-06, "loss": 1.4525, "step": 21410},
+    {"epoch": 0.54, "grad_norm": 4.745760440826416, "learning_rate": 5.8169491525423736e-06, "loss": 1.4924, "step": 21420},
+    {"epoch": 0.54, "grad_norm": 5.188168525695801, "learning_rate": 5.810169491525425e-06, "loss": 1.27, "step": 21430},
+    {"epoch": 0.54, "grad_norm": 4.576213359832764, "learning_rate": 5.803389830508475e-06, "loss": 1.3165, "step": 21440},
+    {"epoch": 0.54, "grad_norm": 10.540860176086426, "learning_rate": 5.796610169491525e-06, "loss": 1.2652, "step": 21450},
+    {"epoch": 0.54, "grad_norm": 4.210390567779541, "learning_rate": 5.789830508474577e-06, "loss": 1.4372, "step": 21460},
+    {"epoch": 0.54, "grad_norm": 8.733638763427734, "learning_rate": 5.783050847457627e-06, "loss": 1.2691, "step": 21470},
+    {"epoch": 0.54, "grad_norm": 4.997326374053955, "learning_rate": 5.776271186440679e-06, "loss": 1.1454, "step": 21480},
+    {"epoch": 0.54, "grad_norm": 10.108692169189453, "learning_rate": 5.769491525423729e-06, "loss": 1.2982, "step": 21490},
+    {"epoch": 0.54, "grad_norm": 13.393025398254395, "learning_rate": 5.7627118644067805e-06, "loss": 1.4995, "step": 21500},
+    {"epoch": 0.54, "eval_loss": 1.3565526008605957, "eval_runtime": 66.1537, "eval_samples_per_second": 15.116, "eval_steps_per_second": 15.116, "step": 21500},
+    {"epoch": 0.54, "grad_norm": 5.675364017486572, "learning_rate": 5.755932203389831e-06, "loss": 1.2256, "step": 21510},
+    {"epoch": 0.54, "grad_norm": 2.069751262664795, "learning_rate": 5.7491525423728825e-06, "loss": 1.2355, "step": 21520},
+    {"epoch": 0.54, "grad_norm": 4.536093711853027, "learning_rate": 5.742372881355933e-06, "loss": 1.1957, "step": 21530},
+    {"epoch": 0.54, "grad_norm": 2.5274765491485596, "learning_rate": 5.735593220338983e-06, "loss": 1.1843, "step": 21540},
+    {"epoch": 0.54, "grad_norm": 4.418458461761475, "learning_rate": 5.728813559322034e-06, "loss": 1.1994, "step": 21550},
+    {"epoch": 0.54, "grad_norm": 13.488496780395508, "learning_rate": 5.722033898305085e-06, "loss": 1.4173, "step": 21560},
+    {"epoch": 0.54, "grad_norm": 5.223592758178711, "learning_rate": 5.715254237288136e-06, "loss": 1.3032, "step": 21570},
+    {"epoch": 0.54, "grad_norm": 5.894464492797852, "learning_rate": 5.708474576271187e-06, "loss": 1.3027, "step": 21580},
+    {"epoch": 0.54, "grad_norm": 6.945793151855469, "learning_rate": 5.701694915254238e-06, "loss": 1.3102, "step": 21590},
+    {"epoch": 0.54, "grad_norm": 7.386875629425049, "learning_rate": 5.694915254237289e-06, "loss": 1.3643, "step": 21600},
+    {"epoch": 0.54, "grad_norm": 3.5999162197113037, "learning_rate": 5.68813559322034e-06, "loss": 1.2456, "step": 21610},
+    {"epoch": 0.54, "grad_norm": 5.8258490562438965, "learning_rate": 5.6813559322033906e-06, "loss": 1.1922, "step": 21620},
+    {"epoch": 0.54, "grad_norm": 10.920169830322266, "learning_rate": 5.674576271186442e-06, "loss": 1.5312, "step": 21630},
+    {"epoch": 0.54, "grad_norm": 3.898834705352783, "learning_rate": 5.667796610169492e-06, "loss": 1.5012, "step": 21640},
+    {"epoch": 0.54, "grad_norm": 6.2130866050720215, "learning_rate": 5.661016949152542e-06, "loss": 1.543, "step": 21650},
+    {"epoch": 0.54, "grad_norm": 3.604144811630249, "learning_rate": 5.654237288135594e-06, "loss": 1.4586, "step": 21660},
+    {"epoch": 0.54, "grad_norm": 4.859696388244629, "learning_rate": 5.647457627118644e-06, "loss": 1.3056, "step": 21670},
+    {"epoch": 0.54, "grad_norm": 2.9596614837646484, "learning_rate": 5.640677966101696e-06, "loss": 1.3888, "step": 21680},
+    {"epoch": 0.54, "grad_norm": 5.489665985107422, "learning_rate": 5.633898305084746e-06, "loss": 1.2491, "step": 21690},
+    {"epoch": 0.54, "grad_norm": 5.618114948272705, "learning_rate": 5.6271186440677975e-06, "loss": 1.2746, "step": 21700},
+    {"epoch": 0.54, "grad_norm": 2.145024061203003, "learning_rate": 5.620338983050848e-06, "loss": 1.4036, "step": 21710},
+    {"epoch": 0.54, "grad_norm": 2.4870400428771973, "learning_rate": 5.6135593220338995e-06, "loss": 1.4355, "step": 21720},
+    {"epoch": 0.54, "grad_norm": 2.576144218444824, "learning_rate": 5.60677966101695e-06, "loss": 1.2988, "step": 21730},
+    {"epoch": 0.54, "grad_norm": 5.971595764160156, "learning_rate": 5.600000000000001e-06, "loss": 1.3201, "step": 21740},
+    {"epoch": 0.54, "grad_norm": 7.581085205078125, "learning_rate": 5.593220338983051e-06, "loss": 1.3358, "step": 21750},
+    {"epoch": 0.54, "grad_norm": 4.148537635803223, "learning_rate": 5.586440677966102e-06, "loss": 1.2464, "step": 21760},
+    {"epoch": 0.54, "grad_norm": 6.613537788391113, "learning_rate": 5.579661016949153e-06, "loss": 1.3156, "step": 21770},
+    {"epoch": 0.54, "grad_norm": 10.526129722595215, "learning_rate": 5.572881355932204e-06, "loss": 1.2483, "step": 21780},
+    {"epoch": 0.54, "grad_norm": 7.221047401428223, "learning_rate": 5.566101694915255e-06, "loss": 1.5878, "step": 21790},
+    {"epoch": 0.55, "grad_norm": 6.365529537200928, "learning_rate": 5.559322033898306e-06, "loss": 1.4117, "step": 21800},
+    {"epoch": 0.55, "grad_norm": 3.8305916786193848, "learning_rate": 5.552542372881356e-06, "loss": 1.1894, "step": 21810},
+    {"epoch": 0.55, "grad_norm": 3.672477960586548, "learning_rate": 5.5457627118644076e-06, "loss": 1.2199, "step": 21820},
+    {"epoch": 0.55, "grad_norm": 2.586512565612793, "learning_rate": 5.538983050847458e-06, "loss": 1.392, "step": 21830},
+    {"epoch": 0.55, "grad_norm": 4.2184624671936035, "learning_rate": 5.5322033898305095e-06, "loss": 1.3008, "step": 21840},
+    {"epoch": 0.55, "grad_norm": 7.834671974182129, "learning_rate": 5.525423728813559e-06, "loss": 1.1794, "step": 21850},
+    {"epoch": 0.55, "grad_norm": 3.5877692699432373, "learning_rate": 5.518644067796611e-06, "loss": 1.4339, "step": 21860},
+    {"epoch": 0.55, "grad_norm": 3.0174179077148438, "learning_rate": 5.511864406779661e-06, "loss": 1.2704, "step": 21870},
+    {"epoch": 0.55, "grad_norm": 2.9889588356018066, "learning_rate": 5.505084745762712e-06, "loss": 1.3588, "step": 21880},
+    {"epoch": 0.55, "grad_norm": 10.810959815979004, "learning_rate": 5.498305084745763e-06, "loss": 1.3539, "step": 21890},
+    {"epoch": 0.55, "grad_norm": 5.771850109100342, "learning_rate": 5.491525423728814e-06, "loss": 1.4278, "step": 21900},
+    {"epoch": 0.55, "grad_norm": 4.13969612121582, "learning_rate": 5.484745762711865e-06, "loss": 1.2798, "step": 21910},
+    {"epoch": 0.55, "grad_norm": 15.295929908752441, "learning_rate": 5.477966101694916e-06, "loss": 1.5213, "step": 21920},
+    {"epoch": 0.55, "grad_norm": 6.445948600769043, "learning_rate": 5.471186440677967e-06, "loss": 1.3498, "step": 21930},
+    {"epoch": 0.55, "grad_norm": 9.28097152709961, "learning_rate": 5.464406779661018e-06, "loss": 1.3046, "step": 21940},
+    {"epoch": 0.55, "grad_norm": 6.094447135925293, "learning_rate": 5.457627118644067e-06, "loss": 1.4336, "step": 21950},
+    {"epoch": 0.55, "grad_norm": 9.818504333496094, "learning_rate": 5.450847457627119e-06, "loss": 1.516, "step": 21960},
+    {"epoch": 0.55, "grad_norm": 11.956009864807129, "learning_rate": 5.444067796610169e-06, "loss": 1.3435, "step": 21970},
+    {"epoch": 0.55, "grad_norm": 7.544681072235107, "learning_rate": 5.437288135593221e-06, "loss": 1.4625, "step": 21980},
+    {"epoch": 0.55, "grad_norm": 3.394897222518921, "learning_rate": 5.430508474576271e-06, "loss": 1.4565, "step": 21990},
+    {"epoch": 0.55, "grad_norm": 5.722468852996826, "learning_rate": 5.423728813559323e-06, "loss": 1.2133, "step": 22000},
+    {"epoch": 0.55, "eval_loss": 1.3403723239898682, "eval_runtime": 66.1471, "eval_samples_per_second": 15.118, "eval_steps_per_second": 15.118, "step": 22000},
+    {"epoch": 0.55, "grad_norm": 7.046730995178223, "learning_rate": 5.416949152542373e-06, "loss": 1.1353, "step": 22010},
+    {"epoch": 0.55, "grad_norm": 7.013365268707275, "learning_rate": 5.4101694915254246e-06, "loss": 1.4333, "step": 22020},
+    {"epoch": 0.55, "grad_norm": 3.9469892978668213, "learning_rate": 5.403389830508475e-06, "loss": 1.3601, "step": 22030},
+    {"epoch": 0.55, "grad_norm": 3.740983486175537, "learning_rate": 5.3966101694915265e-06, "loss": 1.0232, "step": 22040},
+    {"epoch": 0.55, "grad_norm": 4.604060649871826, "learning_rate": 5.389830508474577e-06, "loss": 1.4011, "step": 22050},
+    {"epoch": 0.55, "grad_norm": 0.5601249933242798, "learning_rate": 5.383050847457627e-06, "loss": 1.2068, "step": 22060},
+    {"epoch": 0.55, "grad_norm": 2.7781484127044678, "learning_rate": 5.376271186440678e-06, "loss": 1.3976, "step": 22070},
+    {"epoch": 0.55, "grad_norm": 10.417901992797852, "learning_rate": 5.369491525423729e-06, "loss": 1.2444, "step": 22080},
+    {"epoch": 0.55, "grad_norm": 8.263280868530273, "learning_rate": 5.36271186440678e-06, "loss": 1.0937, "step": 22090},
+    {"epoch": 0.55, "grad_norm": 6.128343105316162, "learning_rate": 5.355932203389831e-06, "loss": 1.2891, "step": 22100},
+    {"epoch": 0.55, "grad_norm": 4.099800109863281, "learning_rate": 5.349152542372882e-06, "loss": 1.117, "step": 22110},
+    {"epoch": 0.55, "grad_norm": 7.756937026977539, "learning_rate": 5.342372881355933e-06, "loss": 1.3112, "step": 22120},
+    {"epoch": 0.55, "grad_norm": 5.372160911560059, "learning_rate": 5.335593220338984e-06, "loss": 1.5276, "step": 22130},
+    {"epoch": 0.55, "grad_norm": 5.017634391784668, "learning_rate": 5.328813559322035e-06, "loss": 1.2825, "step": 22140},
+    {"epoch": 0.55, "grad_norm": 5.629271030426025, "learning_rate": 5.322033898305086e-06, "loss": 1.2428, "step": 22150},
+    {"epoch": 0.55, "grad_norm": 6.938544273376465, "learning_rate": 5.315254237288136e-06, "loss": 1.4563, "step": 22160},
+    {"epoch": 0.55, "grad_norm": 13.804441452026367, "learning_rate": 5.308474576271186e-06, "loss": 1.332, "step": 22170},
+    {"epoch": 0.55, "grad_norm": 10.347596168518066, "learning_rate": 5.301694915254238e-06, "loss": 1.3879, "step": 22180},
+    {"epoch": 0.55, "grad_norm": 2.613632917404175, "learning_rate": 5.294915254237288e-06, "loss": 1.3298, "step": 22190},
+    {"epoch": 0.56, "grad_norm": 14.637787818908691, "learning_rate": 5.28813559322034e-06, "loss": 1.3301, "step": 22200},
+    {"epoch": 0.56, "grad_norm": 2.5796003341674805, "learning_rate": 5.28135593220339e-06, "loss": 1.41, "step": 22210},
+    {"epoch": 0.56, "grad_norm": 5.326439380645752, "learning_rate": 5.2745762711864416e-06, "loss": 1.2483, "step": 22220},
+    {"epoch": 0.56, "grad_norm": 8.928110122680664, "learning_rate": 5.267796610169492e-06, "loss": 1.2566, "step": 22230},
+    {"epoch": 0.56, "grad_norm": 9.285192489624023, "learning_rate": 5.2610169491525435e-06, "loss": 1.2982, "step": 22240},
+    {"epoch": 0.56, "grad_norm": 2.101649045944214, "learning_rate": 5.254237288135594e-06, "loss": 1.3181, "step": 22250},
+    {"epoch": 0.56, "grad_norm": 5.57994270324707, "learning_rate": 5.247457627118645e-06, "loss": 1.3745, "step": 22260},
+    {"epoch": 0.56, "grad_norm": 12.0460844039917, "learning_rate": 5.240677966101695e-06, "loss": 1.202, "step": 22270},
+    {"epoch": 0.56, "grad_norm": 5.367649078369141, "learning_rate": 5.233898305084746e-06, "loss": 1.2969, "step": 22280},
+    {"epoch": 0.56, "grad_norm": 4.784932613372803, "learning_rate": 5.227118644067797e-06, "loss": 1.363, "step": 22290},
+    {"epoch": 0.56, "grad_norm": 8.103744506835938, "learning_rate": 5.220338983050848e-06, "loss": 1.4144, "step": 22300},
+    {"epoch": 0.56, "grad_norm": 5.561450958251953, "learning_rate": 5.213559322033899e-06, "loss": 1.3018, "step": 22310},
+    {"epoch": 0.56, "grad_norm": 8.294450759887695, "learning_rate": 5.20677966101695e-06, "loss": 1.3509, "step": 22320},
+    {"epoch": 0.56, "grad_norm": 8.498016357421875, "learning_rate": 5.2e-06, "loss": 1.4932, "step": 22330},
+    {"epoch": 0.56, "grad_norm": 3.3640925884246826, "learning_rate": 5.193220338983052e-06, "loss": 1.4413, "step": 22340},
+    {"epoch": 0.56, "grad_norm": 3.236924886703491, "learning_rate": 5.186440677966102e-06, "loss": 1.2535, "step": 22350},
+    {"epoch": 0.56, "grad_norm": 6.234226703643799, "learning_rate": 5.1796610169491535e-06, "loss": 1.1816, "step": 22360},
+    {"epoch": 0.56, "grad_norm": 6.9503703117370605, "learning_rate": 5.172881355932203e-06, "loss": 1.2022, "step": 22370},
+    {"epoch": 0.56, "grad_norm": 11.745012283325195, "learning_rate": 5.166101694915255e-06, "loss": 1.543, "step": 22380},
+    {"epoch": 0.56, "grad_norm": 8.977341651916504, "learning_rate": 5.159322033898305e-06, "loss": 1.4185, "step": 22390},
+    {"epoch": 0.56, "grad_norm": 2.515448570251465, "learning_rate": 5.152542372881356e-06, "loss": 1.2665, "step": 22400},
+    {"epoch": 0.56, "grad_norm": 9.47354507446289, "learning_rate": 5.145762711864407e-06, "loss": 1.3086, "step": 22410},
+    {"epoch": 0.56, "grad_norm": 6.550108432769775, "learning_rate": 5.138983050847458e-06, "loss": 1.3129, "step": 22420},
+    {"epoch": 0.56, "grad_norm": 3.860886335372925, "learning_rate": 5.132203389830509e-06, "loss": 1.4046, "step": 22430},
+    {"epoch": 0.56, "grad_norm": 8.062068939208984, "learning_rate": 5.12542372881356e-06, "loss": 1.207, "step": 22440},
+    {"epoch": 0.56, "grad_norm": 5.823556423187256, "learning_rate": 5.118644067796611e-06, "loss": 1.2547, "step": 22450},
+    {"epoch": 0.56, "grad_norm": 7.168029308319092, "learning_rate": 5.111864406779662e-06, "loss": 1.2772, "step": 22460},
+    {"epoch": 0.56, "grad_norm": 9.224081039428711, "learning_rate": 5.105084745762711e-06, "loss": 1.3066, "step": 22470},
+    {"epoch": 0.56, "grad_norm": 3.878537178039551, "learning_rate": 5.098305084745763e-06, "loss": 1.2413, "step": 22480},
+    {"epoch": 0.56, "grad_norm": 5.821982383728027, "learning_rate": 5.091525423728813e-06, "loss": 1.3609, "step": 22490},
+    {"epoch": 0.56, "grad_norm": 9.057456970214844, "learning_rate": 5.084745762711865e-06, "loss": 1.2255, "step": 22500},
+    {"epoch": 0.56, "eval_loss": 1.3439137935638428, "eval_runtime": 66.2088, "eval_samples_per_second": 15.104, "eval_steps_per_second": 15.104, "step": 22500
     }
   ],
   "logging_steps": 10,
@@ -14334,7 +16124,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 2500,
-  "total_flos": 3.
+  "total_flos": 3.6229783486464e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
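Nearly all of the +1,795 lines above are the Trainer appending to log_history: one training record every logging_steps=10 optimizer steps, plus an eval record every 500 steps. The loss curves are easy to recover from the uploaded file; a small sketch (the path assumes a local download of this repo):

```python
import json

with open("checkpoint-22500/trainer_state.json") as f:
    state = json.load(f)

# Training-loss and eval-loss series from the standard log_history field.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(eval_curve[-1])                  # (22500, 1.3439137935638428)
print(state["best_model_checkpoint"])  # runs/deepseek_lora_20240422-165831/checkpoint-22500
```

As the header diff shows, the eval loss at step 22500 (1.3439) is the new best_metric, so this checkpoint replaces checkpoint-20000's run directory entry as best_model_checkpoint.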