pszemraj committed on
Commit
1cdbc3d
1 Parent(s): 90c7391

Upload folder using huggingface_hub

Browse files
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mode: ft
2
+ device: gpu
3
+ precision: bf16
4
+ eval_only: false
5
+ predict_only: false
6
+ seed: 23
7
+ model:
8
+ klass: hf_t5
9
+ compile: true
10
+ name: google/t5-efficient-mini-nl24
11
+ random_init: false
12
+ checkpoint_path: ''
13
+ data:
14
+ dataset: flan
15
+ max_seq_len: 1024
16
+ max_target_len: 128
17
+ num_workers: 8
18
+ n_eval_examples: 500
19
+ exec_file_path: .
20
+ data_dir: .
21
+ task_dir: .
22
+ optim:
23
+ name: adamw
24
+ base_lr: 5.0e-05
25
+ final_cosine: 1.0e-06
26
+ lr_scheduler: constant
27
+ epochs: -1
28
+ batch_size: 64
29
+ grad_acc: 8
30
+ weight_decay: 0.001
31
+ grad_clip: 1.0
32
+ total_steps: 25000
33
+ warmup_steps: 2000
34
+ eval:
35
+ steps: 500
36
+ every_steps: 4000
37
+ checkpoint:
38
+ every_steps: 5000
39
+ logging:
40
+ neptune: false
41
+ neptune_creds:
42
+ project: null
43
+ api_token: null
44
+ tags: ''
45
+ every_steps: 50
46
+ grad_l2: true
47
+ weights_l2: true
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: main
117
+ chdir: true
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: default
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /workspace/nanoT5
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /workspace/nanoT5/nanoT5/configs
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /workspace/nanoT5/logs/2024-08-07/04-07-32-
144
+ choices:
145
+ local_env: default
146
+ hydra/env: default
147
+ hydra/callbacks: null
148
+ hydra/job_logging: default
149
+ hydra/hydra_logging: default
150
+ hydra/hydra_help: default
151
+ hydra/help: default
152
+ hydra/sweeper: basic
153
+ hydra/launcher: basic
154
+ hydra/output: default
155
+ verbose: false
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
checkpoints/checkpoint-ft-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:192ce7574b7f1fee56d28b4233c1419492bd07c85b0e294f522c95bf9d1a4cc0
3
+ size 502583392
checkpoints/checkpoint-ft-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cdbcf0f8b24de4549975f78565d5c3dbc12269874ba5a2e8de35a84257f5ac0
3
+ size 14344
checkpoints/checkpoint-ft-15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02d2317f4e6810fdf537af8a693a836cc125b2d6dd6492c8a46a2268a69a6c79
3
+ size 502583392
checkpoints/checkpoint-ft-15000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfa4f09fdb0f32aa3d01eac304f065f247e976e784f9bc6768ab19b229734962
3
+ size 14344
checkpoints/checkpoint-ft-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6be6570c10abf613503afad9fd26152a9221172840b77b47d9475abd610487b6
3
+ size 502583392
checkpoints/checkpoint-ft-20000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:504ad73e70abf3cd83e3f02d6c2a0e1ef5491c7d1cf9af3ed2cfbeb246b09102
3
+ size 14344
checkpoints/checkpoint-ft-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6501a8766a1f4272a59f5175863ae3388feb447e634e37534fa861b9ef6fb2c
3
+ size 502583392
checkpoints/checkpoint-ft-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2075711e135da3b9d8d419bbe69236c8015d5e6b2ea68c9ffe2cec404ed7d1eb
3
+ size 14344
checkpoints/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/t5-efficient-mini-nl24",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 1536,
8
+ "d_kv": 64,
9
+ "d_model": 384,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "relu",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "relu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": false,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 8,
23
+ "num_layers": 24,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.44.0",
29
+ "use_cache": true,
30
+ "vocab_size": 32128
31
+ }
checkpoints/main.log ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2024-08-07 04:07:32,726][Main][INFO] - Distributed environment: NO
2
+ Num processes: 1
3
+ Process index: 0
4
+ Local process index: 0
5
+ Device: cuda
6
+
7
+ Mixed precision type: bf16
8
+
9
+ [2024-08-07 04:07:32,726][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-08-07/04-07-32-
10
+ [2024-08-07 04:12:18,030][Main][INFO] - [train] Step 50 out of 25000 | Loss --> 11.343 | Grad_l2 --> 43.017 | Weights_l2 --> 47200.114 | Lr --> 0.000 | Seconds_per_step --> 5.126 |
11
+ [2024-08-07 04:13:43,886][Main][INFO] - [train] Step 100 out of 25000 | Loss --> 9.276 | Grad_l2 --> 45.242 | Weights_l2 --> 47199.978 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
12
+ [2024-08-07 04:15:10,676][Main][INFO] - [train] Step 150 out of 25000 | Loss --> 8.145 | Grad_l2 --> 47.689 | Weights_l2 --> 47199.838 | Lr --> 0.000 | Seconds_per_step --> 1.736 |
13
+ [2024-08-07 04:16:36,244][Main][INFO] - [train] Step 200 out of 25000 | Loss --> 7.244 | Grad_l2 --> 49.128 | Weights_l2 --> 47199.701 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
14
+ [2024-08-07 04:18:03,337][Main][INFO] - [train] Step 250 out of 25000 | Loss --> 6.159 | Grad_l2 --> 47.661 | Weights_l2 --> 47199.569 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
15
+ [2024-08-07 04:19:30,382][Main][INFO] - [train] Step 300 out of 25000 | Loss --> 4.863 | Grad_l2 --> 42.692 | Weights_l2 --> 47199.433 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
16
+ [2024-08-07 04:20:56,228][Main][INFO] - [train] Step 350 out of 25000 | Loss --> 3.515 | Grad_l2 --> 32.667 | Weights_l2 --> 47199.297 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
17
+ [2024-08-07 04:22:22,722][Main][INFO] - [train] Step 400 out of 25000 | Loss --> 2.114 | Grad_l2 --> 15.583 | Weights_l2 --> 47199.164 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
18
+ [2024-08-07 04:23:47,989][Main][INFO] - [train] Step 450 out of 25000 | Loss --> 0.978 | Grad_l2 --> 1.009 | Weights_l2 --> 47199.033 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
19
+ [2024-08-07 04:25:15,079][Main][INFO] - [train] Step 500 out of 25000 | Loss --> 0.806 | Grad_l2 --> 0.498 | Weights_l2 --> 47198.897 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
20
+ [2024-08-07 04:26:40,081][Main][INFO] - [train] Step 550 out of 25000 | Loss --> 0.808 | Grad_l2 --> 0.696 | Weights_l2 --> 47198.757 | Lr --> 0.000 | Seconds_per_step --> 1.700 |
21
+ [2024-08-07 04:28:06,532][Main][INFO] - [train] Step 600 out of 25000 | Loss --> 0.785 | Grad_l2 --> 0.455 | Weights_l2 --> 47198.620 | Lr --> 0.000 | Seconds_per_step --> 1.729 |
22
+ [2024-08-07 04:29:33,537][Main][INFO] - [train] Step 650 out of 25000 | Loss --> 0.787 | Grad_l2 --> 0.733 | Weights_l2 --> 47198.484 | Lr --> 0.000 | Seconds_per_step --> 1.740 |
23
+ [2024-08-07 04:30:59,366][Main][INFO] - [train] Step 700 out of 25000 | Loss --> 0.735 | Grad_l2 --> 0.463 | Weights_l2 --> 47198.347 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
24
+ [2024-08-07 04:32:26,571][Main][INFO] - [train] Step 750 out of 25000 | Loss --> 0.714 | Grad_l2 --> 0.298 | Weights_l2 --> 47198.207 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
25
+ [2024-08-07 04:33:51,938][Main][INFO] - [train] Step 800 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.358 | Weights_l2 --> 47198.070 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
26
+ [2024-08-07 04:35:18,887][Main][INFO] - [train] Step 850 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.314 | Weights_l2 --> 47197.933 | Lr --> 0.000 | Seconds_per_step --> 1.739 |
27
+ [2024-08-07 04:36:44,507][Main][INFO] - [train] Step 900 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.621 | Weights_l2 --> 47197.796 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
28
+ [2024-08-07 04:38:11,744][Main][INFO] - [train] Step 950 out of 25000 | Loss --> 0.681 | Grad_l2 --> 0.380 | Weights_l2 --> 47197.660 | Lr --> 0.000 | Seconds_per_step --> 1.745 |
29
+ [2024-08-07 04:39:37,561][Main][INFO] - [train] Step 1000 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.373 | Weights_l2 --> 47197.523 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
30
+ [2024-08-07 04:41:04,005][Main][INFO] - [train] Step 1050 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.522 | Weights_l2 --> 47197.382 | Lr --> 0.000 | Seconds_per_step --> 1.729 |
31
+ [2024-08-07 04:42:29,034][Main][INFO] - [train] Step 1100 out of 25000 | Loss --> 0.685 | Grad_l2 --> 0.296 | Weights_l2 --> 47197.245 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
32
+ [2024-08-07 04:43:55,529][Main][INFO] - [train] Step 1150 out of 25000 | Loss --> 0.674 | Grad_l2 --> 0.331 | Weights_l2 --> 47197.109 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
33
+ [2024-08-07 04:45:20,774][Main][INFO] - [train] Step 1200 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.590 | Weights_l2 --> 47196.972 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
34
+ [2024-08-07 04:46:47,532][Main][INFO] - [train] Step 1250 out of 25000 | Loss --> 0.665 | Grad_l2 --> 0.349 | Weights_l2 --> 47196.835 | Lr --> 0.000 | Seconds_per_step --> 1.735 |
35
+ [2024-08-07 04:48:14,549][Main][INFO] - [train] Step 1300 out of 25000 | Loss --> 0.663 | Grad_l2 --> 0.495 | Weights_l2 --> 47196.698 | Lr --> 0.000 | Seconds_per_step --> 1.740 |
36
+ [2024-08-07 04:49:40,267][Main][INFO] - [train] Step 1350 out of 25000 | Loss --> 0.646 | Grad_l2 --> 0.261 | Weights_l2 --> 47196.557 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
37
+ [2024-08-07 04:51:07,119][Main][INFO] - [train] Step 1400 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.245 | Weights_l2 --> 47196.420 | Lr --> 0.000 | Seconds_per_step --> 1.737 |
38
+ [2024-08-07 04:52:32,692][Main][INFO] - [train] Step 1450 out of 25000 | Loss --> 0.642 | Grad_l2 --> 0.329 | Weights_l2 --> 47196.283 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
39
+ [2024-08-07 04:53:59,630][Main][INFO] - [train] Step 1500 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.347 | Weights_l2 --> 47196.146 | Lr --> 0.000 | Seconds_per_step --> 1.739 |
40
+ [2024-08-07 04:55:25,305][Main][INFO] - [train] Step 1550 out of 25000 | Loss --> 0.645 | Grad_l2 --> 0.239 | Weights_l2 --> 47196.009 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
41
+ [2024-08-07 04:56:52,437][Main][INFO] - [train] Step 1600 out of 25000 | Loss --> 0.670 | Grad_l2 --> 0.247 | Weights_l2 --> 47195.869 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
42
+ [2024-08-07 04:58:18,243][Main][INFO] - [train] Step 1650 out of 25000 | Loss --> 0.630 | Grad_l2 --> 0.272 | Weights_l2 --> 47195.736 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
43
+ [2024-08-07 04:59:45,425][Main][INFO] - [train] Step 1700 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.281 | Weights_l2 --> 47195.599 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
44
+ [2024-08-07 05:01:11,208][Main][INFO] - [train] Step 1750 out of 25000 | Loss --> 0.640 | Grad_l2 --> 0.281 | Weights_l2 --> 47195.462 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
45
+ [2024-08-07 05:02:38,396][Main][INFO] - [train] Step 1800 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.226 | Weights_l2 --> 47195.321 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
46
+ [2024-08-07 05:04:04,169][Main][INFO] - [train] Step 1850 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.526 | Weights_l2 --> 47195.184 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
47
+ [2024-08-07 05:05:31,337][Main][INFO] - [train] Step 1900 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.213 | Weights_l2 --> 47195.047 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
48
+ [2024-08-07 05:06:58,450][Main][INFO] - [train] Step 1950 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.337 | Weights_l2 --> 47194.910 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
49
+ [2024-08-07 05:08:24,208][Main][INFO] - [train] Step 2000 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.220 | Weights_l2 --> 47194.773 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
50
+ [2024-08-07 05:09:51,402][Main][INFO] - [train] Step 2050 out of 25000 | Loss --> 0.615 | Grad_l2 --> 0.250 | Weights_l2 --> 47194.632 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
51
+ [2024-08-07 05:11:17,134][Main][INFO] - [train] Step 2100 out of 25000 | Loss --> 0.614 | Grad_l2 --> 0.443 | Weights_l2 --> 47194.495 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
52
+ [2024-08-07 05:12:44,287][Main][INFO] - [train] Step 2150 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.269 | Weights_l2 --> 47194.358 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
53
+ [2024-08-07 05:14:09,998][Main][INFO] - [train] Step 2200 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.349 | Weights_l2 --> 47194.221 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
54
+ [2024-08-07 05:15:37,150][Main][INFO] - [train] Step 2250 out of 25000 | Loss --> 0.602 | Grad_l2 --> 0.210 | Weights_l2 --> 47194.084 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
55
+ [2024-08-07 05:17:02,865][Main][INFO] - [train] Step 2300 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.206 | Weights_l2 --> 47193.947 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
56
+ [2024-08-07 05:18:29,986][Main][INFO] - [train] Step 2350 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.299 | Weights_l2 --> 47193.810 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
57
+ [2024-08-07 05:19:55,710][Main][INFO] - [train] Step 2400 out of 25000 | Loss --> 0.634 | Grad_l2 --> 0.229 | Weights_l2 --> 47193.669 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
58
+ [2024-08-07 05:21:22,808][Main][INFO] - [train] Step 2450 out of 25000 | Loss --> 0.634 | Grad_l2 --> 0.187 | Weights_l2 --> 47193.532 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
59
+ [2024-08-07 05:22:48,543][Main][INFO] - [train] Step 2500 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.235 | Weights_l2 --> 47193.395 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
60
+ [2024-08-07 05:24:15,698][Main][INFO] - [train] Step 2550 out of 25000 | Loss --> 0.621 | Grad_l2 --> 0.222 | Weights_l2 --> 47193.258 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
61
+ [2024-08-07 05:25:41,174][Main][INFO] - [train] Step 2600 out of 25000 | Loss --> 0.598 | Grad_l2 --> 0.200 | Weights_l2 --> 47193.121 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
62
+ [2024-08-07 05:27:08,312][Main][INFO] - [train] Step 2650 out of 25000 | Loss --> 0.605 | Grad_l2 --> 0.199 | Weights_l2 --> 47192.984 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
63
+ [2024-08-07 05:28:35,494][Main][INFO] - [train] Step 2700 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.190 | Weights_l2 --> 47192.847 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
64
+ [2024-08-07 05:30:01,277][Main][INFO] - [train] Step 2750 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.208 | Weights_l2 --> 47192.710 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
65
+ [2024-08-07 05:31:28,184][Main][INFO] - [train] Step 2800 out of 25000 | Loss --> 0.625 | Grad_l2 --> 0.262 | Weights_l2 --> 47192.573 | Lr --> 0.000 | Seconds_per_step --> 1.738 |
66
+ [2024-08-07 05:32:53,230][Main][INFO] - [train] Step 2850 out of 25000 | Loss --> 0.609 | Grad_l2 --> 0.292 | Weights_l2 --> 47192.432 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
67
+ [2024-08-07 05:34:19,792][Main][INFO] - [train] Step 2900 out of 25000 | Loss --> 0.597 | Grad_l2 --> 0.184 | Weights_l2 --> 47192.295 | Lr --> 0.000 | Seconds_per_step --> 1.731 |
68
+ [2024-08-07 05:35:44,824][Main][INFO] - [train] Step 2950 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.224 | Weights_l2 --> 47192.158 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
69
+ [2024-08-07 05:37:11,229][Main][INFO] - [train] Step 3000 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.342 | Weights_l2 --> 47192.021 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
70
+ [2024-08-07 05:38:36,575][Main][INFO] - [train] Step 3050 out of 25000 | Loss --> 0.599 | Grad_l2 --> 0.171 | Weights_l2 --> 47191.884 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
71
+ [2024-08-07 05:40:03,729][Main][INFO] - [train] Step 3100 out of 25000 | Loss --> 0.592 | Grad_l2 --> 0.223 | Weights_l2 --> 47191.743 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
72
+ [2024-08-07 05:41:29,379][Main][INFO] - [train] Step 3150 out of 25000 | Loss --> 0.601 | Grad_l2 --> 0.294 | Weights_l2 --> 47191.610 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
73
+ [2024-08-07 05:42:56,819][Main][INFO] - [train] Step 3200 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.207 | Weights_l2 --> 47191.469 | Lr --> 0.000 | Seconds_per_step --> 1.749 |
74
+ [2024-08-07 05:44:22,604][Main][INFO] - [train] Step 3250 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.257 | Weights_l2 --> 47191.332 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
75
+ [2024-08-07 05:45:49,918][Main][INFO] - [train] Step 3300 out of 25000 | Loss --> 0.612 | Grad_l2 --> 0.377 | Weights_l2 --> 47191.199 | Lr --> 0.000 | Seconds_per_step --> 1.746 |
76
+ [2024-08-07 05:47:15,156][Main][INFO] - [train] Step 3350 out of 25000 | Loss --> 0.612 | Grad_l2 --> 0.250 | Weights_l2 --> 47191.062 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
77
+ [2024-08-07 05:48:42,282][Main][INFO] - [train] Step 3400 out of 25000 | Loss --> 0.591 | Grad_l2 --> 0.204 | Weights_l2 --> 47190.925 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
78
+ [2024-08-07 05:50:08,971][Main][INFO] - [train] Step 3450 out of 25000 | Loss --> 0.592 | Grad_l2 --> 0.205 | Weights_l2 --> 47190.784 | Lr --> 0.000 | Seconds_per_step --> 1.734 |
79
+ [2024-08-07 05:51:34,049][Main][INFO] - [train] Step 3500 out of 25000 | Loss --> 0.604 | Grad_l2 --> 0.239 | Weights_l2 --> 47190.647 | Lr --> 0.000 | Seconds_per_step --> 1.702 |
80
+ [2024-08-07 05:53:01,117][Main][INFO] - [train] Step 3550 out of 25000 | Loss --> 0.563 | Grad_l2 --> 0.270 | Weights_l2 --> 47190.510 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
81
+ [2024-08-07 05:54:26,716][Main][INFO] - [train] Step 3600 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.448 | Weights_l2 --> 47190.373 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
82
+ [2024-08-07 05:55:53,130][Main][INFO] - [train] Step 3650 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.234 | Weights_l2 --> 47190.236 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
83
+ [2024-08-07 05:57:18,221][Main][INFO] - [train] Step 3700 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.206 | Weights_l2 --> 47190.099 | Lr --> 0.000 | Seconds_per_step --> 1.702 |
84
+ [2024-08-07 05:58:45,613][Main][INFO] - [train] Step 3750 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.159 | Weights_l2 --> 47189.962 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
85
+ [2024-08-07 06:00:11,446][Main][INFO] - [train] Step 3800 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.199 | Weights_l2 --> 47189.821 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
86
+ [2024-08-07 06:01:38,826][Main][INFO] - [train] Step 3850 out of 25000 | Loss --> 0.581 | Grad_l2 --> 0.248 | Weights_l2 --> 47189.684 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
87
+ [2024-08-07 06:03:05,865][Main][INFO] - [train] Step 3900 out of 25000 | Loss --> 0.597 | Grad_l2 --> 0.195 | Weights_l2 --> 47189.547 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
88
+ [2024-08-07 06:04:31,625][Main][INFO] - [train] Step 3950 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.192 | Weights_l2 --> 47189.410 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
89
+ [2024-08-07 06:05:59,020][Main][INFO] - [train] Step 4000 out of 25000 | Loss --> 0.584 | Grad_l2 --> 0.200 | Weights_l2 --> 47189.273 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
90
+ [2024-08-07 06:09:10,498][Main][INFO] - [eval] Step 4000 out of 25000 | Loss --> 0.941 | Accuracy --> 0.832 | Time --> 191.474 |
91
+ [2024-08-07 06:13:48,778][absl][INFO] - Using default tokenizer.
92
+ [2024-08-07 06:13:49,269][Main][INFO] - [test] Step 4000 out of 25000 | Rougel --> 15.204 | Time --> 278.771 |
93
+ [2024-08-07 06:15:14,427][Main][INFO] - [train] Step 4050 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.288 | Weights_l2 --> 47189.135 | Lr --> 0.000 | Seconds_per_step --> 1.703 |
94
+ [2024-08-07 06:16:39,500][Main][INFO] - [train] Step 4100 out of 25000 | Loss --> 0.567 | Grad_l2 --> 0.205 | Weights_l2 --> 47188.998 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
95
+ [2024-08-07 06:18:07,374][Main][INFO] - [train] Step 4150 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.203 | Weights_l2 --> 47188.861 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
96
+ [2024-08-07 06:19:32,587][Main][INFO] - [train] Step 4200 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.269 | Weights_l2 --> 47188.724 | Lr --> 0.000 | Seconds_per_step --> 1.704 |
97
+ [2024-08-07 06:20:58,195][Main][INFO] - [train] Step 4250 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.279 | Weights_l2 --> 47188.587 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
98
+ [2024-08-07 06:22:26,587][Main][INFO] - [train] Step 4300 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.256 | Weights_l2 --> 47188.446 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
99
+ [2024-08-07 06:23:52,408][Main][INFO] - [train] Step 4350 out of 25000 | Loss --> 0.567 | Grad_l2 --> 0.185 | Weights_l2 --> 47188.309 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
100
+ [2024-08-07 06:25:18,175][Main][INFO] - [train] Step 4400 out of 25000 | Loss --> 0.587 | Grad_l2 --> 0.188 | Weights_l2 --> 47188.172 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
101
+ [2024-08-07 06:26:46,749][Main][INFO] - [train] Step 4450 out of 25000 | Loss --> 0.580 | Grad_l2 --> 0.164 | Weights_l2 --> 47188.035 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
102
+ [2024-08-07 06:28:12,711][Main][INFO] - [train] Step 4500 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.221 | Weights_l2 --> 47187.898 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
103
+ [2024-08-07 06:29:38,541][Main][INFO] - [train] Step 4550 out of 25000 | Loss --> 0.586 | Grad_l2 --> 0.191 | Weights_l2 --> 47187.761 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
104
+ [2024-08-07 06:31:06,867][Main][INFO] - [train] Step 4600 out of 25000 | Loss --> 0.574 | Grad_l2 --> 0.215 | Weights_l2 --> 47187.628 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
105
+ [2024-08-07 06:32:32,538][Main][INFO] - [train] Step 4650 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.220 | Weights_l2 --> 47187.487 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
106
+ [2024-08-07 06:33:58,044][Main][INFO] - [train] Step 4700 out of 25000 | Loss --> 0.571 | Grad_l2 --> 0.200 | Weights_l2 --> 47187.346 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
107
+ [2024-08-07 06:35:26,162][Main][INFO] - [train] Step 4750 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.273 | Weights_l2 --> 47187.209 | Lr --> 0.000 | Seconds_per_step --> 1.762 |
108
+ [2024-08-07 06:36:51,666][Main][INFO] - [train] Step 4800 out of 25000 | Loss --> 0.560 | Grad_l2 --> 0.188 | Weights_l2 --> 47187.071 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
109
+ [2024-08-07 06:38:17,107][Main][INFO] - [train] Step 4850 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.293 | Weights_l2 --> 47186.938 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
110
+ [2024-08-07 06:39:44,965][Main][INFO] - [train] Step 4900 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.177 | Weights_l2 --> 47186.797 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
111
+ [2024-08-07 06:41:10,239][Main][INFO] - [train] Step 4950 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.170 | Weights_l2 --> 47186.660 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
112
+ [2024-08-07 06:42:35,764][Main][INFO] - [train] Step 5000 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.219 | Weights_l2 --> 47186.523 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
113
+ [2024-08-07 06:42:35,765][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-5000
114
+ [2024-08-07 06:42:35,771][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
115
+ [2024-08-07 06:42:36,580][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-5000/model.safetensors
116
+ [2024-08-07 06:42:37,727][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-5000/optimizer.bin
117
+ [2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-5000/scheduler.bin
118
+ [2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-5000/sampler.bin
119
+ [2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-5000/sampler_1.bin
120
+ [2024-08-07 06:42:37,729][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-5000/random_states_0.pkl
121
+ [2024-08-07 06:44:06,450][Main][INFO] - [train] Step 5050 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.230 | Weights_l2 --> 47186.386 | Lr --> 0.000 | Seconds_per_step --> 1.814 |
122
+ [2024-08-07 06:45:32,495][Main][INFO] - [train] Step 5100 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.191 | Weights_l2 --> 47186.248 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
123
+ [2024-08-07 06:46:58,589][Main][INFO] - [train] Step 5150 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.170 | Weights_l2 --> 47186.111 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
124
+ [2024-08-07 06:48:27,030][Main][INFO] - [train] Step 5200 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.164 | Weights_l2 --> 47185.974 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
125
+ [2024-08-07 06:49:53,124][Main][INFO] - [train] Step 5250 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.174 | Weights_l2 --> 47185.837 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
126
+ [2024-08-07 06:51:19,188][Main][INFO] - [train] Step 5300 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.179 | Weights_l2 --> 47185.700 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
127
+ [2024-08-07 06:52:47,972][Main][INFO] - [train] Step 5350 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.183 | Weights_l2 --> 47185.558 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
128
+ [2024-08-07 06:54:13,671][Main][INFO] - [train] Step 5400 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.206 | Weights_l2 --> 47185.421 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
129
+ [2024-08-07 06:55:41,483][Main][INFO] - [train] Step 5450 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.205 | Weights_l2 --> 47185.288 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
130
+ [2024-08-07 06:57:06,833][Main][INFO] - [train] Step 5500 out of 25000 | Loss --> 0.533 | Grad_l2 --> 0.157 | Weights_l2 --> 47185.151 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
131
+ [2024-08-07 06:58:32,236][Main][INFO] - [train] Step 5550 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.209 | Weights_l2 --> 47185.009 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
132
+ [2024-08-07 07:00:00,311][Main][INFO] - [train] Step 5600 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.202 | Weights_l2 --> 47184.872 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
133
+ [2024-08-07 07:01:26,167][Main][INFO] - [train] Step 5650 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.157 | Weights_l2 --> 47184.735 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
134
+ [2024-08-07 07:02:51,945][Main][INFO] - [train] Step 5700 out of 25000 | Loss --> 0.559 | Grad_l2 --> 0.160 | Weights_l2 --> 47184.598 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
135
+ [2024-08-07 07:04:20,503][Main][INFO] - [train] Step 5750 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.169 | Weights_l2 --> 47184.460 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
136
+ [2024-08-07 07:05:45,999][Main][INFO] - [train] Step 5800 out of 25000 | Loss --> 0.548 | Grad_l2 --> 0.176 | Weights_l2 --> 47184.323 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
137
+ [2024-08-07 07:07:11,484][Main][INFO] - [train] Step 5850 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.161 | Weights_l2 --> 47184.186 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
138
+ [2024-08-07 07:08:39,677][Main][INFO] - [train] Step 5900 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.157 | Weights_l2 --> 47184.049 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
139
+ [2024-08-07 07:10:05,179][Main][INFO] - [train] Step 5950 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.147 | Weights_l2 --> 47183.911 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
140
+ [2024-08-07 07:11:30,734][Main][INFO] - [train] Step 6000 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.202 | Weights_l2 --> 47183.774 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
141
+ [2024-08-07 07:12:59,400][Main][INFO] - [train] Step 6050 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.148 | Weights_l2 --> 47183.633 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
142
+ [2024-08-07 07:14:25,522][Main][INFO] - [train] Step 6100 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.178 | Weights_l2 --> 47183.496 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
143
+ [2024-08-07 07:15:51,608][Main][INFO] - [train] Step 6150 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.176 | Weights_l2 --> 47183.362 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
144
+ [2024-08-07 07:17:20,427][Main][INFO] - [train] Step 6200 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.166 | Weights_l2 --> 47183.225 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
145
+ [2024-08-07 07:18:46,385][Main][INFO] - [train] Step 6250 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.181 | Weights_l2 --> 47183.084 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
146
+ [2024-08-07 07:20:11,716][Main][INFO] - [train] Step 6300 out of 25000 | Loss --> 0.576 | Grad_l2 --> 0.153 | Weights_l2 --> 47182.951 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
147
+ [2024-08-07 07:21:39,494][Main][INFO] - [train] Step 6350 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.193 | Weights_l2 --> 47182.813 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
148
+ [2024-08-07 07:23:04,833][Main][INFO] - [train] Step 6400 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.158 | Weights_l2 --> 47182.676 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
149
+ [2024-08-07 07:24:30,152][Main][INFO] - [train] Step 6450 out of 25000 | Loss --> 0.544 | Grad_l2 --> 0.160 | Weights_l2 --> 47182.539 | Lr --> 0.000 | Seconds_per_step --> 1.706 |
150
+ [2024-08-07 07:25:57,971][Main][INFO] - [train] Step 6500 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.179 | Weights_l2 --> 47182.398 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
151
+ [2024-08-07 07:27:23,312][Main][INFO] - [train] Step 6550 out of 25000 | Loss --> 0.557 | Grad_l2 --> 0.217 | Weights_l2 --> 47182.260 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
152
+ [2024-08-07 07:28:48,618][Main][INFO] - [train] Step 6600 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.152 | Weights_l2 --> 47182.127 | Lr --> 0.000 | Seconds_per_step --> 1.706 |
153
+ [2024-08-07 07:30:16,544][Main][INFO] - [train] Step 6650 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.150 | Weights_l2 --> 47181.986 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
154
+ [2024-08-07 07:31:42,081][Main][INFO] - [train] Step 6700 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.186 | Weights_l2 --> 47181.848 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
155
+ [2024-08-07 07:33:07,474][Main][INFO] - [train] Step 6750 out of 25000 | Loss --> 0.582 | Grad_l2 --> 0.156 | Weights_l2 --> 47181.711 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
156
+ [2024-08-07 07:34:36,072][Main][INFO] - [train] Step 6800 out of 25000 | Loss --> 0.541 | Grad_l2 --> 0.188 | Weights_l2 --> 47181.574 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
157
+ [2024-08-07 07:36:01,451][Main][INFO] - [train] Step 6850 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.205 | Weights_l2 --> 47181.437 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
158
+ [2024-08-07 07:37:26,901][Main][INFO] - [train] Step 6900 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.152 | Weights_l2 --> 47181.299 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
159
+ [2024-08-07 07:38:55,145][Main][INFO] - [train] Step 6950 out of 25000 | Loss --> 0.587 | Grad_l2 --> 0.193 | Weights_l2 --> 47181.162 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
160
+ [2024-08-07 07:40:21,155][Main][INFO] - [train] Step 7000 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.172 | Weights_l2 --> 47181.025 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
161
+ [2024-08-07 07:41:48,993][Main][INFO] - [train] Step 7050 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.168 | Weights_l2 --> 47180.887 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
162
+ [2024-08-07 07:43:14,381][Main][INFO] - [train] Step 7100 out of 25000 | Loss --> 0.550 | Grad_l2 --> 0.143 | Weights_l2 --> 47180.750 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
163
+ [2024-08-07 07:44:40,205][Main][INFO] - [train] Step 7150 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.148 | Weights_l2 --> 47180.609 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
164
+ [2024-08-07 07:46:09,019][Main][INFO] - [train] Step 7200 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.179 | Weights_l2 --> 47180.472 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
165
+ [2024-08-07 07:47:35,121][Main][INFO] - [train] Step 7250 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.170 | Weights_l2 --> 47180.338 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
166
+ [2024-08-07 07:49:00,910][Main][INFO] - [train] Step 7300 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.161 | Weights_l2 --> 47180.201 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
167
+ [2024-08-07 07:50:29,152][Main][INFO] - [train] Step 7350 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.159 | Weights_l2 --> 47180.064 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
168
+ [2024-08-07 07:51:54,570][Main][INFO] - [train] Step 7400 out of 25000 | Loss --> 0.535 | Grad_l2 --> 0.143 | Weights_l2 --> 47179.926 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
169
+ [2024-08-07 07:53:19,946][Main][INFO] - [train] Step 7450 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.153 | Weights_l2 --> 47179.789 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
170
+ [2024-08-07 07:54:47,864][Main][INFO] - [train] Step 7500 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.161 | Weights_l2 --> 47179.652 | Lr --> 0.000 | Seconds_per_step --> 1.758 |
171
+ [2024-08-07 07:56:13,855][Main][INFO] - [train] Step 7550 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.140 | Weights_l2 --> 47179.511 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
172
+ [2024-08-07 07:57:39,992][Main][INFO] - [train] Step 7600 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.201 | Weights_l2 --> 47179.377 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
173
+ [2024-08-07 07:59:08,451][Main][INFO] - [train] Step 7650 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.179 | Weights_l2 --> 47179.236 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
174
+ [2024-08-07 08:00:34,328][Main][INFO] - [train] Step 7700 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.158 | Weights_l2 --> 47179.103 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
175
+ [2024-08-07 08:01:59,720][Main][INFO] - [train] Step 7750 out of 25000 | Loss --> 0.579 | Grad_l2 --> 0.156 | Weights_l2 --> 47178.965 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
176
+ [2024-08-07 08:03:27,727][Main][INFO] - [train] Step 7800 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.143 | Weights_l2 --> 47178.828 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
177
+ [2024-08-07 08:04:53,570][Main][INFO] - [train] Step 7850 out of 25000 | Loss --> 0.524 | Grad_l2 --> 0.151 | Weights_l2 --> 47178.687 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
178
+ [2024-08-07 08:06:19,787][Main][INFO] - [train] Step 7900 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.145 | Weights_l2 --> 47178.549 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
179
+ [2024-08-07 08:07:48,354][Main][INFO] - [train] Step 7950 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.212 | Weights_l2 --> 47178.412 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
180
+ [2024-08-07 08:09:13,971][Main][INFO] - [train] Step 8000 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.167 | Weights_l2 --> 47178.275 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
181
+ [2024-08-07 08:09:18,908][Main][INFO] - [eval] Step 8000 out of 25000 | Loss --> 0.880 | Accuracy --> 0.838 | Time --> 4.933 |
182
+ [2024-08-07 08:13:58,225][absl][INFO] - Using default tokenizer.
183
+ [2024-08-07 08:13:58,808][Main][INFO] - [test] Step 8000 out of 25000 | Rougel --> 21.696 | Time --> 279.900 |
184
+ [2024-08-07 08:15:24,876][Main][INFO] - [train] Step 8050 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.208 | Weights_l2 --> 47178.138 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
185
+ [2024-08-07 08:16:53,543][Main][INFO] - [train] Step 8100 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.159 | Weights_l2 --> 47178.000 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
186
+ [2024-08-07 08:18:19,674][Main][INFO] - [train] Step 8150 out of 25000 | Loss --> 0.540 | Grad_l2 --> 0.157 | Weights_l2 --> 47177.867 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
187
+ [2024-08-07 08:19:45,669][Main][INFO] - [train] Step 8200 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.150 | Weights_l2 --> 47177.729 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
188
+ [2024-08-07 08:21:13,985][Main][INFO] - [train] Step 8250 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.141 | Weights_l2 --> 47177.588 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
189
+ [2024-08-07 08:22:39,528][Main][INFO] - [train] Step 8300 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.158 | Weights_l2 --> 47177.451 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
190
+ [2024-08-07 08:24:05,082][Main][INFO] - [train] Step 8350 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.147 | Weights_l2 --> 47177.314 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
191
+ [2024-08-07 08:25:33,450][Main][INFO] - [train] Step 8400 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.141 | Weights_l2 --> 47177.176 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
192
+ [2024-08-07 08:26:59,375][Main][INFO] - [train] Step 8450 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.134 | Weights_l2 --> 47177.039 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
193
+ [2024-08-07 08:28:25,341][Main][INFO] - [train] Step 8500 out of 25000 | Loss --> 0.528 | Grad_l2 --> 0.171 | Weights_l2 --> 47176.902 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
194
+ [2024-08-07 08:29:53,919][Main][INFO] - [train] Step 8550 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.157 | Weights_l2 --> 47176.764 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
195
+ [2024-08-07 08:31:19,464][Main][INFO] - [train] Step 8600 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.146 | Weights_l2 --> 47176.631 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
196
+ [2024-08-07 08:32:47,795][Main][INFO] - [train] Step 8650 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.140 | Weights_l2 --> 47176.494 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
197
+ [2024-08-07 08:34:13,984][Main][INFO] - [train] Step 8700 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.144 | Weights_l2 --> 47176.352 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
198
+ [2024-08-07 08:35:40,153][Main][INFO] - [train] Step 8750 out of 25000 | Loss --> 0.537 | Grad_l2 --> 0.135 | Weights_l2 --> 47176.215 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
199
+ [2024-08-07 08:37:08,880][Main][INFO] - [train] Step 8800 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.147 | Weights_l2 --> 47176.078 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
200
+ [2024-08-07 08:38:35,095][Main][INFO] - [train] Step 8850 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.156 | Weights_l2 --> 47175.940 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
201
+ [2024-08-07 08:40:01,329][Main][INFO] - [train] Step 8900 out of 25000 | Loss --> 0.525 | Grad_l2 --> 0.144 | Weights_l2 --> 47175.803 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
202
+ [2024-08-07 08:41:29,387][Main][INFO] - [train] Step 8950 out of 25000 | Loss --> 0.544 | Grad_l2 --> 0.150 | Weights_l2 --> 47175.666 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
203
+ [2024-08-07 08:42:55,260][Main][INFO] - [train] Step 9000 out of 25000 | Loss --> 0.518 | Grad_l2 --> 0.151 | Weights_l2 --> 47175.528 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
204
+ [2024-08-07 08:44:21,359][Main][INFO] - [train] Step 9050 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.176 | Weights_l2 --> 47175.395 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
205
+ [2024-08-07 08:45:50,226][Main][INFO] - [train] Step 9100 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.147 | Weights_l2 --> 47175.258 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
206
+ [2024-08-07 08:47:16,084][Main][INFO] - [train] Step 9150 out of 25000 | Loss --> 0.543 | Grad_l2 --> 0.174 | Weights_l2 --> 47175.117 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
207
+ [2024-08-07 08:48:41,636][Main][INFO] - [train] Step 9200 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.173 | Weights_l2 --> 47174.979 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
208
+ [2024-08-07 08:50:10,103][Main][INFO] - [train] Step 9250 out of 25000 | Loss --> 0.557 | Grad_l2 --> 0.152 | Weights_l2 --> 47174.838 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
209
+ [2024-08-07 08:51:36,133][Main][INFO] - [train] Step 9300 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.155 | Weights_l2 --> 47174.701 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
210
+ [2024-08-07 08:53:02,290][Main][INFO] - [train] Step 9350 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.177 | Weights_l2 --> 47174.567 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
211
+ [2024-08-07 08:54:30,686][Main][INFO] - [train] Step 9400 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.147 | Weights_l2 --> 47174.430 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
212
+ [2024-08-07 08:55:56,533][Main][INFO] - [train] Step 9450 out of 25000 | Loss --> 0.540 | Grad_l2 --> 0.151 | Weights_l2 --> 47174.293 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
213
+ [2024-08-07 08:57:22,391][Main][INFO] - [train] Step 9500 out of 25000 | Loss --> 0.507 | Grad_l2 --> 0.169 | Weights_l2 --> 47174.155 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
214
+ [2024-08-07 08:58:50,627][Main][INFO] - [train] Step 9550 out of 25000 | Loss --> 0.534 | Grad_l2 --> 0.142 | Weights_l2 --> 47174.018 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
215
+ [2024-08-07 09:00:16,767][Main][INFO] - [train] Step 9600 out of 25000 | Loss --> 0.519 | Grad_l2 --> 0.218 | Weights_l2 --> 47173.881 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
216
+ [2024-08-07 09:01:48,187][Main][INFO] - [train] Step 9650 out of 25000 | Loss --> 0.542 | Grad_l2 --> 0.153 | Weights_l2 --> 47173.743 | Lr --> 0.000 | Seconds_per_step --> 1.828 |
217
+ [2024-08-07 09:03:13,777][Main][INFO] - [train] Step 9700 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.238 | Weights_l2 --> 47173.606 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
218
+ [2024-08-07 09:04:39,235][Main][INFO] - [train] Step 9750 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.151 | Weights_l2 --> 47173.469 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
219
+ [2024-08-07 09:06:07,645][Main][INFO] - [train] Step 9800 out of 25000 | Loss --> 0.550 | Grad_l2 --> 0.154 | Weights_l2 --> 47173.332 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
220
+ [2024-08-07 09:07:33,777][Main][INFO] - [train] Step 9850 out of 25000 | Loss --> 0.538 | Grad_l2 --> 0.146 | Weights_l2 --> 47173.194 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
221
+ [2024-08-07 09:08:59,343][Main][INFO] - [train] Step 9900 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.156 | Weights_l2 --> 47173.057 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
222
+ [2024-08-07 09:10:27,785][Main][INFO] - [train] Step 9950 out of 25000 | Loss --> 0.543 | Grad_l2 --> 0.143 | Weights_l2 --> 47172.920 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
223
+ [2024-08-07 09:11:53,857][Main][INFO] - [train] Step 10000 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.139 | Weights_l2 --> 47172.782 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
224
+ [2024-08-07 09:11:53,857][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-10000
225
+ [2024-08-07 09:11:53,863][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
226
+ [2024-08-07 09:11:54,664][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-10000/model.safetensors
227
+ [2024-08-07 09:11:55,798][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-10000/optimizer.bin
228
+ [2024-08-07 09:11:55,798][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-10000/scheduler.bin
229
+ [2024-08-07 09:11:55,799][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-10000/sampler.bin
230
+ [2024-08-07 09:11:55,799][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-10000/sampler_1.bin
231
+ [2024-08-07 09:11:55,800][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-10000/random_states_0.pkl
232
+ [2024-08-07 09:13:22,152][Main][INFO] - [train] Step 10050 out of 25000 | Loss --> 0.530 | Grad_l2 --> 0.143 | Weights_l2 --> 47172.645 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
233
+ [2024-08-07 09:14:50,831][Main][INFO] - [train] Step 10100 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.517 | Weights_l2 --> 47172.508 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
234
+ [2024-08-07 09:16:16,529][Main][INFO] - [train] Step 10150 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.154 | Weights_l2 --> 47172.370 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
235
+ [2024-08-07 09:17:42,010][Main][INFO] - [train] Step 10200 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.145 | Weights_l2 --> 47172.233 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
236
+ [2024-08-07 09:19:10,193][Main][INFO] - [train] Step 10250 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.148 | Weights_l2 --> 47172.096 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
237
+ [2024-08-07 09:20:35,887][Main][INFO] - [train] Step 10300 out of 25000 | Loss --> 0.578 | Grad_l2 --> 0.179 | Weights_l2 --> 47171.958 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
238
+ [2024-08-07 09:22:01,651][Main][INFO] - [train] Step 10350 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.149 | Weights_l2 --> 47171.821 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
239
+ [2024-08-07 09:23:30,262][Main][INFO] - [train] Step 10400 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.149 | Weights_l2 --> 47171.684 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
240
+ [2024-08-07 09:24:55,846][Main][INFO] - [train] Step 10450 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.197 | Weights_l2 --> 47171.546 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
241
+ [2024-08-07 09:26:21,432][Main][INFO] - [train] Step 10500 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.148 | Weights_l2 --> 47171.409 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
242
+ [2024-08-07 09:27:49,918][Main][INFO] - [train] Step 10550 out of 25000 | Loss --> 0.599 | Grad_l2 --> 0.181 | Weights_l2 --> 47171.272 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
243
+ [2024-08-07 09:29:16,097][Main][INFO] - [train] Step 10600 out of 25000 | Loss --> 0.578 | Grad_l2 --> 0.148 | Weights_l2 --> 47171.135 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
244
+ [2024-08-07 09:30:42,100][Main][INFO] - [train] Step 10650 out of 25000 | Loss --> 0.608 | Grad_l2 --> 0.156 | Weights_l2 --> 47170.997 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
245
+ [2024-08-07 09:32:10,527][Main][INFO] - [train] Step 10700 out of 25000 | Loss --> 0.611 | Grad_l2 --> 0.181 | Weights_l2 --> 47170.860 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
246
+ [2024-08-07 09:33:36,488][Main][INFO] - [train] Step 10750 out of 25000 | Loss --> 0.615 | Grad_l2 --> 0.146 | Weights_l2 --> 47170.723 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
247
+ [2024-08-07 09:35:02,437][Main][INFO] - [train] Step 10800 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.151 | Weights_l2 --> 47170.585 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
248
+ [2024-08-07 09:36:31,350][Main][INFO] - [train] Step 10850 out of 25000 | Loss --> 0.606 | Grad_l2 --> 0.165 | Weights_l2 --> 47170.448 | Lr --> 0.000 | Seconds_per_step --> 1.778 |
249
+ [2024-08-07 09:37:57,470][Main][INFO] - [train] Step 10900 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.139 | Weights_l2 --> 47170.311 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
250
+ [2024-08-07 09:39:23,644][Main][INFO] - [train] Step 10950 out of 25000 | Loss --> 0.609 | Grad_l2 --> 0.145 | Weights_l2 --> 47170.173 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
251
+ [2024-08-07 09:40:52,251][Main][INFO] - [train] Step 11000 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.151 | Weights_l2 --> 47170.036 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
252
+ [2024-08-07 09:42:18,278][Main][INFO] - [train] Step 11050 out of 25000 | Loss --> 0.627 | Grad_l2 --> 0.149 | Weights_l2 --> 47169.899 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
253
+ [2024-08-07 09:43:44,271][Main][INFO] - [train] Step 11100 out of 25000 | Loss --> 0.624 | Grad_l2 --> 0.149 | Weights_l2 --> 47169.762 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
254
+ [2024-08-07 09:45:12,962][Main][INFO] - [train] Step 11150 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.165 | Weights_l2 --> 47169.620 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
255
+ [2024-08-07 09:46:38,824][Main][INFO] - [train] Step 11200 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.160 | Weights_l2 --> 47169.483 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
256
+ [2024-08-07 09:48:06,984][Main][INFO] - [train] Step 11250 out of 25000 | Loss --> 0.625 | Grad_l2 --> 0.161 | Weights_l2 --> 47169.350 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
257
+ [2024-08-07 09:49:32,458][Main][INFO] - [train] Step 11300 out of 25000 | Loss --> 0.629 | Grad_l2 --> 0.152 | Weights_l2 --> 47169.212 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
258
+ [2024-08-07 09:50:58,382][Main][INFO] - [train] Step 11350 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.150 | Weights_l2 --> 47169.075 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
259
+ [2024-08-07 09:52:27,146][Main][INFO] - [train] Step 11400 out of 25000 | Loss --> 0.619 | Grad_l2 --> 0.149 | Weights_l2 --> 47168.938 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
260
+ [2024-08-07 09:53:52,893][Main][INFO] - [train] Step 11450 out of 25000 | Loss --> 0.640 | Grad_l2 --> 0.152 | Weights_l2 --> 47168.800 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
261
+ [2024-08-07 09:55:18,641][Main][INFO] - [train] Step 11500 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.188 | Weights_l2 --> 47168.663 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
262
+ [2024-08-07 09:56:47,125][Main][INFO] - [train] Step 11550 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.188 | Weights_l2 --> 47168.526 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
263
+ [2024-08-07 09:58:13,217][Main][INFO] - [train] Step 11600 out of 25000 | Loss --> 0.655 | Grad_l2 --> 0.161 | Weights_l2 --> 47168.392 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
264
+ [2024-08-07 09:59:38,994][Main][INFO] - [train] Step 11650 out of 25000 | Loss --> 0.613 | Grad_l2 --> 0.153 | Weights_l2 --> 47168.251 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
265
+ [2024-08-07 10:01:07,252][Main][INFO] - [train] Step 11700 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.158 | Weights_l2 --> 47168.114 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
266
+ [2024-08-07 10:02:32,915][Main][INFO] - [train] Step 11750 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.147 | Weights_l2 --> 47167.977 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
267
+ [2024-08-07 10:03:59,142][Main][INFO] - [train] Step 11800 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.192 | Weights_l2 --> 47167.839 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
268
+ [2024-08-07 10:05:27,859][Main][INFO] - [train] Step 11850 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.149 | Weights_l2 --> 47167.702 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
269
+ [2024-08-07 10:06:53,666][Main][INFO] - [train] Step 11900 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.145 | Weights_l2 --> 47167.565 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
270
+ [2024-08-07 10:08:19,233][Main][INFO] - [train] Step 11950 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.165 | Weights_l2 --> 47167.431 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
271
+ [2024-08-07 10:09:48,031][Main][INFO] - [train] Step 12000 out of 25000 | Loss --> 0.627 | Grad_l2 --> 0.148 | Weights_l2 --> 47167.294 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
272
+ [2024-08-07 10:09:52,932][Main][INFO] - [eval] Step 12000 out of 25000 | Loss --> 0.851 | Accuracy --> 0.842 | Time --> 4.898 |
273
+ [2024-08-07 10:14:26,659][absl][INFO] - Using default tokenizer.
274
+ [2024-08-07 10:14:27,253][Main][INFO] - [test] Step 12000 out of 25000 | Rougel --> 22.493 | Time --> 274.321 |
275
+ [2024-08-07 10:15:53,027][Main][INFO] - [train] Step 12050 out of 25000 | Loss --> 0.650 | Grad_l2 --> 0.157 | Weights_l2 --> 47167.157 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
276
+ [2024-08-07 10:17:18,994][Main][INFO] - [train] Step 12100 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.173 | Weights_l2 --> 47167.020 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
277
+ [2024-08-07 10:18:47,538][Main][INFO] - [train] Step 12150 out of 25000 | Loss --> 0.645 | Grad_l2 --> 0.162 | Weights_l2 --> 47166.878 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
278
+ [2024-08-07 10:20:13,740][Main][INFO] - [train] Step 12200 out of 25000 | Loss --> 0.655 | Grad_l2 --> 0.185 | Weights_l2 --> 47166.741 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
279
+ [2024-08-07 10:21:39,599][Main][INFO] - [train] Step 12250 out of 25000 | Loss --> 0.659 | Grad_l2 --> 0.154 | Weights_l2 --> 47166.604 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
280
+ [2024-08-07 10:23:08,120][Main][INFO] - [train] Step 12300 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.156 | Weights_l2 --> 47166.466 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
281
+ [2024-08-07 10:24:33,959][Main][INFO] - [train] Step 12350 out of 25000 | Loss --> 0.622 | Grad_l2 --> 0.145 | Weights_l2 --> 47166.329 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
282
+ [2024-08-07 10:26:00,262][Main][INFO] - [train] Step 12400 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.155 | Weights_l2 --> 47166.192 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
283
+ [2024-08-07 10:27:28,963][Main][INFO] - [train] Step 12450 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.160 | Weights_l2 --> 47166.058 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
284
+ [2024-08-07 10:28:55,101][Main][INFO] - [train] Step 12500 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.142 | Weights_l2 --> 47165.921 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
285
+ [2024-08-07 10:30:20,881][Main][INFO] - [train] Step 12550 out of 25000 | Loss --> 0.618 | Grad_l2 --> 0.147 | Weights_l2 --> 47165.784 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
286
+ [2024-08-07 10:31:49,106][Main][INFO] - [train] Step 12600 out of 25000 | Loss --> 0.658 | Grad_l2 --> 0.189 | Weights_l2 --> 47165.647 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
287
+ [2024-08-07 10:33:15,133][Main][INFO] - [train] Step 12650 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.150 | Weights_l2 --> 47165.509 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
288
+ [2024-08-07 10:34:44,001][Main][INFO] - [train] Step 12700 out of 25000 | Loss --> 0.631 | Grad_l2 --> 0.153 | Weights_l2 --> 47165.368 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
289
+ [2024-08-07 10:36:10,488][Main][INFO] - [train] Step 12750 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.151 | Weights_l2 --> 47165.231 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
290
+ [2024-08-07 10:37:45,117][Main][INFO] - [train] Step 12800 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.154 | Weights_l2 --> 47165.097 | Lr --> 0.000 | Seconds_per_step --> 1.893 |
291
+ [2024-08-07 10:39:14,106][Main][INFO] - [train] Step 12850 out of 25000 | Loss --> 0.648 | Grad_l2 --> 0.147 | Weights_l2 --> 47164.960 | Lr --> 0.000 | Seconds_per_step --> 1.780 |
292
+ [2024-08-07 10:40:39,804][Main][INFO] - [train] Step 12900 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.144 | Weights_l2 --> 47164.823 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
293
+ [2024-08-07 10:42:05,250][Main][INFO] - [train] Step 12950 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.165 | Weights_l2 --> 47164.685 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
294
+ [2024-08-07 10:43:33,303][Main][INFO] - [train] Step 13000 out of 25000 | Loss --> 0.659 | Grad_l2 --> 0.148 | Weights_l2 --> 47164.548 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
295
+ [2024-08-07 10:44:59,507][Main][INFO] - [train] Step 13050 out of 25000 | Loss --> 0.657 | Grad_l2 --> 0.224 | Weights_l2 --> 47164.411 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
296
+ [2024-08-07 10:46:25,581][Main][INFO] - [train] Step 13100 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.163 | Weights_l2 --> 47164.273 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
297
+ [2024-08-07 10:47:53,904][Main][INFO] - [train] Step 13150 out of 25000 | Loss --> 0.671 | Grad_l2 --> 0.175 | Weights_l2 --> 47164.136 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
298
+ [2024-08-07 10:49:19,864][Main][INFO] - [train] Step 13200 out of 25000 | Loss --> 0.666 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.999 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
299
+ [2024-08-07 10:50:45,992][Main][INFO] - [train] Step 13250 out of 25000 | Loss --> 0.681 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.862 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
300
+ [2024-08-07 10:52:15,654][Main][INFO] - [train] Step 13300 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.167 | Weights_l2 --> 47163.728 | Lr --> 0.000 | Seconds_per_step --> 1.793 |
301
+ [2024-08-07 10:53:41,694][Main][INFO] - [train] Step 13350 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.154 | Weights_l2 --> 47163.587 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
302
+ [2024-08-07 10:55:07,494][Main][INFO] - [train] Step 13400 out of 25000 | Loss --> 0.683 | Grad_l2 --> 0.161 | Weights_l2 --> 47163.450 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
303
+ [2024-08-07 10:56:35,968][Main][INFO] - [train] Step 13450 out of 25000 | Loss --> 0.666 | Grad_l2 --> 0.168 | Weights_l2 --> 47163.312 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
304
+ [2024-08-07 10:58:01,621][Main][INFO] - [train] Step 13500 out of 25000 | Loss --> 0.650 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.175 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
305
+ [2024-08-07 10:59:27,354][Main][INFO] - [train] Step 13550 out of 25000 | Loss --> 0.691 | Grad_l2 --> 0.171 | Weights_l2 --> 47163.038 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
306
+ [2024-08-07 11:00:55,576][Main][INFO] - [train] Step 13600 out of 25000 | Loss --> 0.678 | Grad_l2 --> 0.177 | Weights_l2 --> 47162.901 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
307
+ [2024-08-07 11:02:21,441][Main][INFO] - [train] Step 13650 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.170 | Weights_l2 --> 47162.763 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
308
+ [2024-08-07 11:03:47,588][Main][INFO] - [train] Step 13700 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.170 | Weights_l2 --> 47162.630 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
309
+ [2024-08-07 11:05:16,219][Main][INFO] - [train] Step 13750 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.163 | Weights_l2 --> 47162.493 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
310
+ [2024-08-07 11:06:41,706][Main][INFO] - [train] Step 13800 out of 25000 | Loss --> 0.715 | Grad_l2 --> 0.163 | Weights_l2 --> 47162.352 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
311
+ [2024-08-07 11:08:07,195][Main][INFO] - [train] Step 13850 out of 25000 | Loss --> 0.694 | Grad_l2 --> 0.164 | Weights_l2 --> 47162.218 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
312
+ [2024-08-07 11:09:35,388][Main][INFO] - [train] Step 13900 out of 25000 | Loss --> 0.680 | Grad_l2 --> 0.152 | Weights_l2 --> 47162.081 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
313
+ [2024-08-07 11:11:00,908][Main][INFO] - [train] Step 13950 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.168 | Weights_l2 --> 47161.944 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
314
+ [2024-08-07 11:12:26,729][Main][INFO] - [train] Step 14000 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.157 | Weights_l2 --> 47161.806 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
315
+ [2024-08-07 11:13:55,581][Main][INFO] - [train] Step 14050 out of 25000 | Loss --> 0.682 | Grad_l2 --> 0.166 | Weights_l2 --> 47161.669 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
316
+ [2024-08-07 11:15:21,781][Main][INFO] - [train] Step 14100 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.158 | Weights_l2 --> 47161.532 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
317
+ [2024-08-07 11:16:47,977][Main][INFO] - [train] Step 14150 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.160 | Weights_l2 --> 47161.394 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
318
+ [2024-08-07 11:18:16,612][Main][INFO] - [train] Step 14200 out of 25000 | Loss --> 0.728 | Grad_l2 --> 0.168 | Weights_l2 --> 47161.257 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
319
+ [2024-08-07 11:19:42,684][Main][INFO] - [train] Step 14250 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.160 | Weights_l2 --> 47161.120 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
320
+ [2024-08-07 11:21:08,147][Main][INFO] - [train] Step 14300 out of 25000 | Loss --> 0.717 | Grad_l2 --> 0.156 | Weights_l2 --> 47160.982 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
321
+ [2024-08-07 11:22:36,306][Main][INFO] - [train] Step 14350 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.157 | Weights_l2 --> 47160.845 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
322
+ [2024-08-07 11:24:01,811][Main][INFO] - [train] Step 14400 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.164 | Weights_l2 --> 47160.708 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
323
+ [2024-08-07 11:25:27,346][Main][INFO] - [train] Step 14450 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.160 | Weights_l2 --> 47160.570 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
324
+ [2024-08-07 11:26:55,597][Main][INFO] - [train] Step 14500 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.159 | Weights_l2 --> 47160.433 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
325
+ [2024-08-07 11:28:21,073][Main][INFO] - [train] Step 14550 out of 25000 | Loss --> 0.713 | Grad_l2 --> 0.159 | Weights_l2 --> 47160.296 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
326
+ [2024-08-07 11:29:46,549][Main][INFO] - [train] Step 14600 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.170 | Weights_l2 --> 47160.155 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
327
+ [2024-08-07 11:31:14,793][Main][INFO] - [train] Step 14650 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.156 | Weights_l2 --> 47160.021 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
328
+ [2024-08-07 11:32:40,288][Main][INFO] - [train] Step 14700 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.154 | Weights_l2 --> 47159.884 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
329
+ [2024-08-07 11:34:05,850][Main][INFO] - [train] Step 14750 out of 25000 | Loss --> 0.689 | Grad_l2 --> 0.147 | Weights_l2 --> 47159.747 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
330
+ [2024-08-07 11:35:34,608][Main][INFO] - [train] Step 14800 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.170 | Weights_l2 --> 47159.609 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
331
+ [2024-08-07 11:37:00,181][Main][INFO] - [train] Step 14850 out of 25000 | Loss --> 0.722 | Grad_l2 --> 0.165 | Weights_l2 --> 47159.472 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
332
+ [2024-08-07 11:38:25,726][Main][INFO] - [train] Step 14900 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.174 | Weights_l2 --> 47159.335 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
333
+ [2024-08-07 11:39:53,934][Main][INFO] - [train] Step 14950 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.180 | Weights_l2 --> 47159.198 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
334
+ [2024-08-07 11:41:19,559][Main][INFO] - [train] Step 15000 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.173 | Weights_l2 --> 47159.060 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
335
+ [2024-08-07 11:41:19,560][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-15000
336
+ [2024-08-07 11:41:19,566][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
337
+ [2024-08-07 11:41:20,382][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-15000/model.safetensors
338
+ [2024-08-07 11:41:21,526][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-15000/optimizer.bin
339
+ [2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-15000/scheduler.bin
340
+ [2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-15000/sampler.bin
341
+ [2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-15000/sampler_1.bin
342
+ [2024-08-07 11:41:21,528][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-15000/random_states_0.pkl
343
+ [2024-08-07 11:42:47,021][Main][INFO] - [train] Step 15050 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.173 | Weights_l2 --> 47158.927 | Lr --> 0.000 | Seconds_per_step --> 1.749 |
344
+ [2024-08-07 11:44:15,207][Main][INFO] - [train] Step 15100 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.153 | Weights_l2 --> 47158.786 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
345
+ [2024-08-07 11:45:41,007][Main][INFO] - [train] Step 15150 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.166 | Weights_l2 --> 47158.648 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
346
+ [2024-08-07 11:47:07,191][Main][INFO] - [train] Step 15200 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.170 | Weights_l2 --> 47158.515 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
347
+ [2024-08-07 11:48:35,506][Main][INFO] - [train] Step 15250 out of 25000 | Loss --> 0.715 | Grad_l2 --> 0.158 | Weights_l2 --> 47158.378 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
348
+ [2024-08-07 11:50:01,669][Main][INFO] - [train] Step 15300 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.166 | Weights_l2 --> 47158.237 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
349
+ [2024-08-07 11:51:27,955][Main][INFO] - [train] Step 15350 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.172 | Weights_l2 --> 47158.099 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
350
+ [2024-08-07 11:52:56,776][Main][INFO] - [train] Step 15400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.151 | Weights_l2 --> 47157.962 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
351
+ [2024-08-07 11:54:22,548][Main][INFO] - [train] Step 15450 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.157 | Weights_l2 --> 47157.825 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
352
+ [2024-08-07 11:55:48,808][Main][INFO] - [train] Step 15500 out of 25000 | Loss --> 0.766 | Grad_l2 --> 0.171 | Weights_l2 --> 47157.687 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
353
+ [2024-08-07 11:57:17,585][Main][INFO] - [train] Step 15550 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.169 | Weights_l2 --> 47157.554 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
354
+ [2024-08-07 11:58:43,740][Main][INFO] - [train] Step 15600 out of 25000 | Loss --> 0.714 | Grad_l2 --> 0.155 | Weights_l2 --> 47157.417 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
355
+ [2024-08-07 12:00:12,386][Main][INFO] - [train] Step 15650 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.169 | Weights_l2 --> 47157.279 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
356
+ [2024-08-07 12:01:38,506][Main][INFO] - [train] Step 15700 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.223 | Weights_l2 --> 47157.142 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
357
+ [2024-08-07 12:03:04,291][Main][INFO] - [train] Step 15750 out of 25000 | Loss --> 0.718 | Grad_l2 --> 0.161 | Weights_l2 --> 47157.005 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
358
+ [2024-08-07 12:04:32,531][Main][INFO] - [train] Step 15800 out of 25000 | Loss --> 0.743 | Grad_l2 --> 0.177 | Weights_l2 --> 47156.871 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
359
+ [2024-08-07 12:05:58,031][Main][INFO] - [train] Step 15850 out of 25000 | Loss --> 0.754 | Grad_l2 --> 0.167 | Weights_l2 --> 47156.730 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
360
+ [2024-08-07 12:07:23,569][Main][INFO] - [train] Step 15900 out of 25000 | Loss --> 0.730 | Grad_l2 --> 0.200 | Weights_l2 --> 47156.593 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
361
+ [2024-08-07 12:08:51,789][Main][INFO] - [train] Step 15950 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.196 | Weights_l2 --> 47156.456 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
362
+ [2024-08-07 12:10:17,593][Main][INFO] - [train] Step 16000 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.158 | Weights_l2 --> 47156.318 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
363
+ [2024-08-07 12:10:22,535][Main][INFO] - [eval] Step 16000 out of 25000 | Loss --> 0.830 | Accuracy --> 0.846 | Time --> 4.939 |
364
+ [2024-08-07 12:14:42,758][absl][INFO] - Using default tokenizer.
365
+ [2024-08-07 12:14:43,321][Main][INFO] - [test] Step 16000 out of 25000 | Rougel --> 24.234 | Time --> 260.785 |
366
+ [2024-08-07 12:16:09,102][Main][INFO] - [train] Step 16050 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.153 | Weights_l2 --> 47156.181 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
367
+ [2024-08-07 12:17:38,179][Main][INFO] - [train] Step 16100 out of 25000 | Loss --> 0.746 | Grad_l2 --> 0.173 | Weights_l2 --> 47156.044 | Lr --> 0.000 | Seconds_per_step --> 1.782 |
368
+ [2024-08-07 12:19:04,016][Main][INFO] - [train] Step 16150 out of 25000 | Loss --> 0.746 | Grad_l2 --> 0.207 | Weights_l2 --> 47155.906 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
369
+ [2024-08-07 12:20:29,950][Main][INFO] - [train] Step 16200 out of 25000 | Loss --> 0.749 | Grad_l2 --> 0.155 | Weights_l2 --> 47155.769 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
370
+ [2024-08-07 12:21:58,460][Main][INFO] - [train] Step 16250 out of 25000 | Loss --> 0.730 | Grad_l2 --> 0.171 | Weights_l2 --> 47155.632 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
371
+ [2024-08-07 12:23:24,644][Main][INFO] - [train] Step 16300 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.172 | Weights_l2 --> 47155.495 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
372
+ [2024-08-07 12:24:50,811][Main][INFO] - [train] Step 16350 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.159 | Weights_l2 --> 47155.357 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
373
+ [2024-08-07 12:26:18,994][Main][INFO] - [train] Step 16400 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.155 | Weights_l2 --> 47155.224 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
374
+ [2024-08-07 12:27:44,559][Main][INFO] - [train] Step 16450 out of 25000 | Loss --> 0.721 | Grad_l2 --> 0.169 | Weights_l2 --> 47155.086 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
375
+ [2024-08-07 12:29:10,639][Main][INFO] - [train] Step 16500 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.174 | Weights_l2 --> 47154.949 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
376
+ [2024-08-07 12:30:38,659][Main][INFO] - [train] Step 16550 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.174 | Weights_l2 --> 47154.812 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
377
+ [2024-08-07 12:32:04,163][Main][INFO] - [train] Step 16600 out of 25000 | Loss --> 0.718 | Grad_l2 --> 0.152 | Weights_l2 --> 47154.675 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
378
+ [2024-08-07 12:33:29,664][Main][INFO] - [train] Step 16650 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.196 | Weights_l2 --> 47154.537 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
379
+ [2024-08-07 12:34:57,619][Main][INFO] - [train] Step 16700 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.158 | Weights_l2 --> 47154.400 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
380
+ [2024-08-07 12:36:23,705][Main][INFO] - [train] Step 16750 out of 25000 | Loss --> 0.754 | Grad_l2 --> 0.228 | Weights_l2 --> 47154.263 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
381
+ [2024-08-07 12:37:51,699][Main][INFO] - [train] Step 16800 out of 25000 | Loss --> 0.744 | Grad_l2 --> 0.185 | Weights_l2 --> 47154.126 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
382
+ [2024-08-07 12:39:17,171][Main][INFO] - [train] Step 16850 out of 25000 | Loss --> 0.742 | Grad_l2 --> 0.172 | Weights_l2 --> 47153.988 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
383
+ [2024-08-07 12:40:43,000][Main][INFO] - [train] Step 16900 out of 25000 | Loss --> 0.736 | Grad_l2 --> 0.216 | Weights_l2 --> 47153.851 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
384
+ [2024-08-07 12:42:11,725][Main][INFO] - [train] Step 16950 out of 25000 | Loss --> 0.737 | Grad_l2 --> 0.212 | Weights_l2 --> 47153.714 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
385
+ [2024-08-07 12:43:38,101][Main][INFO] - [train] Step 17000 out of 25000 | Loss --> 0.737 | Grad_l2 --> 0.188 | Weights_l2 --> 47153.576 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
386
+ [2024-08-07 12:45:04,080][Main][INFO] - [train] Step 17050 out of 25000 | Loss --> 0.753 | Grad_l2 --> 0.337 | Weights_l2 --> 47153.439 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
387
+ [2024-08-07 12:46:32,474][Main][INFO] - [train] Step 17100 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.165 | Weights_l2 --> 47153.302 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
388
+ [2024-08-07 12:47:58,632][Main][INFO] - [train] Step 17150 out of 25000 | Loss --> 0.735 | Grad_l2 --> 0.162 | Weights_l2 --> 47153.165 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
389
+ [2024-08-07 12:49:25,027][Main][INFO] - [train] Step 17200 out of 25000 | Loss --> 0.742 | Grad_l2 --> 0.167 | Weights_l2 --> 47153.031 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
390
+ [2024-08-07 12:50:53,829][Main][INFO] - [train] Step 17250 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.166 | Weights_l2 --> 47152.894 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
391
+ [2024-08-07 12:52:19,460][Main][INFO] - [train] Step 17300 out of 25000 | Loss --> 0.710 | Grad_l2 --> 0.151 | Weights_l2 --> 47152.757 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
392
+ [2024-08-07 12:53:45,284][Main][INFO] - [train] Step 17350 out of 25000 | Loss --> 0.748 | Grad_l2 --> 0.155 | Weights_l2 --> 47152.619 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
393
+ [2024-08-07 12:55:14,159][Main][INFO] - [train] Step 17400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.151 | Weights_l2 --> 47152.482 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
394
+ [2024-08-07 12:56:40,014][Main][INFO] - [train] Step 17450 out of 25000 | Loss --> 0.739 | Grad_l2 --> 0.181 | Weights_l2 --> 47152.345 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
395
+ [2024-08-07 12:58:05,500][Main][INFO] - [train] Step 17500 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.196 | Weights_l2 --> 47152.211 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
396
+ [2024-08-07 12:59:33,742][Main][INFO] - [train] Step 17550 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.174 | Weights_l2 --> 47152.070 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
397
+ [2024-08-07 13:00:59,255][Main][INFO] - [train] Step 17600 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.167 | Weights_l2 --> 47151.933 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
398
+ [2024-08-07 13:02:24,974][Main][INFO] - [train] Step 17650 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.185 | Weights_l2 --> 47151.796 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
399
+ [2024-08-07 13:03:53,834][Main][INFO] - [train] Step 17700 out of 25000 | Loss --> 0.753 | Grad_l2 --> 0.169 | Weights_l2 --> 47151.658 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
400
+ [2024-08-07 13:05:19,816][Main][INFO] - [train] Step 17750 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.179 | Weights_l2 --> 47151.521 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
401
+ [2024-08-07 13:06:48,589][Main][INFO] - [train] Step 17800 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.169 | Weights_l2 --> 47151.384 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
402
+ [2024-08-07 13:08:14,874][Main][INFO] - [train] Step 17850 out of 25000 | Loss --> 0.728 | Grad_l2 --> 0.185 | Weights_l2 --> 47151.246 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
403
+ [2024-08-07 13:09:40,853][Main][INFO] - [train] Step 17900 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.161 | Weights_l2 --> 47151.109 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
404
+ [2024-08-07 13:11:08,861][Main][INFO] - [train] Step 17950 out of 25000 | Loss --> 0.751 | Grad_l2 --> 0.161 | Weights_l2 --> 47150.972 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
405
+ [2024-08-07 13:12:34,980][Main][INFO] - [train] Step 18000 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.257 | Weights_l2 --> 47150.835 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
406
+ [2024-08-07 13:14:01,362][Main][INFO] - [train] Step 18050 out of 25000 | Loss --> 0.760 | Grad_l2 --> 0.166 | Weights_l2 --> 47150.701 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
407
+ [2024-08-07 13:15:29,325][Main][INFO] - [train] Step 18100 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.159 | Weights_l2 --> 47150.564 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
408
+ [2024-08-07 13:16:54,802][Main][INFO] - [train] Step 18150 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.163 | Weights_l2 --> 47150.427 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
409
+ [2024-08-07 13:18:20,272][Main][INFO] - [train] Step 18200 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.157 | Weights_l2 --> 47150.289 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
410
+ [2024-08-07 13:19:48,874][Main][INFO] - [train] Step 18250 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.186 | Weights_l2 --> 47150.152 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
411
+ [2024-08-07 13:21:14,606][Main][INFO] - [train] Step 18300 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.155 | Weights_l2 --> 47150.015 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
412
+ [2024-08-07 13:22:40,318][Main][INFO] - [train] Step 18350 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.151 | Weights_l2 --> 47149.878 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
413
+ [2024-08-07 13:24:08,741][Main][INFO] - [train] Step 18400 out of 25000 | Loss --> 0.716 | Grad_l2 --> 0.172 | Weights_l2 --> 47149.740 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
414
+ [2024-08-07 13:25:34,410][Main][INFO] - [train] Step 18450 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.164 | Weights_l2 --> 47149.603 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
415
+ [2024-08-07 13:27:00,745][Main][INFO] - [train] Step 18500 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.156 | Weights_l2 --> 47149.466 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
416
+ [2024-08-07 13:28:29,717][Main][INFO] - [train] Step 18550 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.153 | Weights_l2 --> 47149.328 | Lr --> 0.000 | Seconds_per_step --> 1.779 |
417
+ [2024-08-07 13:29:55,948][Main][INFO] - [train] Step 18600 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.167 | Weights_l2 --> 47149.191 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
418
+ [2024-08-07 13:31:22,096][Main][INFO] - [train] Step 18650 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.162 | Weights_l2 --> 47149.054 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
419
+ [2024-08-07 13:32:50,690][Main][INFO] - [train] Step 18700 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.182 | Weights_l2 --> 47148.916 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
420
+ [2024-08-07 13:34:16,757][Main][INFO] - [train] Step 18750 out of 25000 | Loss --> 0.736 | Grad_l2 --> 0.196 | Weights_l2 --> 47148.779 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
421
+ [2024-08-07 13:35:42,343][Main][INFO] - [train] Step 18800 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.199 | Weights_l2 --> 47148.642 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
422
+ [2024-08-07 13:37:10,652][Main][INFO] - [train] Step 18850 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.161 | Weights_l2 --> 47148.504 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
423
+ [2024-08-07 13:38:41,717][Main][INFO] - [train] Step 18900 out of 25000 | Loss --> 0.751 | Grad_l2 --> 0.159 | Weights_l2 --> 47148.367 | Lr --> 0.000 | Seconds_per_step --> 1.821 |
424
+ [2024-08-07 13:40:07,531][Main][INFO] - [train] Step 18950 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.186 | Weights_l2 --> 47148.234 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
425
+ [2024-08-07 13:41:35,943][Main][INFO] - [train] Step 19000 out of 25000 | Loss --> 0.743 | Grad_l2 --> 0.260 | Weights_l2 --> 47148.096 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
426
+ [2024-08-07 13:43:02,256][Main][INFO] - [train] Step 19050 out of 25000 | Loss --> 0.727 | Grad_l2 --> 0.218 | Weights_l2 --> 47147.959 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
427
+ [2024-08-07 13:44:28,589][Main][INFO] - [train] Step 19100 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.156 | Weights_l2 --> 47147.822 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
428
+ [2024-08-07 13:45:57,171][Main][INFO] - [train] Step 19150 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.205 | Weights_l2 --> 47147.685 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
429
+ [2024-08-07 13:47:22,671][Main][INFO] - [train] Step 19200 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.212 | Weights_l2 --> 47147.547 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
430
+ [2024-08-07 13:48:50,958][Main][INFO] - [train] Step 19250 out of 25000 | Loss --> 0.702 | Grad_l2 --> 0.176 | Weights_l2 --> 47147.410 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
431
+ [2024-08-07 13:50:16,489][Main][INFO] - [train] Step 19300 out of 25000 | Loss --> 0.694 | Grad_l2 --> 0.156 | Weights_l2 --> 47147.273 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
432
+ [2024-08-07 13:51:42,217][Main][INFO] - [train] Step 19350 out of 25000 | Loss --> 0.678 | Grad_l2 --> 0.158 | Weights_l2 --> 47147.135 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
433
+ [2024-08-07 13:53:10,925][Main][INFO] - [train] Step 19400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.167 | Weights_l2 --> 47146.998 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
434
+ [2024-08-07 13:54:37,089][Main][INFO] - [train] Step 19450 out of 25000 | Loss --> 0.721 | Grad_l2 --> 0.175 | Weights_l2 --> 47146.861 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
435
+ [2024-08-07 13:56:03,239][Main][INFO] - [train] Step 19500 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.723 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
436
+ [2024-08-07 13:57:31,897][Main][INFO] - [train] Step 19550 out of 25000 | Loss --> 0.684 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.586 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
437
+ [2024-08-07 13:58:57,731][Main][INFO] - [train] Step 19600 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.449 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
438
+ [2024-08-07 14:00:23,264][Main][INFO] - [train] Step 19650 out of 25000 | Loss --> 0.699 | Grad_l2 --> 0.163 | Weights_l2 --> 47146.312 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
439
+ [2024-08-07 14:01:51,434][Main][INFO] - [train] Step 19700 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.169 | Weights_l2 --> 47146.174 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
440
+ [2024-08-07 14:03:17,033][Main][INFO] - [train] Step 19750 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.190 | Weights_l2 --> 47146.037 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
441
+ [2024-08-07 14:04:42,658][Main][INFO] - [train] Step 19800 out of 25000 | Loss --> 0.686 | Grad_l2 --> 0.156 | Weights_l2 --> 47145.900 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
442
+ [2024-08-07 14:06:10,961][Main][INFO] - [train] Step 19850 out of 25000 | Loss --> 0.687 | Grad_l2 --> 0.160 | Weights_l2 --> 47145.766 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
443
+ [2024-08-07 14:07:36,890][Main][INFO] - [train] Step 19900 out of 25000 | Loss --> 0.698 | Grad_l2 --> 0.177 | Weights_l2 --> 47145.629 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
444
+ [2024-08-07 14:09:03,084][Main][INFO] - [train] Step 19950 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.166 | Weights_l2 --> 47145.492 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
445
+ [2024-08-07 14:10:31,989][Main][INFO] - [train] Step 20000 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.157 | Weights_l2 --> 47145.354 | Lr --> 0.000 | Seconds_per_step --> 1.778 |
446
+ [2024-08-07 14:10:36,920][Main][INFO] - [eval] Step 20000 out of 25000 | Loss --> 0.811 | Accuracy --> 0.849 | Time --> 4.928 |
447
+ [2024-08-07 14:15:07,226][absl][INFO] - Using default tokenizer.
448
+ [2024-08-07 14:15:07,807][Main][INFO] - [test] Step 20000 out of 25000 | Rougel --> 25.044 | Time --> 270.886 |
449
+ [2024-08-07 14:15:07,811][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-20000
450
+ [2024-08-07 14:15:07,819][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
451
+ [2024-08-07 14:15:08,650][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-20000/model.safetensors
452
+ [2024-08-07 14:15:09,813][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-20000/optimizer.bin
453
+ [2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-20000/scheduler.bin
454
+ [2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-20000/sampler.bin
455
+ [2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-20000/sampler_1.bin
456
+ [2024-08-07 14:15:09,815][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-20000/random_states_0.pkl
457
+ [2024-08-07 14:16:36,010][Main][INFO] - [train] Step 20050 out of 25000 | Loss --> 0.686 | Grad_l2 --> 0.170 | Weights_l2 --> 47145.217 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
458
+ [2024-08-07 14:18:02,209][Main][INFO] - [train] Step 20100 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.167 | Weights_l2 --> 47145.080 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
459
+ [2024-08-07 14:19:30,971][Main][INFO] - [train] Step 20150 out of 25000 | Loss --> 0.682 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.943 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
460
+ [2024-08-07 14:20:56,968][Main][INFO] - [train] Step 20200 out of 25000 | Loss --> 0.662 | Grad_l2 --> 0.154 | Weights_l2 --> 47144.809 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
461
+ [2024-08-07 14:22:22,622][Main][INFO] - [train] Step 20250 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.668 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
462
+ [2024-08-07 14:23:50,811][Main][INFO] - [train] Step 20300 out of 25000 | Loss --> 0.648 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.531 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
463
+ [2024-08-07 14:25:16,348][Main][INFO] - [train] Step 20350 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.181 | Weights_l2 --> 47144.397 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
464
+ [2024-08-07 14:26:41,867][Main][INFO] - [train] Step 20400 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.159 | Weights_l2 --> 47144.256 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
465
+ [2024-08-07 14:28:09,838][Main][INFO] - [train] Step 20450 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.268 | Weights_l2 --> 47144.123 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
466
+ [2024-08-07 14:29:35,617][Main][INFO] - [train] Step 20500 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.167 | Weights_l2 --> 47143.985 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
467
+ [2024-08-07 14:31:08,507][Main][INFO] - [train] Step 20550 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.154 | Weights_l2 --> 47143.848 | Lr --> 0.000 | Seconds_per_step --> 1.858 |
468
+ [2024-08-07 14:32:37,146][Main][INFO] - [train] Step 20600 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.173 | Weights_l2 --> 47143.711 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
469
+ [2024-08-07 14:34:02,981][Main][INFO] - [train] Step 20650 out of 25000 | Loss --> 0.663 | Grad_l2 --> 0.157 | Weights_l2 --> 47143.573 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
470
+ [2024-08-07 14:35:28,537][Main][INFO] - [train] Step 20700 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.158 | Weights_l2 --> 47143.440 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
471
+ [2024-08-07 14:36:56,530][Main][INFO] - [train] Step 20750 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.185 | Weights_l2 --> 47143.303 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
472
+ [2024-08-07 14:38:22,846][Main][INFO] - [train] Step 20800 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.152 | Weights_l2 --> 47143.162 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
473
+ [2024-08-07 14:39:48,731][Main][INFO] - [train] Step 20850 out of 25000 | Loss --> 0.630 | Grad_l2 --> 0.153 | Weights_l2 --> 47143.024 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
474
+ [2024-08-07 14:41:16,867][Main][INFO] - [train] Step 20900 out of 25000 | Loss --> 0.617 | Grad_l2 --> 0.353 | Weights_l2 --> 47142.887 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
475
+ [2024-08-07 14:42:43,044][Main][INFO] - [train] Step 20950 out of 25000 | Loss --> 0.607 | Grad_l2 --> 0.154 | Weights_l2 --> 47142.750 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
476
+ [2024-08-07 14:44:11,795][Main][INFO] - [train] Step 21000 out of 25000 | Loss --> 0.622 | Grad_l2 --> 0.153 | Weights_l2 --> 47142.612 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
477
+ [2024-08-07 14:45:37,402][Main][INFO] - [train] Step 21050 out of 25000 | Loss --> 0.620 | Grad_l2 --> 0.165 | Weights_l2 --> 47142.475 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
478
+ [2024-08-07 14:47:03,753][Main][INFO] - [train] Step 21100 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.157 | Weights_l2 --> 47142.341 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
479
+ [2024-08-07 14:48:32,029][Main][INFO] - [train] Step 21150 out of 25000 | Loss --> 0.610 | Grad_l2 --> 0.151 | Weights_l2 --> 47142.204 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
480
+ [2024-08-07 14:49:57,716][Main][INFO] - [train] Step 21200 out of 25000 | Loss --> 0.604 | Grad_l2 --> 0.191 | Weights_l2 --> 47142.067 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
481
+ [2024-08-07 14:51:23,943][Main][INFO] - [train] Step 21250 out of 25000 | Loss --> 0.591 | Grad_l2 --> 0.151 | Weights_l2 --> 47141.930 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
482
+ [2024-08-07 14:52:51,971][Main][INFO] - [train] Step 21300 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.156 | Weights_l2 --> 47141.792 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
483
+ [2024-08-07 14:54:17,847][Main][INFO] - [train] Step 21350 out of 25000 | Loss --> 0.571 | Grad_l2 --> 0.143 | Weights_l2 --> 47141.655 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
484
+ [2024-08-07 14:55:44,079][Main][INFO] - [train] Step 21400 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.154 | Weights_l2 --> 47141.521 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
485
+ [2024-08-07 14:57:13,087][Main][INFO] - [train] Step 21450 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.205 | Weights_l2 --> 47141.380 | Lr --> 0.000 | Seconds_per_step --> 1.780 |
486
+ [2024-08-07 14:58:39,380][Main][INFO] - [train] Step 21500 out of 25000 | Loss --> 0.576 | Grad_l2 --> 0.173 | Weights_l2 --> 47141.247 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
487
+ [2024-08-07 15:00:05,026][Main][INFO] - [train] Step 21550 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.161 | Weights_l2 --> 47141.109 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
488
+ [2024-08-07 15:01:33,627][Main][INFO] - [train] Step 21600 out of 25000 | Loss --> 0.572 | Grad_l2 --> 0.143 | Weights_l2 --> 47140.972 | Lr --> 0.000 | Seconds_per_step --> 1.772 |