Upload folder using huggingface_hub
Browse files- checkpoints/.hydra/config.yaml +47 -0
- checkpoints/.hydra/hydra.yaml +155 -0
- checkpoints/.hydra/overrides.yaml +1 -0
- checkpoints/checkpoint-ft-10000/model.safetensors +3 -0
- checkpoints/checkpoint-ft-10000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-ft-15000/model.safetensors +3 -0
- checkpoints/checkpoint-ft-15000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-ft-20000/model.safetensors +3 -0
- checkpoints/checkpoint-ft-20000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-ft-5000/model.safetensors +3 -0
- checkpoints/checkpoint-ft-5000/random_states_0.pkl +3 -0
- checkpoints/config.json +31 -0
- checkpoints/main.log +488 -0
checkpoints/.hydra/config.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mode: ft
|
2 |
+
device: gpu
|
3 |
+
precision: bf16
|
4 |
+
eval_only: false
|
5 |
+
predict_only: false
|
6 |
+
seed: 23
|
7 |
+
model:
|
8 |
+
klass: hf_t5
|
9 |
+
compile: true
|
10 |
+
name: google/t5-efficient-mini-nl24
|
11 |
+
random_init: false
|
12 |
+
checkpoint_path: ''
|
13 |
+
data:
|
14 |
+
dataset: flan
|
15 |
+
max_seq_len: 1024
|
16 |
+
max_target_len: 128
|
17 |
+
num_workers: 8
|
18 |
+
n_eval_examples: 500
|
19 |
+
exec_file_path: .
|
20 |
+
data_dir: .
|
21 |
+
task_dir: .
|
22 |
+
optim:
|
23 |
+
name: adamw
|
24 |
+
base_lr: 5.0e-05
|
25 |
+
final_cosine: 1.0e-06
|
26 |
+
lr_scheduler: constant
|
27 |
+
epochs: -1
|
28 |
+
batch_size: 64
|
29 |
+
grad_acc: 8
|
30 |
+
weight_decay: 0.001
|
31 |
+
grad_clip: 1.0
|
32 |
+
total_steps: 25000
|
33 |
+
warmup_steps: 2000
|
34 |
+
eval:
|
35 |
+
steps: 500
|
36 |
+
every_steps: 4000
|
37 |
+
checkpoint:
|
38 |
+
every_steps: 5000
|
39 |
+
logging:
|
40 |
+
neptune: false
|
41 |
+
neptune_creds:
|
42 |
+
project: null
|
43 |
+
api_token: null
|
44 |
+
tags: ''
|
45 |
+
every_steps: 50
|
46 |
+
grad_l2: true
|
47 |
+
weights_l2: true
|
checkpoints/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
hydra:
|
2 |
+
run:
|
3 |
+
dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags}
|
4 |
+
sweep:
|
5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
6 |
+
subdir: ${hydra.job.num}
|
7 |
+
launcher:
|
8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
9 |
+
sweeper:
|
10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
11 |
+
max_batch_size: null
|
12 |
+
params: null
|
13 |
+
help:
|
14 |
+
app_name: ${hydra.job.name}
|
15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
16 |
+
|
17 |
+
'
|
18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
19 |
+
|
20 |
+
Use --hydra-help to view Hydra specific help
|
21 |
+
|
22 |
+
'
|
23 |
+
template: '${hydra.help.header}
|
24 |
+
|
25 |
+
== Configuration groups ==
|
26 |
+
|
27 |
+
Compose your configuration from those groups (group=option)
|
28 |
+
|
29 |
+
|
30 |
+
$APP_CONFIG_GROUPS
|
31 |
+
|
32 |
+
|
33 |
+
== Config ==
|
34 |
+
|
35 |
+
Override anything in the config (foo.bar=value)
|
36 |
+
|
37 |
+
|
38 |
+
$CONFIG
|
39 |
+
|
40 |
+
|
41 |
+
${hydra.help.footer}
|
42 |
+
|
43 |
+
'
|
44 |
+
hydra_help:
|
45 |
+
template: 'Hydra (${hydra.runtime.version})
|
46 |
+
|
47 |
+
See https://hydra.cc for more info.
|
48 |
+
|
49 |
+
|
50 |
+
== Flags ==
|
51 |
+
|
52 |
+
$FLAGS_HELP
|
53 |
+
|
54 |
+
|
55 |
+
== Configuration groups ==
|
56 |
+
|
57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
58 |
+
to command line)
|
59 |
+
|
60 |
+
|
61 |
+
$HYDRA_CONFIG_GROUPS
|
62 |
+
|
63 |
+
|
64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
65 |
+
|
66 |
+
'
|
67 |
+
hydra_help: ???
|
68 |
+
hydra_logging:
|
69 |
+
version: 1
|
70 |
+
formatters:
|
71 |
+
simple:
|
72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
73 |
+
handlers:
|
74 |
+
console:
|
75 |
+
class: logging.StreamHandler
|
76 |
+
formatter: simple
|
77 |
+
stream: ext://sys.stdout
|
78 |
+
root:
|
79 |
+
level: INFO
|
80 |
+
handlers:
|
81 |
+
- console
|
82 |
+
loggers:
|
83 |
+
logging_example:
|
84 |
+
level: DEBUG
|
85 |
+
disable_existing_loggers: false
|
86 |
+
job_logging:
|
87 |
+
version: 1
|
88 |
+
formatters:
|
89 |
+
simple:
|
90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
91 |
+
handlers:
|
92 |
+
console:
|
93 |
+
class: logging.StreamHandler
|
94 |
+
formatter: simple
|
95 |
+
stream: ext://sys.stdout
|
96 |
+
file:
|
97 |
+
class: logging.FileHandler
|
98 |
+
formatter: simple
|
99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
100 |
+
root:
|
101 |
+
level: INFO
|
102 |
+
handlers:
|
103 |
+
- console
|
104 |
+
- file
|
105 |
+
disable_existing_loggers: false
|
106 |
+
env: {}
|
107 |
+
mode: RUN
|
108 |
+
searchpath: []
|
109 |
+
callbacks: {}
|
110 |
+
output_subdir: .hydra
|
111 |
+
overrides:
|
112 |
+
hydra:
|
113 |
+
- hydra.mode=RUN
|
114 |
+
task: []
|
115 |
+
job:
|
116 |
+
name: main
|
117 |
+
chdir: true
|
118 |
+
override_dirname: ''
|
119 |
+
id: ???
|
120 |
+
num: ???
|
121 |
+
config_name: default
|
122 |
+
env_set: {}
|
123 |
+
env_copy: []
|
124 |
+
config:
|
125 |
+
override_dirname:
|
126 |
+
kv_sep: '='
|
127 |
+
item_sep: ','
|
128 |
+
exclude_keys: []
|
129 |
+
runtime:
|
130 |
+
version: 1.3.2
|
131 |
+
version_base: '1.1'
|
132 |
+
cwd: /workspace/nanoT5
|
133 |
+
config_sources:
|
134 |
+
- path: hydra.conf
|
135 |
+
schema: pkg
|
136 |
+
provider: hydra
|
137 |
+
- path: /workspace/nanoT5/nanoT5/configs
|
138 |
+
schema: file
|
139 |
+
provider: main
|
140 |
+
- path: ''
|
141 |
+
schema: structured
|
142 |
+
provider: schema
|
143 |
+
output_dir: /workspace/nanoT5/logs/2024-08-07/04-07-32-
|
144 |
+
choices:
|
145 |
+
local_env: default
|
146 |
+
hydra/env: default
|
147 |
+
hydra/callbacks: null
|
148 |
+
hydra/job_logging: default
|
149 |
+
hydra/hydra_logging: default
|
150 |
+
hydra/hydra_help: default
|
151 |
+
hydra/help: default
|
152 |
+
hydra/sweeper: basic
|
153 |
+
hydra/launcher: basic
|
154 |
+
hydra/output: default
|
155 |
+
verbose: false
|
checkpoints/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[]
|
checkpoints/checkpoint-ft-10000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:192ce7574b7f1fee56d28b4233c1419492bd07c85b0e294f522c95bf9d1a4cc0
|
3 |
+
size 502583392
|
checkpoints/checkpoint-ft-10000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9cdbcf0f8b24de4549975f78565d5c3dbc12269874ba5a2e8de35a84257f5ac0
|
3 |
+
size 14344
|
checkpoints/checkpoint-ft-15000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02d2317f4e6810fdf537af8a693a836cc125b2d6dd6492c8a46a2268a69a6c79
|
3 |
+
size 502583392
|
checkpoints/checkpoint-ft-15000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cfa4f09fdb0f32aa3d01eac304f065f247e976e784f9bc6768ab19b229734962
|
3 |
+
size 14344
|
checkpoints/checkpoint-ft-20000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6be6570c10abf613503afad9fd26152a9221172840b77b47d9475abd610487b6
|
3 |
+
size 502583392
|
checkpoints/checkpoint-ft-20000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:504ad73e70abf3cd83e3f02d6c2a0e1ef5491c7d1cf9af3ed2cfbeb246b09102
|
3 |
+
size 14344
|
checkpoints/checkpoint-ft-5000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6501a8766a1f4272a59f5175863ae3388feb447e634e37534fa861b9ef6fb2c
|
3 |
+
size 502583392
|
checkpoints/checkpoint-ft-5000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2075711e135da3b9d8d419bbe69236c8015d5e6b2ea68c9ffe2cec404ed7d1eb
|
3 |
+
size 14344
|
checkpoints/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "google/t5-efficient-mini-nl24",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"classifier_dropout": 0.0,
|
7 |
+
"d_ff": 1536,
|
8 |
+
"d_kv": 64,
|
9 |
+
"d_model": 384,
|
10 |
+
"decoder_start_token_id": 0,
|
11 |
+
"dense_act_fn": "relu",
|
12 |
+
"dropout_rate": 0.1,
|
13 |
+
"eos_token_id": 1,
|
14 |
+
"feed_forward_proj": "relu",
|
15 |
+
"initializer_factor": 1.0,
|
16 |
+
"is_encoder_decoder": true,
|
17 |
+
"is_gated_act": false,
|
18 |
+
"layer_norm_epsilon": 1e-06,
|
19 |
+
"model_type": "t5",
|
20 |
+
"n_positions": 512,
|
21 |
+
"num_decoder_layers": 24,
|
22 |
+
"num_heads": 8,
|
23 |
+
"num_layers": 24,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"relative_attention_max_distance": 128,
|
26 |
+
"relative_attention_num_buckets": 32,
|
27 |
+
"torch_dtype": "float32",
|
28 |
+
"transformers_version": "4.44.0",
|
29 |
+
"use_cache": true,
|
30 |
+
"vocab_size": 32128
|
31 |
+
}
|
checkpoints/main.log
ADDED
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[2024-08-07 04:07:32,726][Main][INFO] - Distributed environment: NO
|
2 |
+
Num processes: 1
|
3 |
+
Process index: 0
|
4 |
+
Local process index: 0
|
5 |
+
Device: cuda
|
6 |
+
|
7 |
+
Mixed precision type: bf16
|
8 |
+
|
9 |
+
[2024-08-07 04:07:32,726][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-08-07/04-07-32-
|
10 |
+
[2024-08-07 04:12:18,030][Main][INFO] - [train] Step 50 out of 25000 | Loss --> 11.343 | Grad_l2 --> 43.017 | Weights_l2 --> 47200.114 | Lr --> 0.000 | Seconds_per_step --> 5.126 |
|
11 |
+
[2024-08-07 04:13:43,886][Main][INFO] - [train] Step 100 out of 25000 | Loss --> 9.276 | Grad_l2 --> 45.242 | Weights_l2 --> 47199.978 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
12 |
+
[2024-08-07 04:15:10,676][Main][INFO] - [train] Step 150 out of 25000 | Loss --> 8.145 | Grad_l2 --> 47.689 | Weights_l2 --> 47199.838 | Lr --> 0.000 | Seconds_per_step --> 1.736 |
|
13 |
+
[2024-08-07 04:16:36,244][Main][INFO] - [train] Step 200 out of 25000 | Loss --> 7.244 | Grad_l2 --> 49.128 | Weights_l2 --> 47199.701 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
14 |
+
[2024-08-07 04:18:03,337][Main][INFO] - [train] Step 250 out of 25000 | Loss --> 6.159 | Grad_l2 --> 47.661 | Weights_l2 --> 47199.569 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
|
15 |
+
[2024-08-07 04:19:30,382][Main][INFO] - [train] Step 300 out of 25000 | Loss --> 4.863 | Grad_l2 --> 42.692 | Weights_l2 --> 47199.433 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
|
16 |
+
[2024-08-07 04:20:56,228][Main][INFO] - [train] Step 350 out of 25000 | Loss --> 3.515 | Grad_l2 --> 32.667 | Weights_l2 --> 47199.297 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
17 |
+
[2024-08-07 04:22:22,722][Main][INFO] - [train] Step 400 out of 25000 | Loss --> 2.114 | Grad_l2 --> 15.583 | Weights_l2 --> 47199.164 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
|
18 |
+
[2024-08-07 04:23:47,989][Main][INFO] - [train] Step 450 out of 25000 | Loss --> 0.978 | Grad_l2 --> 1.009 | Weights_l2 --> 47199.033 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
|
19 |
+
[2024-08-07 04:25:15,079][Main][INFO] - [train] Step 500 out of 25000 | Loss --> 0.806 | Grad_l2 --> 0.498 | Weights_l2 --> 47198.897 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
|
20 |
+
[2024-08-07 04:26:40,081][Main][INFO] - [train] Step 550 out of 25000 | Loss --> 0.808 | Grad_l2 --> 0.696 | Weights_l2 --> 47198.757 | Lr --> 0.000 | Seconds_per_step --> 1.700 |
|
21 |
+
[2024-08-07 04:28:06,532][Main][INFO] - [train] Step 600 out of 25000 | Loss --> 0.785 | Grad_l2 --> 0.455 | Weights_l2 --> 47198.620 | Lr --> 0.000 | Seconds_per_step --> 1.729 |
|
22 |
+
[2024-08-07 04:29:33,537][Main][INFO] - [train] Step 650 out of 25000 | Loss --> 0.787 | Grad_l2 --> 0.733 | Weights_l2 --> 47198.484 | Lr --> 0.000 | Seconds_per_step --> 1.740 |
|
23 |
+
[2024-08-07 04:30:59,366][Main][INFO] - [train] Step 700 out of 25000 | Loss --> 0.735 | Grad_l2 --> 0.463 | Weights_l2 --> 47198.347 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
24 |
+
[2024-08-07 04:32:26,571][Main][INFO] - [train] Step 750 out of 25000 | Loss --> 0.714 | Grad_l2 --> 0.298 | Weights_l2 --> 47198.207 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
|
25 |
+
[2024-08-07 04:33:51,938][Main][INFO] - [train] Step 800 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.358 | Weights_l2 --> 47198.070 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
26 |
+
[2024-08-07 04:35:18,887][Main][INFO] - [train] Step 850 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.314 | Weights_l2 --> 47197.933 | Lr --> 0.000 | Seconds_per_step --> 1.739 |
|
27 |
+
[2024-08-07 04:36:44,507][Main][INFO] - [train] Step 900 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.621 | Weights_l2 --> 47197.796 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
28 |
+
[2024-08-07 04:38:11,744][Main][INFO] - [train] Step 950 out of 25000 | Loss --> 0.681 | Grad_l2 --> 0.380 | Weights_l2 --> 47197.660 | Lr --> 0.000 | Seconds_per_step --> 1.745 |
|
29 |
+
[2024-08-07 04:39:37,561][Main][INFO] - [train] Step 1000 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.373 | Weights_l2 --> 47197.523 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
30 |
+
[2024-08-07 04:41:04,005][Main][INFO] - [train] Step 1050 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.522 | Weights_l2 --> 47197.382 | Lr --> 0.000 | Seconds_per_step --> 1.729 |
|
31 |
+
[2024-08-07 04:42:29,034][Main][INFO] - [train] Step 1100 out of 25000 | Loss --> 0.685 | Grad_l2 --> 0.296 | Weights_l2 --> 47197.245 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
|
32 |
+
[2024-08-07 04:43:55,529][Main][INFO] - [train] Step 1150 out of 25000 | Loss --> 0.674 | Grad_l2 --> 0.331 | Weights_l2 --> 47197.109 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
|
33 |
+
[2024-08-07 04:45:20,774][Main][INFO] - [train] Step 1200 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.590 | Weights_l2 --> 47196.972 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
|
34 |
+
[2024-08-07 04:46:47,532][Main][INFO] - [train] Step 1250 out of 25000 | Loss --> 0.665 | Grad_l2 --> 0.349 | Weights_l2 --> 47196.835 | Lr --> 0.000 | Seconds_per_step --> 1.735 |
|
35 |
+
[2024-08-07 04:48:14,549][Main][INFO] - [train] Step 1300 out of 25000 | Loss --> 0.663 | Grad_l2 --> 0.495 | Weights_l2 --> 47196.698 | Lr --> 0.000 | Seconds_per_step --> 1.740 |
|
36 |
+
[2024-08-07 04:49:40,267][Main][INFO] - [train] Step 1350 out of 25000 | Loss --> 0.646 | Grad_l2 --> 0.261 | Weights_l2 --> 47196.557 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
37 |
+
[2024-08-07 04:51:07,119][Main][INFO] - [train] Step 1400 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.245 | Weights_l2 --> 47196.420 | Lr --> 0.000 | Seconds_per_step --> 1.737 |
|
38 |
+
[2024-08-07 04:52:32,692][Main][INFO] - [train] Step 1450 out of 25000 | Loss --> 0.642 | Grad_l2 --> 0.329 | Weights_l2 --> 47196.283 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
39 |
+
[2024-08-07 04:53:59,630][Main][INFO] - [train] Step 1500 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.347 | Weights_l2 --> 47196.146 | Lr --> 0.000 | Seconds_per_step --> 1.739 |
|
40 |
+
[2024-08-07 04:55:25,305][Main][INFO] - [train] Step 1550 out of 25000 | Loss --> 0.645 | Grad_l2 --> 0.239 | Weights_l2 --> 47196.009 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
41 |
+
[2024-08-07 04:56:52,437][Main][INFO] - [train] Step 1600 out of 25000 | Loss --> 0.670 | Grad_l2 --> 0.247 | Weights_l2 --> 47195.869 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
42 |
+
[2024-08-07 04:58:18,243][Main][INFO] - [train] Step 1650 out of 25000 | Loss --> 0.630 | Grad_l2 --> 0.272 | Weights_l2 --> 47195.736 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
43 |
+
[2024-08-07 04:59:45,425][Main][INFO] - [train] Step 1700 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.281 | Weights_l2 --> 47195.599 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
|
44 |
+
[2024-08-07 05:01:11,208][Main][INFO] - [train] Step 1750 out of 25000 | Loss --> 0.640 | Grad_l2 --> 0.281 | Weights_l2 --> 47195.462 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
45 |
+
[2024-08-07 05:02:38,396][Main][INFO] - [train] Step 1800 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.226 | Weights_l2 --> 47195.321 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
|
46 |
+
[2024-08-07 05:04:04,169][Main][INFO] - [train] Step 1850 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.526 | Weights_l2 --> 47195.184 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
47 |
+
[2024-08-07 05:05:31,337][Main][INFO] - [train] Step 1900 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.213 | Weights_l2 --> 47195.047 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
48 |
+
[2024-08-07 05:06:58,450][Main][INFO] - [train] Step 1950 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.337 | Weights_l2 --> 47194.910 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
|
49 |
+
[2024-08-07 05:08:24,208][Main][INFO] - [train] Step 2000 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.220 | Weights_l2 --> 47194.773 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
50 |
+
[2024-08-07 05:09:51,402][Main][INFO] - [train] Step 2050 out of 25000 | Loss --> 0.615 | Grad_l2 --> 0.250 | Weights_l2 --> 47194.632 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
|
51 |
+
[2024-08-07 05:11:17,134][Main][INFO] - [train] Step 2100 out of 25000 | Loss --> 0.614 | Grad_l2 --> 0.443 | Weights_l2 --> 47194.495 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
52 |
+
[2024-08-07 05:12:44,287][Main][INFO] - [train] Step 2150 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.269 | Weights_l2 --> 47194.358 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
53 |
+
[2024-08-07 05:14:09,998][Main][INFO] - [train] Step 2200 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.349 | Weights_l2 --> 47194.221 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
54 |
+
[2024-08-07 05:15:37,150][Main][INFO] - [train] Step 2250 out of 25000 | Loss --> 0.602 | Grad_l2 --> 0.210 | Weights_l2 --> 47194.084 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
55 |
+
[2024-08-07 05:17:02,865][Main][INFO] - [train] Step 2300 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.206 | Weights_l2 --> 47193.947 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
56 |
+
[2024-08-07 05:18:29,986][Main][INFO] - [train] Step 2350 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.299 | Weights_l2 --> 47193.810 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
|
57 |
+
[2024-08-07 05:19:55,710][Main][INFO] - [train] Step 2400 out of 25000 | Loss --> 0.634 | Grad_l2 --> 0.229 | Weights_l2 --> 47193.669 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
58 |
+
[2024-08-07 05:21:22,808][Main][INFO] - [train] Step 2450 out of 25000 | Loss --> 0.634 | Grad_l2 --> 0.187 | Weights_l2 --> 47193.532 | Lr --> 0.000 | Seconds_per_step --> 1.742 |
|
59 |
+
[2024-08-07 05:22:48,543][Main][INFO] - [train] Step 2500 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.235 | Weights_l2 --> 47193.395 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
60 |
+
[2024-08-07 05:24:15,698][Main][INFO] - [train] Step 2550 out of 25000 | Loss --> 0.621 | Grad_l2 --> 0.222 | Weights_l2 --> 47193.258 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
61 |
+
[2024-08-07 05:25:41,174][Main][INFO] - [train] Step 2600 out of 25000 | Loss --> 0.598 | Grad_l2 --> 0.200 | Weights_l2 --> 47193.121 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
62 |
+
[2024-08-07 05:27:08,312][Main][INFO] - [train] Step 2650 out of 25000 | Loss --> 0.605 | Grad_l2 --> 0.199 | Weights_l2 --> 47192.984 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
63 |
+
[2024-08-07 05:28:35,494][Main][INFO] - [train] Step 2700 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.190 | Weights_l2 --> 47192.847 | Lr --> 0.000 | Seconds_per_step --> 1.744 |
|
64 |
+
[2024-08-07 05:30:01,277][Main][INFO] - [train] Step 2750 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.208 | Weights_l2 --> 47192.710 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
65 |
+
[2024-08-07 05:31:28,184][Main][INFO] - [train] Step 2800 out of 25000 | Loss --> 0.625 | Grad_l2 --> 0.262 | Weights_l2 --> 47192.573 | Lr --> 0.000 | Seconds_per_step --> 1.738 |
|
66 |
+
[2024-08-07 05:32:53,230][Main][INFO] - [train] Step 2850 out of 25000 | Loss --> 0.609 | Grad_l2 --> 0.292 | Weights_l2 --> 47192.432 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
|
67 |
+
[2024-08-07 05:34:19,792][Main][INFO] - [train] Step 2900 out of 25000 | Loss --> 0.597 | Grad_l2 --> 0.184 | Weights_l2 --> 47192.295 | Lr --> 0.000 | Seconds_per_step --> 1.731 |
|
68 |
+
[2024-08-07 05:35:44,824][Main][INFO] - [train] Step 2950 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.224 | Weights_l2 --> 47192.158 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
|
69 |
+
[2024-08-07 05:37:11,229][Main][INFO] - [train] Step 3000 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.342 | Weights_l2 --> 47192.021 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
|
70 |
+
[2024-08-07 05:38:36,575][Main][INFO] - [train] Step 3050 out of 25000 | Loss --> 0.599 | Grad_l2 --> 0.171 | Weights_l2 --> 47191.884 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
71 |
+
[2024-08-07 05:40:03,729][Main][INFO] - [train] Step 3100 out of 25000 | Loss --> 0.592 | Grad_l2 --> 0.223 | Weights_l2 --> 47191.743 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
72 |
+
[2024-08-07 05:41:29,379][Main][INFO] - [train] Step 3150 out of 25000 | Loss --> 0.601 | Grad_l2 --> 0.294 | Weights_l2 --> 47191.610 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
73 |
+
[2024-08-07 05:42:56,819][Main][INFO] - [train] Step 3200 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.207 | Weights_l2 --> 47191.469 | Lr --> 0.000 | Seconds_per_step --> 1.749 |
|
74 |
+
[2024-08-07 05:44:22,604][Main][INFO] - [train] Step 3250 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.257 | Weights_l2 --> 47191.332 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
75 |
+
[2024-08-07 05:45:49,918][Main][INFO] - [train] Step 3300 out of 25000 | Loss --> 0.612 | Grad_l2 --> 0.377 | Weights_l2 --> 47191.199 | Lr --> 0.000 | Seconds_per_step --> 1.746 |
|
76 |
+
[2024-08-07 05:47:15,156][Main][INFO] - [train] Step 3350 out of 25000 | Loss --> 0.612 | Grad_l2 --> 0.250 | Weights_l2 --> 47191.062 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
|
77 |
+
[2024-08-07 05:48:42,282][Main][INFO] - [train] Step 3400 out of 25000 | Loss --> 0.591 | Grad_l2 --> 0.204 | Weights_l2 --> 47190.925 | Lr --> 0.000 | Seconds_per_step --> 1.743 |
|
78 |
+
[2024-08-07 05:50:08,971][Main][INFO] - [train] Step 3450 out of 25000 | Loss --> 0.592 | Grad_l2 --> 0.205 | Weights_l2 --> 47190.784 | Lr --> 0.000 | Seconds_per_step --> 1.734 |
|
79 |
+
[2024-08-07 05:51:34,049][Main][INFO] - [train] Step 3500 out of 25000 | Loss --> 0.604 | Grad_l2 --> 0.239 | Weights_l2 --> 47190.647 | Lr --> 0.000 | Seconds_per_step --> 1.702 |
|
80 |
+
[2024-08-07 05:53:01,117][Main][INFO] - [train] Step 3550 out of 25000 | Loss --> 0.563 | Grad_l2 --> 0.270 | Weights_l2 --> 47190.510 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
|
81 |
+
[2024-08-07 05:54:26,716][Main][INFO] - [train] Step 3600 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.448 | Weights_l2 --> 47190.373 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
82 |
+
[2024-08-07 05:55:53,130][Main][INFO] - [train] Step 3650 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.234 | Weights_l2 --> 47190.236 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
|
83 |
+
[2024-08-07 05:57:18,221][Main][INFO] - [train] Step 3700 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.206 | Weights_l2 --> 47190.099 | Lr --> 0.000 | Seconds_per_step --> 1.702 |
|
84 |
+
[2024-08-07 05:58:45,613][Main][INFO] - [train] Step 3750 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.159 | Weights_l2 --> 47189.962 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
|
85 |
+
[2024-08-07 06:00:11,446][Main][INFO] - [train] Step 3800 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.199 | Weights_l2 --> 47189.821 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
86 |
+
[2024-08-07 06:01:38,826][Main][INFO] - [train] Step 3850 out of 25000 | Loss --> 0.581 | Grad_l2 --> 0.248 | Weights_l2 --> 47189.684 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
|
87 |
+
[2024-08-07 06:03:05,865][Main][INFO] - [train] Step 3900 out of 25000 | Loss --> 0.597 | Grad_l2 --> 0.195 | Weights_l2 --> 47189.547 | Lr --> 0.000 | Seconds_per_step --> 1.741 |
|
88 |
+
[2024-08-07 06:04:31,625][Main][INFO] - [train] Step 3950 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.192 | Weights_l2 --> 47189.410 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
89 |
+
[2024-08-07 06:05:59,020][Main][INFO] - [train] Step 4000 out of 25000 | Loss --> 0.584 | Grad_l2 --> 0.200 | Weights_l2 --> 47189.273 | Lr --> 0.000 | Seconds_per_step --> 1.748 |
|
90 |
+
[2024-08-07 06:09:10,498][Main][INFO] - [eval] Step 4000 out of 25000 | Loss --> 0.941 | Accuracy --> 0.832 | Time --> 191.474 |
|
91 |
+
[2024-08-07 06:13:48,778][absl][INFO] - Using default tokenizer.
|
92 |
+
[2024-08-07 06:13:49,269][Main][INFO] - [test] Step 4000 out of 25000 | Rougel --> 15.204 | Time --> 278.771 |
|
93 |
+
[2024-08-07 06:15:14,427][Main][INFO] - [train] Step 4050 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.288 | Weights_l2 --> 47189.135 | Lr --> 0.000 | Seconds_per_step --> 1.703 |
|
94 |
+
[2024-08-07 06:16:39,500][Main][INFO] - [train] Step 4100 out of 25000 | Loss --> 0.567 | Grad_l2 --> 0.205 | Weights_l2 --> 47188.998 | Lr --> 0.000 | Seconds_per_step --> 1.701 |
|
95 |
+
[2024-08-07 06:18:07,374][Main][INFO] - [train] Step 4150 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.203 | Weights_l2 --> 47188.861 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
|
96 |
+
[2024-08-07 06:19:32,587][Main][INFO] - [train] Step 4200 out of 25000 | Loss --> 0.596 | Grad_l2 --> 0.269 | Weights_l2 --> 47188.724 | Lr --> 0.000 | Seconds_per_step --> 1.704 |
|
97 |
+
[2024-08-07 06:20:58,195][Main][INFO] - [train] Step 4250 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.279 | Weights_l2 --> 47188.587 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
98 |
+
[2024-08-07 06:22:26,587][Main][INFO] - [train] Step 4300 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.256 | Weights_l2 --> 47188.446 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
99 |
+
[2024-08-07 06:23:52,408][Main][INFO] - [train] Step 4350 out of 25000 | Loss --> 0.567 | Grad_l2 --> 0.185 | Weights_l2 --> 47188.309 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
100 |
+
[2024-08-07 06:25:18,175][Main][INFO] - [train] Step 4400 out of 25000 | Loss --> 0.587 | Grad_l2 --> 0.188 | Weights_l2 --> 47188.172 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
101 |
+
[2024-08-07 06:26:46,749][Main][INFO] - [train] Step 4450 out of 25000 | Loss --> 0.580 | Grad_l2 --> 0.164 | Weights_l2 --> 47188.035 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
|
102 |
+
[2024-08-07 06:28:12,711][Main][INFO] - [train] Step 4500 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.221 | Weights_l2 --> 47187.898 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
103 |
+
[2024-08-07 06:29:38,541][Main][INFO] - [train] Step 4550 out of 25000 | Loss --> 0.586 | Grad_l2 --> 0.191 | Weights_l2 --> 47187.761 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
104 |
+
[2024-08-07 06:31:06,867][Main][INFO] - [train] Step 4600 out of 25000 | Loss --> 0.574 | Grad_l2 --> 0.215 | Weights_l2 --> 47187.628 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
|
105 |
+
[2024-08-07 06:32:32,538][Main][INFO] - [train] Step 4650 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.220 | Weights_l2 --> 47187.487 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
106 |
+
[2024-08-07 06:33:58,044][Main][INFO] - [train] Step 4700 out of 25000 | Loss --> 0.571 | Grad_l2 --> 0.200 | Weights_l2 --> 47187.346 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
107 |
+
[2024-08-07 06:35:26,162][Main][INFO] - [train] Step 4750 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.273 | Weights_l2 --> 47187.209 | Lr --> 0.000 | Seconds_per_step --> 1.762 |
|
108 |
+
[2024-08-07 06:36:51,666][Main][INFO] - [train] Step 4800 out of 25000 | Loss --> 0.560 | Grad_l2 --> 0.188 | Weights_l2 --> 47187.071 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
109 |
+
[2024-08-07 06:38:17,107][Main][INFO] - [train] Step 4850 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.293 | Weights_l2 --> 47186.938 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
110 |
+
[2024-08-07 06:39:44,965][Main][INFO] - [train] Step 4900 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.177 | Weights_l2 --> 47186.797 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
|
111 |
+
[2024-08-07 06:41:10,239][Main][INFO] - [train] Step 4950 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.170 | Weights_l2 --> 47186.660 | Lr --> 0.000 | Seconds_per_step --> 1.705 |
|
112 |
+
[2024-08-07 06:42:35,764][Main][INFO] - [train] Step 5000 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.219 | Weights_l2 --> 47186.523 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
113 |
+
[2024-08-07 06:42:35,765][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-5000
|
114 |
+
[2024-08-07 06:42:35,771][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
115 |
+
[2024-08-07 06:42:36,580][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-5000/model.safetensors
|
116 |
+
[2024-08-07 06:42:37,727][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-5000/optimizer.bin
|
117 |
+
[2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-5000/scheduler.bin
|
118 |
+
[2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-5000/sampler.bin
|
119 |
+
[2024-08-07 06:42:37,728][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-5000/sampler_1.bin
|
120 |
+
[2024-08-07 06:42:37,729][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-5000/random_states_0.pkl
|
121 |
+
[2024-08-07 06:44:06,450][Main][INFO] - [train] Step 5050 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.230 | Weights_l2 --> 47186.386 | Lr --> 0.000 | Seconds_per_step --> 1.814 |
|
122 |
+
[2024-08-07 06:45:32,495][Main][INFO] - [train] Step 5100 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.191 | Weights_l2 --> 47186.248 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
123 |
+
[2024-08-07 06:46:58,589][Main][INFO] - [train] Step 5150 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.170 | Weights_l2 --> 47186.111 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
124 |
+
[2024-08-07 06:48:27,030][Main][INFO] - [train] Step 5200 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.164 | Weights_l2 --> 47185.974 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
125 |
+
[2024-08-07 06:49:53,124][Main][INFO] - [train] Step 5250 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.174 | Weights_l2 --> 47185.837 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
126 |
+
[2024-08-07 06:51:19,188][Main][INFO] - [train] Step 5300 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.179 | Weights_l2 --> 47185.700 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
127 |
+
[2024-08-07 06:52:47,972][Main][INFO] - [train] Step 5350 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.183 | Weights_l2 --> 47185.558 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
128 |
+
[2024-08-07 06:54:13,671][Main][INFO] - [train] Step 5400 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.206 | Weights_l2 --> 47185.421 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
129 |
+
[2024-08-07 06:55:41,483][Main][INFO] - [train] Step 5450 out of 25000 | Loss --> 0.593 | Grad_l2 --> 0.205 | Weights_l2 --> 47185.288 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
|
130 |
+
[2024-08-07 06:57:06,833][Main][INFO] - [train] Step 5500 out of 25000 | Loss --> 0.533 | Grad_l2 --> 0.157 | Weights_l2 --> 47185.151 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
131 |
+
[2024-08-07 06:58:32,236][Main][INFO] - [train] Step 5550 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.209 | Weights_l2 --> 47185.009 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
132 |
+
[2024-08-07 07:00:00,311][Main][INFO] - [train] Step 5600 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.202 | Weights_l2 --> 47184.872 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
|
133 |
+
[2024-08-07 07:01:26,167][Main][INFO] - [train] Step 5650 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.157 | Weights_l2 --> 47184.735 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
134 |
+
[2024-08-07 07:02:51,945][Main][INFO] - [train] Step 5700 out of 25000 | Loss --> 0.559 | Grad_l2 --> 0.160 | Weights_l2 --> 47184.598 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
135 |
+
[2024-08-07 07:04:20,503][Main][INFO] - [train] Step 5750 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.169 | Weights_l2 --> 47184.460 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
|
136 |
+
[2024-08-07 07:05:45,999][Main][INFO] - [train] Step 5800 out of 25000 | Loss --> 0.548 | Grad_l2 --> 0.176 | Weights_l2 --> 47184.323 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
137 |
+
[2024-08-07 07:07:11,484][Main][INFO] - [train] Step 5850 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.161 | Weights_l2 --> 47184.186 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
138 |
+
[2024-08-07 07:08:39,677][Main][INFO] - [train] Step 5900 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.157 | Weights_l2 --> 47184.049 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
139 |
+
[2024-08-07 07:10:05,179][Main][INFO] - [train] Step 5950 out of 25000 | Loss --> 0.569 | Grad_l2 --> 0.147 | Weights_l2 --> 47183.911 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
140 |
+
[2024-08-07 07:11:30,734][Main][INFO] - [train] Step 6000 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.202 | Weights_l2 --> 47183.774 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
141 |
+
[2024-08-07 07:12:59,400][Main][INFO] - [train] Step 6050 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.148 | Weights_l2 --> 47183.633 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
142 |
+
[2024-08-07 07:14:25,522][Main][INFO] - [train] Step 6100 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.178 | Weights_l2 --> 47183.496 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
143 |
+
[2024-08-07 07:15:51,608][Main][INFO] - [train] Step 6150 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.176 | Weights_l2 --> 47183.362 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
144 |
+
[2024-08-07 07:17:20,427][Main][INFO] - [train] Step 6200 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.166 | Weights_l2 --> 47183.225 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
145 |
+
[2024-08-07 07:18:46,385][Main][INFO] - [train] Step 6250 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.181 | Weights_l2 --> 47183.084 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
146 |
+
[2024-08-07 07:20:11,716][Main][INFO] - [train] Step 6300 out of 25000 | Loss --> 0.576 | Grad_l2 --> 0.153 | Weights_l2 --> 47182.951 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
147 |
+
[2024-08-07 07:21:39,494][Main][INFO] - [train] Step 6350 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.193 | Weights_l2 --> 47182.813 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
|
148 |
+
[2024-08-07 07:23:04,833][Main][INFO] - [train] Step 6400 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.158 | Weights_l2 --> 47182.676 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
149 |
+
[2024-08-07 07:24:30,152][Main][INFO] - [train] Step 6450 out of 25000 | Loss --> 0.544 | Grad_l2 --> 0.160 | Weights_l2 --> 47182.539 | Lr --> 0.000 | Seconds_per_step --> 1.706 |
|
150 |
+
[2024-08-07 07:25:57,971][Main][INFO] - [train] Step 6500 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.179 | Weights_l2 --> 47182.398 | Lr --> 0.000 | Seconds_per_step --> 1.756 |
|
151 |
+
[2024-08-07 07:27:23,312][Main][INFO] - [train] Step 6550 out of 25000 | Loss --> 0.557 | Grad_l2 --> 0.217 | Weights_l2 --> 47182.260 | Lr --> 0.000 | Seconds_per_step --> 1.707 |
|
152 |
+
[2024-08-07 07:28:48,618][Main][INFO] - [train] Step 6600 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.152 | Weights_l2 --> 47182.127 | Lr --> 0.000 | Seconds_per_step --> 1.706 |
|
153 |
+
[2024-08-07 07:30:16,544][Main][INFO] - [train] Step 6650 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.150 | Weights_l2 --> 47181.986 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
|
154 |
+
[2024-08-07 07:31:42,081][Main][INFO] - [train] Step 6700 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.186 | Weights_l2 --> 47181.848 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
155 |
+
[2024-08-07 07:33:07,474][Main][INFO] - [train] Step 6750 out of 25000 | Loss --> 0.582 | Grad_l2 --> 0.156 | Weights_l2 --> 47181.711 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
156 |
+
[2024-08-07 07:34:36,072][Main][INFO] - [train] Step 6800 out of 25000 | Loss --> 0.541 | Grad_l2 --> 0.188 | Weights_l2 --> 47181.574 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
157 |
+
[2024-08-07 07:36:01,451][Main][INFO] - [train] Step 6850 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.205 | Weights_l2 --> 47181.437 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
158 |
+
[2024-08-07 07:37:26,901][Main][INFO] - [train] Step 6900 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.152 | Weights_l2 --> 47181.299 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
159 |
+
[2024-08-07 07:38:55,145][Main][INFO] - [train] Step 6950 out of 25000 | Loss --> 0.587 | Grad_l2 --> 0.193 | Weights_l2 --> 47181.162 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
160 |
+
[2024-08-07 07:40:21,155][Main][INFO] - [train] Step 7000 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.172 | Weights_l2 --> 47181.025 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
161 |
+
[2024-08-07 07:41:48,993][Main][INFO] - [train] Step 7050 out of 25000 | Loss --> 0.551 | Grad_l2 --> 0.168 | Weights_l2 --> 47180.887 | Lr --> 0.000 | Seconds_per_step --> 1.757 |
|
162 |
+
[2024-08-07 07:43:14,381][Main][INFO] - [train] Step 7100 out of 25000 | Loss --> 0.550 | Grad_l2 --> 0.143 | Weights_l2 --> 47180.750 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
163 |
+
[2024-08-07 07:44:40,205][Main][INFO] - [train] Step 7150 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.148 | Weights_l2 --> 47180.609 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
164 |
+
[2024-08-07 07:46:09,019][Main][INFO] - [train] Step 7200 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.179 | Weights_l2 --> 47180.472 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
165 |
+
[2024-08-07 07:47:35,121][Main][INFO] - [train] Step 7250 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.170 | Weights_l2 --> 47180.338 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
166 |
+
[2024-08-07 07:49:00,910][Main][INFO] - [train] Step 7300 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.161 | Weights_l2 --> 47180.201 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
167 |
+
[2024-08-07 07:50:29,152][Main][INFO] - [train] Step 7350 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.159 | Weights_l2 --> 47180.064 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
168 |
+
[2024-08-07 07:51:54,570][Main][INFO] - [train] Step 7400 out of 25000 | Loss --> 0.535 | Grad_l2 --> 0.143 | Weights_l2 --> 47179.926 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
169 |
+
[2024-08-07 07:53:19,946][Main][INFO] - [train] Step 7450 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.153 | Weights_l2 --> 47179.789 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
170 |
+
[2024-08-07 07:54:47,864][Main][INFO] - [train] Step 7500 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.161 | Weights_l2 --> 47179.652 | Lr --> 0.000 | Seconds_per_step --> 1.758 |
|
171 |
+
[2024-08-07 07:56:13,855][Main][INFO] - [train] Step 7550 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.140 | Weights_l2 --> 47179.511 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
172 |
+
[2024-08-07 07:57:39,992][Main][INFO] - [train] Step 7600 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.201 | Weights_l2 --> 47179.377 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
173 |
+
[2024-08-07 07:59:08,451][Main][INFO] - [train] Step 7650 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.179 | Weights_l2 --> 47179.236 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
174 |
+
[2024-08-07 08:00:34,328][Main][INFO] - [train] Step 7700 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.158 | Weights_l2 --> 47179.103 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
|
175 |
+
[2024-08-07 08:01:59,720][Main][INFO] - [train] Step 7750 out of 25000 | Loss --> 0.579 | Grad_l2 --> 0.156 | Weights_l2 --> 47178.965 | Lr --> 0.000 | Seconds_per_step --> 1.708 |
|
176 |
+
[2024-08-07 08:03:27,727][Main][INFO] - [train] Step 7800 out of 25000 | Loss --> 0.565 | Grad_l2 --> 0.143 | Weights_l2 --> 47178.828 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
|
177 |
+
[2024-08-07 08:04:53,570][Main][INFO] - [train] Step 7850 out of 25000 | Loss --> 0.524 | Grad_l2 --> 0.151 | Weights_l2 --> 47178.687 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
178 |
+
[2024-08-07 08:06:19,787][Main][INFO] - [train] Step 7900 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.145 | Weights_l2 --> 47178.549 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
179 |
+
[2024-08-07 08:07:48,354][Main][INFO] - [train] Step 7950 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.212 | Weights_l2 --> 47178.412 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
|
180 |
+
[2024-08-07 08:09:13,971][Main][INFO] - [train] Step 8000 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.167 | Weights_l2 --> 47178.275 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
181 |
+
[2024-08-07 08:09:18,908][Main][INFO] - [eval] Step 8000 out of 25000 | Loss --> 0.880 | Accuracy --> 0.838 | Time --> 4.933 |
|
182 |
+
[2024-08-07 08:13:58,225][absl][INFO] - Using default tokenizer.
|
183 |
+
[2024-08-07 08:13:58,808][Main][INFO] - [test] Step 8000 out of 25000 | Rougel --> 21.696 | Time --> 279.900 |
|
184 |
+
[2024-08-07 08:15:24,876][Main][INFO] - [train] Step 8050 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.208 | Weights_l2 --> 47178.138 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
185 |
+
[2024-08-07 08:16:53,543][Main][INFO] - [train] Step 8100 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.159 | Weights_l2 --> 47178.000 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
186 |
+
[2024-08-07 08:18:19,674][Main][INFO] - [train] Step 8150 out of 25000 | Loss --> 0.540 | Grad_l2 --> 0.157 | Weights_l2 --> 47177.867 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
187 |
+
[2024-08-07 08:19:45,669][Main][INFO] - [train] Step 8200 out of 25000 | Loss --> 0.566 | Grad_l2 --> 0.150 | Weights_l2 --> 47177.729 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
188 |
+
[2024-08-07 08:21:13,985][Main][INFO] - [train] Step 8250 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.141 | Weights_l2 --> 47177.588 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
189 |
+
[2024-08-07 08:22:39,528][Main][INFO] - [train] Step 8300 out of 25000 | Loss --> 0.554 | Grad_l2 --> 0.158 | Weights_l2 --> 47177.451 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
190 |
+
[2024-08-07 08:24:05,082][Main][INFO] - [train] Step 8350 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.147 | Weights_l2 --> 47177.314 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
191 |
+
[2024-08-07 08:25:33,450][Main][INFO] - [train] Step 8400 out of 25000 | Loss --> 0.539 | Grad_l2 --> 0.141 | Weights_l2 --> 47177.176 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
|
192 |
+
[2024-08-07 08:26:59,375][Main][INFO] - [train] Step 8450 out of 25000 | Loss --> 0.526 | Grad_l2 --> 0.134 | Weights_l2 --> 47177.039 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
|
193 |
+
[2024-08-07 08:28:25,341][Main][INFO] - [train] Step 8500 out of 25000 | Loss --> 0.528 | Grad_l2 --> 0.171 | Weights_l2 --> 47176.902 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
194 |
+
[2024-08-07 08:29:53,919][Main][INFO] - [train] Step 8550 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.157 | Weights_l2 --> 47176.764 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
195 |
+
[2024-08-07 08:31:19,464][Main][INFO] - [train] Step 8600 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.146 | Weights_l2 --> 47176.631 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
196 |
+
[2024-08-07 08:32:47,795][Main][INFO] - [train] Step 8650 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.140 | Weights_l2 --> 47176.494 | Lr --> 0.000 | Seconds_per_step --> 1.767 |
|
197 |
+
[2024-08-07 08:34:13,984][Main][INFO] - [train] Step 8700 out of 25000 | Loss --> 0.553 | Grad_l2 --> 0.144 | Weights_l2 --> 47176.352 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
198 |
+
[2024-08-07 08:35:40,153][Main][INFO] - [train] Step 8750 out of 25000 | Loss --> 0.537 | Grad_l2 --> 0.135 | Weights_l2 --> 47176.215 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
199 |
+
[2024-08-07 08:37:08,880][Main][INFO] - [train] Step 8800 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.147 | Weights_l2 --> 47176.078 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
200 |
+
[2024-08-07 08:38:35,095][Main][INFO] - [train] Step 8850 out of 25000 | Loss --> 0.564 | Grad_l2 --> 0.156 | Weights_l2 --> 47175.940 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
201 |
+
[2024-08-07 08:40:01,329][Main][INFO] - [train] Step 8900 out of 25000 | Loss --> 0.525 | Grad_l2 --> 0.144 | Weights_l2 --> 47175.803 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
202 |
+
[2024-08-07 08:41:29,387][Main][INFO] - [train] Step 8950 out of 25000 | Loss --> 0.544 | Grad_l2 --> 0.150 | Weights_l2 --> 47175.666 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
|
203 |
+
[2024-08-07 08:42:55,260][Main][INFO] - [train] Step 9000 out of 25000 | Loss --> 0.518 | Grad_l2 --> 0.151 | Weights_l2 --> 47175.528 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
204 |
+
[2024-08-07 08:44:21,359][Main][INFO] - [train] Step 9050 out of 25000 | Loss --> 0.562 | Grad_l2 --> 0.176 | Weights_l2 --> 47175.395 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
205 |
+
[2024-08-07 08:45:50,226][Main][INFO] - [train] Step 9100 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.147 | Weights_l2 --> 47175.258 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
|
206 |
+
[2024-08-07 08:47:16,084][Main][INFO] - [train] Step 9150 out of 25000 | Loss --> 0.543 | Grad_l2 --> 0.174 | Weights_l2 --> 47175.117 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
207 |
+
[2024-08-07 08:48:41,636][Main][INFO] - [train] Step 9200 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.173 | Weights_l2 --> 47174.979 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
208 |
+
[2024-08-07 08:50:10,103][Main][INFO] - [train] Step 9250 out of 25000 | Loss --> 0.557 | Grad_l2 --> 0.152 | Weights_l2 --> 47174.838 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
209 |
+
[2024-08-07 08:51:36,133][Main][INFO] - [train] Step 9300 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.155 | Weights_l2 --> 47174.701 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
210 |
+
[2024-08-07 08:53:02,290][Main][INFO] - [train] Step 9350 out of 25000 | Loss --> 0.552 | Grad_l2 --> 0.177 | Weights_l2 --> 47174.567 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
211 |
+
[2024-08-07 08:54:30,686][Main][INFO] - [train] Step 9400 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.147 | Weights_l2 --> 47174.430 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
212 |
+
[2024-08-07 08:55:56,533][Main][INFO] - [train] Step 9450 out of 25000 | Loss --> 0.540 | Grad_l2 --> 0.151 | Weights_l2 --> 47174.293 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
213 |
+
[2024-08-07 08:57:22,391][Main][INFO] - [train] Step 9500 out of 25000 | Loss --> 0.507 | Grad_l2 --> 0.169 | Weights_l2 --> 47174.155 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
214 |
+
[2024-08-07 08:58:50,627][Main][INFO] - [train] Step 9550 out of 25000 | Loss --> 0.534 | Grad_l2 --> 0.142 | Weights_l2 --> 47174.018 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
215 |
+
[2024-08-07 09:00:16,767][Main][INFO] - [train] Step 9600 out of 25000 | Loss --> 0.519 | Grad_l2 --> 0.218 | Weights_l2 --> 47173.881 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
216 |
+
[2024-08-07 09:01:48,187][Main][INFO] - [train] Step 9650 out of 25000 | Loss --> 0.542 | Grad_l2 --> 0.153 | Weights_l2 --> 47173.743 | Lr --> 0.000 | Seconds_per_step --> 1.828 |
|
217 |
+
[2024-08-07 09:03:13,777][Main][INFO] - [train] Step 9700 out of 25000 | Loss --> 0.536 | Grad_l2 --> 0.238 | Weights_l2 --> 47173.606 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
218 |
+
[2024-08-07 09:04:39,235][Main][INFO] - [train] Step 9750 out of 25000 | Loss --> 0.546 | Grad_l2 --> 0.151 | Weights_l2 --> 47173.469 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
219 |
+
[2024-08-07 09:06:07,645][Main][INFO] - [train] Step 9800 out of 25000 | Loss --> 0.550 | Grad_l2 --> 0.154 | Weights_l2 --> 47173.332 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
220 |
+
[2024-08-07 09:07:33,777][Main][INFO] - [train] Step 9850 out of 25000 | Loss --> 0.538 | Grad_l2 --> 0.146 | Weights_l2 --> 47173.194 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
221 |
+
[2024-08-07 09:08:59,343][Main][INFO] - [train] Step 9900 out of 25000 | Loss --> 0.561 | Grad_l2 --> 0.156 | Weights_l2 --> 47173.057 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
222 |
+
[2024-08-07 09:10:27,785][Main][INFO] - [train] Step 9950 out of 25000 | Loss --> 0.543 | Grad_l2 --> 0.143 | Weights_l2 --> 47172.920 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
223 |
+
[2024-08-07 09:11:53,857][Main][INFO] - [train] Step 10000 out of 25000 | Loss --> 0.547 | Grad_l2 --> 0.139 | Weights_l2 --> 47172.782 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
224 |
+
[2024-08-07 09:11:53,857][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-10000
|
225 |
+
[2024-08-07 09:11:53,863][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
226 |
+
[2024-08-07 09:11:54,664][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-10000/model.safetensors
|
227 |
+
[2024-08-07 09:11:55,798][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-10000/optimizer.bin
|
228 |
+
[2024-08-07 09:11:55,798][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-10000/scheduler.bin
|
229 |
+
[2024-08-07 09:11:55,799][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-10000/sampler.bin
|
230 |
+
[2024-08-07 09:11:55,799][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-10000/sampler_1.bin
|
231 |
+
[2024-08-07 09:11:55,800][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-10000/random_states_0.pkl
|
232 |
+
[2024-08-07 09:13:22,152][Main][INFO] - [train] Step 10050 out of 25000 | Loss --> 0.530 | Grad_l2 --> 0.143 | Weights_l2 --> 47172.645 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
233 |
+
[2024-08-07 09:14:50,831][Main][INFO] - [train] Step 10100 out of 25000 | Loss --> 0.532 | Grad_l2 --> 0.517 | Weights_l2 --> 47172.508 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
234 |
+
[2024-08-07 09:16:16,529][Main][INFO] - [train] Step 10150 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.154 | Weights_l2 --> 47172.370 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
235 |
+
[2024-08-07 09:17:42,010][Main][INFO] - [train] Step 10200 out of 25000 | Loss --> 0.558 | Grad_l2 --> 0.145 | Weights_l2 --> 47172.233 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
236 |
+
[2024-08-07 09:19:10,193][Main][INFO] - [train] Step 10250 out of 25000 | Loss --> 0.556 | Grad_l2 --> 0.148 | Weights_l2 --> 47172.096 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
237 |
+
[2024-08-07 09:20:35,887][Main][INFO] - [train] Step 10300 out of 25000 | Loss --> 0.578 | Grad_l2 --> 0.179 | Weights_l2 --> 47171.958 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
238 |
+
[2024-08-07 09:22:01,651][Main][INFO] - [train] Step 10350 out of 25000 | Loss --> 0.568 | Grad_l2 --> 0.149 | Weights_l2 --> 47171.821 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
239 |
+
[2024-08-07 09:23:30,262][Main][INFO] - [train] Step 10400 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.149 | Weights_l2 --> 47171.684 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
240 |
+
[2024-08-07 09:24:55,846][Main][INFO] - [train] Step 10450 out of 25000 | Loss --> 0.577 | Grad_l2 --> 0.197 | Weights_l2 --> 47171.546 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
241 |
+
[2024-08-07 09:26:21,432][Main][INFO] - [train] Step 10500 out of 25000 | Loss --> 0.595 | Grad_l2 --> 0.148 | Weights_l2 --> 47171.409 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
242 |
+
[2024-08-07 09:27:49,918][Main][INFO] - [train] Step 10550 out of 25000 | Loss --> 0.599 | Grad_l2 --> 0.181 | Weights_l2 --> 47171.272 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
|
243 |
+
[2024-08-07 09:29:16,097][Main][INFO] - [train] Step 10600 out of 25000 | Loss --> 0.578 | Grad_l2 --> 0.148 | Weights_l2 --> 47171.135 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
244 |
+
[2024-08-07 09:30:42,100][Main][INFO] - [train] Step 10650 out of 25000 | Loss --> 0.608 | Grad_l2 --> 0.156 | Weights_l2 --> 47170.997 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
245 |
+
[2024-08-07 09:32:10,527][Main][INFO] - [train] Step 10700 out of 25000 | Loss --> 0.611 | Grad_l2 --> 0.181 | Weights_l2 --> 47170.860 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
246 |
+
[2024-08-07 09:33:36,488][Main][INFO] - [train] Step 10750 out of 25000 | Loss --> 0.615 | Grad_l2 --> 0.146 | Weights_l2 --> 47170.723 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
247 |
+
[2024-08-07 09:35:02,437][Main][INFO] - [train] Step 10800 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.151 | Weights_l2 --> 47170.585 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
248 |
+
[2024-08-07 09:36:31,350][Main][INFO] - [train] Step 10850 out of 25000 | Loss --> 0.606 | Grad_l2 --> 0.165 | Weights_l2 --> 47170.448 | Lr --> 0.000 | Seconds_per_step --> 1.778 |
|
249 |
+
[2024-08-07 09:37:57,470][Main][INFO] - [train] Step 10900 out of 25000 | Loss --> 0.575 | Grad_l2 --> 0.139 | Weights_l2 --> 47170.311 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
250 |
+
[2024-08-07 09:39:23,644][Main][INFO] - [train] Step 10950 out of 25000 | Loss --> 0.609 | Grad_l2 --> 0.145 | Weights_l2 --> 47170.173 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
251 |
+
[2024-08-07 09:40:52,251][Main][INFO] - [train] Step 11000 out of 25000 | Loss --> 0.585 | Grad_l2 --> 0.151 | Weights_l2 --> 47170.036 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
252 |
+
[2024-08-07 09:42:18,278][Main][INFO] - [train] Step 11050 out of 25000 | Loss --> 0.627 | Grad_l2 --> 0.149 | Weights_l2 --> 47169.899 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
253 |
+
[2024-08-07 09:43:44,271][Main][INFO] - [train] Step 11100 out of 25000 | Loss --> 0.624 | Grad_l2 --> 0.149 | Weights_l2 --> 47169.762 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
254 |
+
[2024-08-07 09:45:12,962][Main][INFO] - [train] Step 11150 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.165 | Weights_l2 --> 47169.620 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
255 |
+
[2024-08-07 09:46:38,824][Main][INFO] - [train] Step 11200 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.160 | Weights_l2 --> 47169.483 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
256 |
+
[2024-08-07 09:48:06,984][Main][INFO] - [train] Step 11250 out of 25000 | Loss --> 0.625 | Grad_l2 --> 0.161 | Weights_l2 --> 47169.350 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
|
257 |
+
[2024-08-07 09:49:32,458][Main][INFO] - [train] Step 11300 out of 25000 | Loss --> 0.629 | Grad_l2 --> 0.152 | Weights_l2 --> 47169.212 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
258 |
+
[2024-08-07 09:50:58,382][Main][INFO] - [train] Step 11350 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.150 | Weights_l2 --> 47169.075 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
|
259 |
+
[2024-08-07 09:52:27,146][Main][INFO] - [train] Step 11400 out of 25000 | Loss --> 0.619 | Grad_l2 --> 0.149 | Weights_l2 --> 47168.938 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
260 |
+
[2024-08-07 09:53:52,893][Main][INFO] - [train] Step 11450 out of 25000 | Loss --> 0.640 | Grad_l2 --> 0.152 | Weights_l2 --> 47168.800 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
261 |
+
[2024-08-07 09:55:18,641][Main][INFO] - [train] Step 11500 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.188 | Weights_l2 --> 47168.663 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
262 |
+
[2024-08-07 09:56:47,125][Main][INFO] - [train] Step 11550 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.188 | Weights_l2 --> 47168.526 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
|
263 |
+
[2024-08-07 09:58:13,217][Main][INFO] - [train] Step 11600 out of 25000 | Loss --> 0.655 | Grad_l2 --> 0.161 | Weights_l2 --> 47168.392 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
264 |
+
[2024-08-07 09:59:38,994][Main][INFO] - [train] Step 11650 out of 25000 | Loss --> 0.613 | Grad_l2 --> 0.153 | Weights_l2 --> 47168.251 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
265 |
+
[2024-08-07 10:01:07,252][Main][INFO] - [train] Step 11700 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.158 | Weights_l2 --> 47168.114 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
266 |
+
[2024-08-07 10:02:32,915][Main][INFO] - [train] Step 11750 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.147 | Weights_l2 --> 47167.977 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
267 |
+
[2024-08-07 10:03:59,142][Main][INFO] - [train] Step 11800 out of 25000 | Loss --> 0.638 | Grad_l2 --> 0.192 | Weights_l2 --> 47167.839 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
268 |
+
[2024-08-07 10:05:27,859][Main][INFO] - [train] Step 11850 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.149 | Weights_l2 --> 47167.702 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
269 |
+
[2024-08-07 10:06:53,666][Main][INFO] - [train] Step 11900 out of 25000 | Loss --> 0.632 | Grad_l2 --> 0.145 | Weights_l2 --> 47167.565 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
270 |
+
[2024-08-07 10:08:19,233][Main][INFO] - [train] Step 11950 out of 25000 | Loss --> 0.628 | Grad_l2 --> 0.165 | Weights_l2 --> 47167.431 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
271 |
+
[2024-08-07 10:09:48,031][Main][INFO] - [train] Step 12000 out of 25000 | Loss --> 0.627 | Grad_l2 --> 0.148 | Weights_l2 --> 47167.294 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
272 |
+
[2024-08-07 10:09:52,932][Main][INFO] - [eval] Step 12000 out of 25000 | Loss --> 0.851 | Accuracy --> 0.842 | Time --> 4.898 |
|
273 |
+
[2024-08-07 10:14:26,659][absl][INFO] - Using default tokenizer.
|
274 |
+
[2024-08-07 10:14:27,253][Main][INFO] - [test] Step 12000 out of 25000 | Rougel --> 22.493 | Time --> 274.321 |
|
275 |
+
[2024-08-07 10:15:53,027][Main][INFO] - [train] Step 12050 out of 25000 | Loss --> 0.650 | Grad_l2 --> 0.157 | Weights_l2 --> 47167.157 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
276 |
+
[2024-08-07 10:17:18,994][Main][INFO] - [train] Step 12100 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.173 | Weights_l2 --> 47167.020 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
277 |
+
[2024-08-07 10:18:47,538][Main][INFO] - [train] Step 12150 out of 25000 | Loss --> 0.645 | Grad_l2 --> 0.162 | Weights_l2 --> 47166.878 | Lr --> 0.000 | Seconds_per_step --> 1.771 |
|
278 |
+
[2024-08-07 10:20:13,740][Main][INFO] - [train] Step 12200 out of 25000 | Loss --> 0.655 | Grad_l2 --> 0.185 | Weights_l2 --> 47166.741 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
279 |
+
[2024-08-07 10:21:39,599][Main][INFO] - [train] Step 12250 out of 25000 | Loss --> 0.659 | Grad_l2 --> 0.154 | Weights_l2 --> 47166.604 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
280 |
+
[2024-08-07 10:23:08,120][Main][INFO] - [train] Step 12300 out of 25000 | Loss --> 0.639 | Grad_l2 --> 0.156 | Weights_l2 --> 47166.466 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
|
281 |
+
[2024-08-07 10:24:33,959][Main][INFO] - [train] Step 12350 out of 25000 | Loss --> 0.622 | Grad_l2 --> 0.145 | Weights_l2 --> 47166.329 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
282 |
+
[2024-08-07 10:26:00,262][Main][INFO] - [train] Step 12400 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.155 | Weights_l2 --> 47166.192 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
283 |
+
[2024-08-07 10:27:28,963][Main][INFO] - [train] Step 12450 out of 25000 | Loss --> 0.633 | Grad_l2 --> 0.160 | Weights_l2 --> 47166.058 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
284 |
+
[2024-08-07 10:28:55,101][Main][INFO] - [train] Step 12500 out of 25000 | Loss --> 0.626 | Grad_l2 --> 0.142 | Weights_l2 --> 47165.921 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
285 |
+
[2024-08-07 10:30:20,881][Main][INFO] - [train] Step 12550 out of 25000 | Loss --> 0.618 | Grad_l2 --> 0.147 | Weights_l2 --> 47165.784 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
286 |
+
[2024-08-07 10:31:49,106][Main][INFO] - [train] Step 12600 out of 25000 | Loss --> 0.658 | Grad_l2 --> 0.189 | Weights_l2 --> 47165.647 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
287 |
+
[2024-08-07 10:33:15,133][Main][INFO] - [train] Step 12650 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.150 | Weights_l2 --> 47165.509 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
288 |
+
[2024-08-07 10:34:44,001][Main][INFO] - [train] Step 12700 out of 25000 | Loss --> 0.631 | Grad_l2 --> 0.153 | Weights_l2 --> 47165.368 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
|
289 |
+
[2024-08-07 10:36:10,488][Main][INFO] - [train] Step 12750 out of 25000 | Loss --> 0.616 | Grad_l2 --> 0.151 | Weights_l2 --> 47165.231 | Lr --> 0.000 | Seconds_per_step --> 1.730 |
|
290 |
+
[2024-08-07 10:37:45,117][Main][INFO] - [train] Step 12800 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.154 | Weights_l2 --> 47165.097 | Lr --> 0.000 | Seconds_per_step --> 1.893 |
|
291 |
+
[2024-08-07 10:39:14,106][Main][INFO] - [train] Step 12850 out of 25000 | Loss --> 0.648 | Grad_l2 --> 0.147 | Weights_l2 --> 47164.960 | Lr --> 0.000 | Seconds_per_step --> 1.780 |
|
292 |
+
[2024-08-07 10:40:39,804][Main][INFO] - [train] Step 12900 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.144 | Weights_l2 --> 47164.823 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
293 |
+
[2024-08-07 10:42:05,250][Main][INFO] - [train] Step 12950 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.165 | Weights_l2 --> 47164.685 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
294 |
+
[2024-08-07 10:43:33,303][Main][INFO] - [train] Step 13000 out of 25000 | Loss --> 0.659 | Grad_l2 --> 0.148 | Weights_l2 --> 47164.548 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
|
295 |
+
[2024-08-07 10:44:59,507][Main][INFO] - [train] Step 13050 out of 25000 | Loss --> 0.657 | Grad_l2 --> 0.224 | Weights_l2 --> 47164.411 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
296 |
+
[2024-08-07 10:46:25,581][Main][INFO] - [train] Step 13100 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.163 | Weights_l2 --> 47164.273 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
297 |
+
[2024-08-07 10:47:53,904][Main][INFO] - [train] Step 13150 out of 25000 | Loss --> 0.671 | Grad_l2 --> 0.175 | Weights_l2 --> 47164.136 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
298 |
+
[2024-08-07 10:49:19,864][Main][INFO] - [train] Step 13200 out of 25000 | Loss --> 0.666 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.999 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
299 |
+
[2024-08-07 10:50:45,992][Main][INFO] - [train] Step 13250 out of 25000 | Loss --> 0.681 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.862 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
300 |
+
[2024-08-07 10:52:15,654][Main][INFO] - [train] Step 13300 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.167 | Weights_l2 --> 47163.728 | Lr --> 0.000 | Seconds_per_step --> 1.793 |
|
301 |
+
[2024-08-07 10:53:41,694][Main][INFO] - [train] Step 13350 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.154 | Weights_l2 --> 47163.587 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
302 |
+
[2024-08-07 10:55:07,494][Main][INFO] - [train] Step 13400 out of 25000 | Loss --> 0.683 | Grad_l2 --> 0.161 | Weights_l2 --> 47163.450 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
303 |
+
[2024-08-07 10:56:35,968][Main][INFO] - [train] Step 13450 out of 25000 | Loss --> 0.666 | Grad_l2 --> 0.168 | Weights_l2 --> 47163.312 | Lr --> 0.000 | Seconds_per_step --> 1.769 |
|
304 |
+
[2024-08-07 10:58:01,621][Main][INFO] - [train] Step 13500 out of 25000 | Loss --> 0.650 | Grad_l2 --> 0.158 | Weights_l2 --> 47163.175 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
305 |
+
[2024-08-07 10:59:27,354][Main][INFO] - [train] Step 13550 out of 25000 | Loss --> 0.691 | Grad_l2 --> 0.171 | Weights_l2 --> 47163.038 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
306 |
+
[2024-08-07 11:00:55,576][Main][INFO] - [train] Step 13600 out of 25000 | Loss --> 0.678 | Grad_l2 --> 0.177 | Weights_l2 --> 47162.901 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
307 |
+
[2024-08-07 11:02:21,441][Main][INFO] - [train] Step 13650 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.170 | Weights_l2 --> 47162.763 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
308 |
+
[2024-08-07 11:03:47,588][Main][INFO] - [train] Step 13700 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.170 | Weights_l2 --> 47162.630 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
309 |
+
[2024-08-07 11:05:16,219][Main][INFO] - [train] Step 13750 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.163 | Weights_l2 --> 47162.493 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
310 |
+
[2024-08-07 11:06:41,706][Main][INFO] - [train] Step 13800 out of 25000 | Loss --> 0.715 | Grad_l2 --> 0.163 | Weights_l2 --> 47162.352 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
311 |
+
[2024-08-07 11:08:07,195][Main][INFO] - [train] Step 13850 out of 25000 | Loss --> 0.694 | Grad_l2 --> 0.164 | Weights_l2 --> 47162.218 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
312 |
+
[2024-08-07 11:09:35,388][Main][INFO] - [train] Step 13900 out of 25000 | Loss --> 0.680 | Grad_l2 --> 0.152 | Weights_l2 --> 47162.081 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
313 |
+
[2024-08-07 11:11:00,908][Main][INFO] - [train] Step 13950 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.168 | Weights_l2 --> 47161.944 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
314 |
+
[2024-08-07 11:12:26,729][Main][INFO] - [train] Step 14000 out of 25000 | Loss --> 0.711 | Grad_l2 --> 0.157 | Weights_l2 --> 47161.806 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
315 |
+
[2024-08-07 11:13:55,581][Main][INFO] - [train] Step 14050 out of 25000 | Loss --> 0.682 | Grad_l2 --> 0.166 | Weights_l2 --> 47161.669 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
|
316 |
+
[2024-08-07 11:15:21,781][Main][INFO] - [train] Step 14100 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.158 | Weights_l2 --> 47161.532 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
317 |
+
[2024-08-07 11:16:47,977][Main][INFO] - [train] Step 14150 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.160 | Weights_l2 --> 47161.394 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
318 |
+
[2024-08-07 11:18:16,612][Main][INFO] - [train] Step 14200 out of 25000 | Loss --> 0.728 | Grad_l2 --> 0.168 | Weights_l2 --> 47161.257 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
319 |
+
[2024-08-07 11:19:42,684][Main][INFO] - [train] Step 14250 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.160 | Weights_l2 --> 47161.120 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
320 |
+
[2024-08-07 11:21:08,147][Main][INFO] - [train] Step 14300 out of 25000 | Loss --> 0.717 | Grad_l2 --> 0.156 | Weights_l2 --> 47160.982 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
321 |
+
[2024-08-07 11:22:36,306][Main][INFO] - [train] Step 14350 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.157 | Weights_l2 --> 47160.845 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
|
322 |
+
[2024-08-07 11:24:01,811][Main][INFO] - [train] Step 14400 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.164 | Weights_l2 --> 47160.708 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
323 |
+
[2024-08-07 11:25:27,346][Main][INFO] - [train] Step 14450 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.160 | Weights_l2 --> 47160.570 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
324 |
+
[2024-08-07 11:26:55,597][Main][INFO] - [train] Step 14500 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.159 | Weights_l2 --> 47160.433 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
325 |
+
[2024-08-07 11:28:21,073][Main][INFO] - [train] Step 14550 out of 25000 | Loss --> 0.713 | Grad_l2 --> 0.159 | Weights_l2 --> 47160.296 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
326 |
+
[2024-08-07 11:29:46,549][Main][INFO] - [train] Step 14600 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.170 | Weights_l2 --> 47160.155 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
327 |
+
[2024-08-07 11:31:14,793][Main][INFO] - [train] Step 14650 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.156 | Weights_l2 --> 47160.021 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
328 |
+
[2024-08-07 11:32:40,288][Main][INFO] - [train] Step 14700 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.154 | Weights_l2 --> 47159.884 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
329 |
+
[2024-08-07 11:34:05,850][Main][INFO] - [train] Step 14750 out of 25000 | Loss --> 0.689 | Grad_l2 --> 0.147 | Weights_l2 --> 47159.747 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
330 |
+
[2024-08-07 11:35:34,608][Main][INFO] - [train] Step 14800 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.170 | Weights_l2 --> 47159.609 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
331 |
+
[2024-08-07 11:37:00,181][Main][INFO] - [train] Step 14850 out of 25000 | Loss --> 0.722 | Grad_l2 --> 0.165 | Weights_l2 --> 47159.472 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
332 |
+
[2024-08-07 11:38:25,726][Main][INFO] - [train] Step 14900 out of 25000 | Loss --> 0.696 | Grad_l2 --> 0.174 | Weights_l2 --> 47159.335 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
333 |
+
[2024-08-07 11:39:53,934][Main][INFO] - [train] Step 14950 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.180 | Weights_l2 --> 47159.198 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
334 |
+
[2024-08-07 11:41:19,559][Main][INFO] - [train] Step 15000 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.173 | Weights_l2 --> 47159.060 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
335 |
+
[2024-08-07 11:41:19,560][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-15000
|
336 |
+
[2024-08-07 11:41:19,566][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
337 |
+
[2024-08-07 11:41:20,382][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-15000/model.safetensors
|
338 |
+
[2024-08-07 11:41:21,526][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-15000/optimizer.bin
|
339 |
+
[2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-15000/scheduler.bin
|
340 |
+
[2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-15000/sampler.bin
|
341 |
+
[2024-08-07 11:41:21,527][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-15000/sampler_1.bin
|
342 |
+
[2024-08-07 11:41:21,528][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-15000/random_states_0.pkl
|
343 |
+
[2024-08-07 11:42:47,021][Main][INFO] - [train] Step 15050 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.173 | Weights_l2 --> 47158.927 | Lr --> 0.000 | Seconds_per_step --> 1.749 |
|
344 |
+
[2024-08-07 11:44:15,207][Main][INFO] - [train] Step 15100 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.153 | Weights_l2 --> 47158.786 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
345 |
+
[2024-08-07 11:45:41,007][Main][INFO] - [train] Step 15150 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.166 | Weights_l2 --> 47158.648 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
346 |
+
[2024-08-07 11:47:07,191][Main][INFO] - [train] Step 15200 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.170 | Weights_l2 --> 47158.515 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
347 |
+
[2024-08-07 11:48:35,506][Main][INFO] - [train] Step 15250 out of 25000 | Loss --> 0.715 | Grad_l2 --> 0.158 | Weights_l2 --> 47158.378 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
348 |
+
[2024-08-07 11:50:01,669][Main][INFO] - [train] Step 15300 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.166 | Weights_l2 --> 47158.237 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
349 |
+
[2024-08-07 11:51:27,955][Main][INFO] - [train] Step 15350 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.172 | Weights_l2 --> 47158.099 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
350 |
+
[2024-08-07 11:52:56,776][Main][INFO] - [train] Step 15400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.151 | Weights_l2 --> 47157.962 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
351 |
+
[2024-08-07 11:54:22,548][Main][INFO] - [train] Step 15450 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.157 | Weights_l2 --> 47157.825 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
352 |
+
[2024-08-07 11:55:48,808][Main][INFO] - [train] Step 15500 out of 25000 | Loss --> 0.766 | Grad_l2 --> 0.171 | Weights_l2 --> 47157.687 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
353 |
+
[2024-08-07 11:57:17,585][Main][INFO] - [train] Step 15550 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.169 | Weights_l2 --> 47157.554 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
354 |
+
[2024-08-07 11:58:43,740][Main][INFO] - [train] Step 15600 out of 25000 | Loss --> 0.714 | Grad_l2 --> 0.155 | Weights_l2 --> 47157.417 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
355 |
+
[2024-08-07 12:00:12,386][Main][INFO] - [train] Step 15650 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.169 | Weights_l2 --> 47157.279 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
356 |
+
[2024-08-07 12:01:38,506][Main][INFO] - [train] Step 15700 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.223 | Weights_l2 --> 47157.142 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
357 |
+
[2024-08-07 12:03:04,291][Main][INFO] - [train] Step 15750 out of 25000 | Loss --> 0.718 | Grad_l2 --> 0.161 | Weights_l2 --> 47157.005 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
358 |
+
[2024-08-07 12:04:32,531][Main][INFO] - [train] Step 15800 out of 25000 | Loss --> 0.743 | Grad_l2 --> 0.177 | Weights_l2 --> 47156.871 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
359 |
+
[2024-08-07 12:05:58,031][Main][INFO] - [train] Step 15850 out of 25000 | Loss --> 0.754 | Grad_l2 --> 0.167 | Weights_l2 --> 47156.730 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
360 |
+
[2024-08-07 12:07:23,569][Main][INFO] - [train] Step 15900 out of 25000 | Loss --> 0.730 | Grad_l2 --> 0.200 | Weights_l2 --> 47156.593 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
361 |
+
[2024-08-07 12:08:51,789][Main][INFO] - [train] Step 15950 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.196 | Weights_l2 --> 47156.456 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
362 |
+
[2024-08-07 12:10:17,593][Main][INFO] - [train] Step 16000 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.158 | Weights_l2 --> 47156.318 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
363 |
+
[2024-08-07 12:10:22,535][Main][INFO] - [eval] Step 16000 out of 25000 | Loss --> 0.830 | Accuracy --> 0.846 | Time --> 4.939 |
|
364 |
+
[2024-08-07 12:14:42,758][absl][INFO] - Using default tokenizer.
|
365 |
+
[2024-08-07 12:14:43,321][Main][INFO] - [test] Step 16000 out of 25000 | Rougel --> 24.234 | Time --> 260.785 |
|
366 |
+
[2024-08-07 12:16:09,102][Main][INFO] - [train] Step 16050 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.153 | Weights_l2 --> 47156.181 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
367 |
+
[2024-08-07 12:17:38,179][Main][INFO] - [train] Step 16100 out of 25000 | Loss --> 0.746 | Grad_l2 --> 0.173 | Weights_l2 --> 47156.044 | Lr --> 0.000 | Seconds_per_step --> 1.782 |
|
368 |
+
[2024-08-07 12:19:04,016][Main][INFO] - [train] Step 16150 out of 25000 | Loss --> 0.746 | Grad_l2 --> 0.207 | Weights_l2 --> 47155.906 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
369 |
+
[2024-08-07 12:20:29,950][Main][INFO] - [train] Step 16200 out of 25000 | Loss --> 0.749 | Grad_l2 --> 0.155 | Weights_l2 --> 47155.769 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
370 |
+
[2024-08-07 12:21:58,460][Main][INFO] - [train] Step 16250 out of 25000 | Loss --> 0.730 | Grad_l2 --> 0.171 | Weights_l2 --> 47155.632 | Lr --> 0.000 | Seconds_per_step --> 1.770 |
|
371 |
+
[2024-08-07 12:23:24,644][Main][INFO] - [train] Step 16300 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.172 | Weights_l2 --> 47155.495 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
372 |
+
[2024-08-07 12:24:50,811][Main][INFO] - [train] Step 16350 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.159 | Weights_l2 --> 47155.357 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
373 |
+
[2024-08-07 12:26:18,994][Main][INFO] - [train] Step 16400 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.155 | Weights_l2 --> 47155.224 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
374 |
+
[2024-08-07 12:27:44,559][Main][INFO] - [train] Step 16450 out of 25000 | Loss --> 0.721 | Grad_l2 --> 0.169 | Weights_l2 --> 47155.086 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
375 |
+
[2024-08-07 12:29:10,639][Main][INFO] - [train] Step 16500 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.174 | Weights_l2 --> 47154.949 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
376 |
+
[2024-08-07 12:30:38,659][Main][INFO] - [train] Step 16550 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.174 | Weights_l2 --> 47154.812 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
|
377 |
+
[2024-08-07 12:32:04,163][Main][INFO] - [train] Step 16600 out of 25000 | Loss --> 0.718 | Grad_l2 --> 0.152 | Weights_l2 --> 47154.675 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
378 |
+
[2024-08-07 12:33:29,664][Main][INFO] - [train] Step 16650 out of 25000 | Loss --> 0.734 | Grad_l2 --> 0.196 | Weights_l2 --> 47154.537 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
379 |
+
[2024-08-07 12:34:57,619][Main][INFO] - [train] Step 16700 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.158 | Weights_l2 --> 47154.400 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
|
380 |
+
[2024-08-07 12:36:23,705][Main][INFO] - [train] Step 16750 out of 25000 | Loss --> 0.754 | Grad_l2 --> 0.228 | Weights_l2 --> 47154.263 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
381 |
+
[2024-08-07 12:37:51,699][Main][INFO] - [train] Step 16800 out of 25000 | Loss --> 0.744 | Grad_l2 --> 0.185 | Weights_l2 --> 47154.126 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
|
382 |
+
[2024-08-07 12:39:17,171][Main][INFO] - [train] Step 16850 out of 25000 | Loss --> 0.742 | Grad_l2 --> 0.172 | Weights_l2 --> 47153.988 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
383 |
+
[2024-08-07 12:40:43,000][Main][INFO] - [train] Step 16900 out of 25000 | Loss --> 0.736 | Grad_l2 --> 0.216 | Weights_l2 --> 47153.851 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
384 |
+
[2024-08-07 12:42:11,725][Main][INFO] - [train] Step 16950 out of 25000 | Loss --> 0.737 | Grad_l2 --> 0.212 | Weights_l2 --> 47153.714 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
385 |
+
[2024-08-07 12:43:38,101][Main][INFO] - [train] Step 17000 out of 25000 | Loss --> 0.737 | Grad_l2 --> 0.188 | Weights_l2 --> 47153.576 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
|
386 |
+
[2024-08-07 12:45:04,080][Main][INFO] - [train] Step 17050 out of 25000 | Loss --> 0.753 | Grad_l2 --> 0.337 | Weights_l2 --> 47153.439 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
387 |
+
[2024-08-07 12:46:32,474][Main][INFO] - [train] Step 17100 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.165 | Weights_l2 --> 47153.302 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
388 |
+
[2024-08-07 12:47:58,632][Main][INFO] - [train] Step 17150 out of 25000 | Loss --> 0.735 | Grad_l2 --> 0.162 | Weights_l2 --> 47153.165 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
389 |
+
[2024-08-07 12:49:25,027][Main][INFO] - [train] Step 17200 out of 25000 | Loss --> 0.742 | Grad_l2 --> 0.167 | Weights_l2 --> 47153.031 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
|
390 |
+
[2024-08-07 12:50:53,829][Main][INFO] - [train] Step 17250 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.166 | Weights_l2 --> 47152.894 | Lr --> 0.000 | Seconds_per_step --> 1.776 |
|
391 |
+
[2024-08-07 12:52:19,460][Main][INFO] - [train] Step 17300 out of 25000 | Loss --> 0.710 | Grad_l2 --> 0.151 | Weights_l2 --> 47152.757 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
392 |
+
[2024-08-07 12:53:45,284][Main][INFO] - [train] Step 17350 out of 25000 | Loss --> 0.748 | Grad_l2 --> 0.155 | Weights_l2 --> 47152.619 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
393 |
+
[2024-08-07 12:55:14,159][Main][INFO] - [train] Step 17400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.151 | Weights_l2 --> 47152.482 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
|
394 |
+
[2024-08-07 12:56:40,014][Main][INFO] - [train] Step 17450 out of 25000 | Loss --> 0.739 | Grad_l2 --> 0.181 | Weights_l2 --> 47152.345 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
395 |
+
[2024-08-07 12:58:05,500][Main][INFO] - [train] Step 17500 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.196 | Weights_l2 --> 47152.211 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
396 |
+
[2024-08-07 12:59:33,742][Main][INFO] - [train] Step 17550 out of 25000 | Loss --> 0.705 | Grad_l2 --> 0.174 | Weights_l2 --> 47152.070 | Lr --> 0.000 | Seconds_per_step --> 1.765 |
|
397 |
+
[2024-08-07 13:00:59,255][Main][INFO] - [train] Step 17600 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.167 | Weights_l2 --> 47151.933 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
398 |
+
[2024-08-07 13:02:24,974][Main][INFO] - [train] Step 17650 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.185 | Weights_l2 --> 47151.796 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
399 |
+
[2024-08-07 13:03:53,834][Main][INFO] - [train] Step 17700 out of 25000 | Loss --> 0.753 | Grad_l2 --> 0.169 | Weights_l2 --> 47151.658 | Lr --> 0.000 | Seconds_per_step --> 1.777 |
|
400 |
+
[2024-08-07 13:05:19,816][Main][INFO] - [train] Step 17750 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.179 | Weights_l2 --> 47151.521 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
401 |
+
[2024-08-07 13:06:48,589][Main][INFO] - [train] Step 17800 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.169 | Weights_l2 --> 47151.384 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
402 |
+
[2024-08-07 13:08:14,874][Main][INFO] - [train] Step 17850 out of 25000 | Loss --> 0.728 | Grad_l2 --> 0.185 | Weights_l2 --> 47151.246 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
403 |
+
[2024-08-07 13:09:40,853][Main][INFO] - [train] Step 17900 out of 25000 | Loss --> 0.706 | Grad_l2 --> 0.161 | Weights_l2 --> 47151.109 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
404 |
+
[2024-08-07 13:11:08,861][Main][INFO] - [train] Step 17950 out of 25000 | Loss --> 0.751 | Grad_l2 --> 0.161 | Weights_l2 --> 47150.972 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
|
405 |
+
[2024-08-07 13:12:34,980][Main][INFO] - [train] Step 18000 out of 25000 | Loss --> 0.745 | Grad_l2 --> 0.257 | Weights_l2 --> 47150.835 | Lr --> 0.000 | Seconds_per_step --> 1.722 |
|
406 |
+
[2024-08-07 13:14:01,362][Main][INFO] - [train] Step 18050 out of 25000 | Loss --> 0.760 | Grad_l2 --> 0.166 | Weights_l2 --> 47150.701 | Lr --> 0.000 | Seconds_per_step --> 1.728 |
|
407 |
+
[2024-08-07 13:15:29,325][Main][INFO] - [train] Step 18100 out of 25000 | Loss --> 0.724 | Grad_l2 --> 0.159 | Weights_l2 --> 47150.564 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
|
408 |
+
[2024-08-07 13:16:54,802][Main][INFO] - [train] Step 18150 out of 25000 | Loss --> 0.733 | Grad_l2 --> 0.163 | Weights_l2 --> 47150.427 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
409 |
+
[2024-08-07 13:18:20,272][Main][INFO] - [train] Step 18200 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.157 | Weights_l2 --> 47150.289 | Lr --> 0.000 | Seconds_per_step --> 1.709 |
|
410 |
+
[2024-08-07 13:19:48,874][Main][INFO] - [train] Step 18250 out of 25000 | Loss --> 0.738 | Grad_l2 --> 0.186 | Weights_l2 --> 47150.152 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
411 |
+
[2024-08-07 13:21:14,606][Main][INFO] - [train] Step 18300 out of 25000 | Loss --> 0.750 | Grad_l2 --> 0.155 | Weights_l2 --> 47150.015 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
412 |
+
[2024-08-07 13:22:40,318][Main][INFO] - [train] Step 18350 out of 25000 | Loss --> 0.732 | Grad_l2 --> 0.151 | Weights_l2 --> 47149.878 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
413 |
+
[2024-08-07 13:24:08,741][Main][INFO] - [train] Step 18400 out of 25000 | Loss --> 0.716 | Grad_l2 --> 0.172 | Weights_l2 --> 47149.740 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
414 |
+
[2024-08-07 13:25:34,410][Main][INFO] - [train] Step 18450 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.164 | Weights_l2 --> 47149.603 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
415 |
+
[2024-08-07 13:27:00,745][Main][INFO] - [train] Step 18500 out of 25000 | Loss --> 0.731 | Grad_l2 --> 0.156 | Weights_l2 --> 47149.466 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
|
416 |
+
[2024-08-07 13:28:29,717][Main][INFO] - [train] Step 18550 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.153 | Weights_l2 --> 47149.328 | Lr --> 0.000 | Seconds_per_step --> 1.779 |
|
417 |
+
[2024-08-07 13:29:55,948][Main][INFO] - [train] Step 18600 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.167 | Weights_l2 --> 47149.191 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
418 |
+
[2024-08-07 13:31:22,096][Main][INFO] - [train] Step 18650 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.162 | Weights_l2 --> 47149.054 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
419 |
+
[2024-08-07 13:32:50,690][Main][INFO] - [train] Step 18700 out of 25000 | Loss --> 0.712 | Grad_l2 --> 0.182 | Weights_l2 --> 47148.916 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
420 |
+
[2024-08-07 13:34:16,757][Main][INFO] - [train] Step 18750 out of 25000 | Loss --> 0.736 | Grad_l2 --> 0.196 | Weights_l2 --> 47148.779 | Lr --> 0.000 | Seconds_per_step --> 1.721 |
|
421 |
+
[2024-08-07 13:35:42,343][Main][INFO] - [train] Step 18800 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.199 | Weights_l2 --> 47148.642 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
422 |
+
[2024-08-07 13:37:10,652][Main][INFO] - [train] Step 18850 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.161 | Weights_l2 --> 47148.504 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
423 |
+
[2024-08-07 13:38:41,717][Main][INFO] - [train] Step 18900 out of 25000 | Loss --> 0.751 | Grad_l2 --> 0.159 | Weights_l2 --> 47148.367 | Lr --> 0.000 | Seconds_per_step --> 1.821 |
|
424 |
+
[2024-08-07 13:40:07,531][Main][INFO] - [train] Step 18950 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.186 | Weights_l2 --> 47148.234 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
425 |
+
[2024-08-07 13:41:35,943][Main][INFO] - [train] Step 19000 out of 25000 | Loss --> 0.743 | Grad_l2 --> 0.260 | Weights_l2 --> 47148.096 | Lr --> 0.000 | Seconds_per_step --> 1.768 |
|
426 |
+
[2024-08-07 13:43:02,256][Main][INFO] - [train] Step 19050 out of 25000 | Loss --> 0.727 | Grad_l2 --> 0.218 | Weights_l2 --> 47147.959 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
427 |
+
[2024-08-07 13:44:28,589][Main][INFO] - [train] Step 19100 out of 25000 | Loss --> 0.697 | Grad_l2 --> 0.156 | Weights_l2 --> 47147.822 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
|
428 |
+
[2024-08-07 13:45:57,171][Main][INFO] - [train] Step 19150 out of 25000 | Loss --> 0.726 | Grad_l2 --> 0.205 | Weights_l2 --> 47147.685 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|
429 |
+
[2024-08-07 13:47:22,671][Main][INFO] - [train] Step 19200 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.212 | Weights_l2 --> 47147.547 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
430 |
+
[2024-08-07 13:48:50,958][Main][INFO] - [train] Step 19250 out of 25000 | Loss --> 0.702 | Grad_l2 --> 0.176 | Weights_l2 --> 47147.410 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
431 |
+
[2024-08-07 13:50:16,489][Main][INFO] - [train] Step 19300 out of 25000 | Loss --> 0.694 | Grad_l2 --> 0.156 | Weights_l2 --> 47147.273 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
432 |
+
[2024-08-07 13:51:42,217][Main][INFO] - [train] Step 19350 out of 25000 | Loss --> 0.678 | Grad_l2 --> 0.158 | Weights_l2 --> 47147.135 | Lr --> 0.000 | Seconds_per_step --> 1.715 |
|
433 |
+
[2024-08-07 13:53:10,925][Main][INFO] - [train] Step 19400 out of 25000 | Loss --> 0.708 | Grad_l2 --> 0.167 | Weights_l2 --> 47146.998 | Lr --> 0.000 | Seconds_per_step --> 1.774 |
|
434 |
+
[2024-08-07 13:54:37,089][Main][INFO] - [train] Step 19450 out of 25000 | Loss --> 0.721 | Grad_l2 --> 0.175 | Weights_l2 --> 47146.861 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
435 |
+
[2024-08-07 13:56:03,239][Main][INFO] - [train] Step 19500 out of 25000 | Loss --> 0.707 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.723 | Lr --> 0.000 | Seconds_per_step --> 1.723 |
|
436 |
+
[2024-08-07 13:57:31,897][Main][INFO] - [train] Step 19550 out of 25000 | Loss --> 0.684 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.586 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
437 |
+
[2024-08-07 13:58:57,731][Main][INFO] - [train] Step 19600 out of 25000 | Loss --> 0.725 | Grad_l2 --> 0.158 | Weights_l2 --> 47146.449 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
438 |
+
[2024-08-07 14:00:23,264][Main][INFO] - [train] Step 19650 out of 25000 | Loss --> 0.699 | Grad_l2 --> 0.163 | Weights_l2 --> 47146.312 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
439 |
+
[2024-08-07 14:01:51,434][Main][INFO] - [train] Step 19700 out of 25000 | Loss --> 0.723 | Grad_l2 --> 0.169 | Weights_l2 --> 47146.174 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
|
440 |
+
[2024-08-07 14:03:17,033][Main][INFO] - [train] Step 19750 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.190 | Weights_l2 --> 47146.037 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
441 |
+
[2024-08-07 14:04:42,658][Main][INFO] - [train] Step 19800 out of 25000 | Loss --> 0.686 | Grad_l2 --> 0.156 | Weights_l2 --> 47145.900 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
442 |
+
[2024-08-07 14:06:10,961][Main][INFO] - [train] Step 19850 out of 25000 | Loss --> 0.687 | Grad_l2 --> 0.160 | Weights_l2 --> 47145.766 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
443 |
+
[2024-08-07 14:07:36,890][Main][INFO] - [train] Step 19900 out of 25000 | Loss --> 0.698 | Grad_l2 --> 0.177 | Weights_l2 --> 47145.629 | Lr --> 0.000 | Seconds_per_step --> 1.719 |
|
444 |
+
[2024-08-07 14:09:03,084][Main][INFO] - [train] Step 19950 out of 25000 | Loss --> 0.720 | Grad_l2 --> 0.166 | Weights_l2 --> 47145.492 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
445 |
+
[2024-08-07 14:10:31,989][Main][INFO] - [train] Step 20000 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.157 | Weights_l2 --> 47145.354 | Lr --> 0.000 | Seconds_per_step --> 1.778 |
|
446 |
+
[2024-08-07 14:10:36,920][Main][INFO] - [eval] Step 20000 out of 25000 | Loss --> 0.811 | Accuracy --> 0.849 | Time --> 4.928 |
|
447 |
+
[2024-08-07 14:15:07,226][absl][INFO] - Using default tokenizer.
|
448 |
+
[2024-08-07 14:15:07,807][Main][INFO] - [test] Step 20000 out of 25000 | Rougel --> 25.044 | Time --> 270.886 |
|
449 |
+
[2024-08-07 14:15:07,811][accelerate.accelerator][INFO] - Saving current state to checkpoint-ft-20000
|
450 |
+
[2024-08-07 14:15:07,819][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
451 |
+
[2024-08-07 14:15:08,650][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-ft-20000/model.safetensors
|
452 |
+
[2024-08-07 14:15:09,813][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-ft-20000/optimizer.bin
|
453 |
+
[2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-ft-20000/scheduler.bin
|
454 |
+
[2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-ft-20000/sampler.bin
|
455 |
+
[2024-08-07 14:15:09,814][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-ft-20000/sampler_1.bin
|
456 |
+
[2024-08-07 14:15:09,815][accelerate.checkpointing][INFO] - Random states saved in checkpoint-ft-20000/random_states_0.pkl
|
457 |
+
[2024-08-07 14:16:36,010][Main][INFO] - [train] Step 20050 out of 25000 | Loss --> 0.686 | Grad_l2 --> 0.170 | Weights_l2 --> 47145.217 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
458 |
+
[2024-08-07 14:18:02,209][Main][INFO] - [train] Step 20100 out of 25000 | Loss --> 0.709 | Grad_l2 --> 0.167 | Weights_l2 --> 47145.080 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
459 |
+
[2024-08-07 14:19:30,971][Main][INFO] - [train] Step 20150 out of 25000 | Loss --> 0.682 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.943 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
460 |
+
[2024-08-07 14:20:56,968][Main][INFO] - [train] Step 20200 out of 25000 | Loss --> 0.662 | Grad_l2 --> 0.154 | Weights_l2 --> 47144.809 | Lr --> 0.000 | Seconds_per_step --> 1.720 |
|
461 |
+
[2024-08-07 14:22:22,622][Main][INFO] - [train] Step 20250 out of 25000 | Loss --> 0.668 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.668 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
462 |
+
[2024-08-07 14:23:50,811][Main][INFO] - [train] Step 20300 out of 25000 | Loss --> 0.648 | Grad_l2 --> 0.173 | Weights_l2 --> 47144.531 | Lr --> 0.000 | Seconds_per_step --> 1.764 |
|
463 |
+
[2024-08-07 14:25:16,348][Main][INFO] - [train] Step 20350 out of 25000 | Loss --> 0.653 | Grad_l2 --> 0.181 | Weights_l2 --> 47144.397 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
464 |
+
[2024-08-07 14:26:41,867][Main][INFO] - [train] Step 20400 out of 25000 | Loss --> 0.651 | Grad_l2 --> 0.159 | Weights_l2 --> 47144.256 | Lr --> 0.000 | Seconds_per_step --> 1.710 |
|
465 |
+
[2024-08-07 14:28:09,838][Main][INFO] - [train] Step 20450 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.268 | Weights_l2 --> 47144.123 | Lr --> 0.000 | Seconds_per_step --> 1.759 |
|
466 |
+
[2024-08-07 14:29:35,617][Main][INFO] - [train] Step 20500 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.167 | Weights_l2 --> 47143.985 | Lr --> 0.000 | Seconds_per_step --> 1.716 |
|
467 |
+
[2024-08-07 14:31:08,507][Main][INFO] - [train] Step 20550 out of 25000 | Loss --> 0.660 | Grad_l2 --> 0.154 | Weights_l2 --> 47143.848 | Lr --> 0.000 | Seconds_per_step --> 1.858 |
|
468 |
+
[2024-08-07 14:32:37,146][Main][INFO] - [train] Step 20600 out of 25000 | Loss --> 0.667 | Grad_l2 --> 0.173 | Weights_l2 --> 47143.711 | Lr --> 0.000 | Seconds_per_step --> 1.773 |
|
469 |
+
[2024-08-07 14:34:02,981][Main][INFO] - [train] Step 20650 out of 25000 | Loss --> 0.663 | Grad_l2 --> 0.157 | Weights_l2 --> 47143.573 | Lr --> 0.000 | Seconds_per_step --> 1.717 |
|
470 |
+
[2024-08-07 14:35:28,537][Main][INFO] - [train] Step 20700 out of 25000 | Loss --> 0.641 | Grad_l2 --> 0.158 | Weights_l2 --> 47143.440 | Lr --> 0.000 | Seconds_per_step --> 1.711 |
|
471 |
+
[2024-08-07 14:36:56,530][Main][INFO] - [train] Step 20750 out of 25000 | Loss --> 0.623 | Grad_l2 --> 0.185 | Weights_l2 --> 47143.303 | Lr --> 0.000 | Seconds_per_step --> 1.760 |
|
472 |
+
[2024-08-07 14:38:22,846][Main][INFO] - [train] Step 20800 out of 25000 | Loss --> 0.636 | Grad_l2 --> 0.152 | Weights_l2 --> 47143.162 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
473 |
+
[2024-08-07 14:39:48,731][Main][INFO] - [train] Step 20850 out of 25000 | Loss --> 0.630 | Grad_l2 --> 0.153 | Weights_l2 --> 47143.024 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
|
474 |
+
[2024-08-07 14:41:16,867][Main][INFO] - [train] Step 20900 out of 25000 | Loss --> 0.617 | Grad_l2 --> 0.353 | Weights_l2 --> 47142.887 | Lr --> 0.000 | Seconds_per_step --> 1.763 |
|
475 |
+
[2024-08-07 14:42:43,044][Main][INFO] - [train] Step 20950 out of 25000 | Loss --> 0.607 | Grad_l2 --> 0.154 | Weights_l2 --> 47142.750 | Lr --> 0.000 | Seconds_per_step --> 1.724 |
|
476 |
+
[2024-08-07 14:44:11,795][Main][INFO] - [train] Step 21000 out of 25000 | Loss --> 0.622 | Grad_l2 --> 0.153 | Weights_l2 --> 47142.612 | Lr --> 0.000 | Seconds_per_step --> 1.775 |
|
477 |
+
[2024-08-07 14:45:37,402][Main][INFO] - [train] Step 21050 out of 25000 | Loss --> 0.620 | Grad_l2 --> 0.165 | Weights_l2 --> 47142.475 | Lr --> 0.000 | Seconds_per_step --> 1.712 |
|
478 |
+
[2024-08-07 14:47:03,753][Main][INFO] - [train] Step 21100 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.157 | Weights_l2 --> 47142.341 | Lr --> 0.000 | Seconds_per_step --> 1.727 |
|
479 |
+
[2024-08-07 14:48:32,029][Main][INFO] - [train] Step 21150 out of 25000 | Loss --> 0.610 | Grad_l2 --> 0.151 | Weights_l2 --> 47142.204 | Lr --> 0.000 | Seconds_per_step --> 1.766 |
|
480 |
+
[2024-08-07 14:49:57,716][Main][INFO] - [train] Step 21200 out of 25000 | Loss --> 0.604 | Grad_l2 --> 0.191 | Weights_l2 --> 47142.067 | Lr --> 0.000 | Seconds_per_step --> 1.714 |
|
481 |
+
[2024-08-07 14:51:23,943][Main][INFO] - [train] Step 21250 out of 25000 | Loss --> 0.591 | Grad_l2 --> 0.151 | Weights_l2 --> 47141.930 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
482 |
+
[2024-08-07 14:52:51,971][Main][INFO] - [train] Step 21300 out of 25000 | Loss --> 0.583 | Grad_l2 --> 0.156 | Weights_l2 --> 47141.792 | Lr --> 0.000 | Seconds_per_step --> 1.761 |
|
483 |
+
[2024-08-07 14:54:17,847][Main][INFO] - [train] Step 21350 out of 25000 | Loss --> 0.571 | Grad_l2 --> 0.143 | Weights_l2 --> 47141.655 | Lr --> 0.000 | Seconds_per_step --> 1.718 |
|
484 |
+
[2024-08-07 14:55:44,079][Main][INFO] - [train] Step 21400 out of 25000 | Loss --> 0.590 | Grad_l2 --> 0.154 | Weights_l2 --> 47141.521 | Lr --> 0.000 | Seconds_per_step --> 1.725 |
|
485 |
+
[2024-08-07 14:57:13,087][Main][INFO] - [train] Step 21450 out of 25000 | Loss --> 0.573 | Grad_l2 --> 0.205 | Weights_l2 --> 47141.380 | Lr --> 0.000 | Seconds_per_step --> 1.780 |
|
486 |
+
[2024-08-07 14:58:39,380][Main][INFO] - [train] Step 21500 out of 25000 | Loss --> 0.576 | Grad_l2 --> 0.173 | Weights_l2 --> 47141.247 | Lr --> 0.000 | Seconds_per_step --> 1.726 |
|
487 |
+
[2024-08-07 15:00:05,026][Main][INFO] - [train] Step 21550 out of 25000 | Loss --> 0.588 | Grad_l2 --> 0.161 | Weights_l2 --> 47141.109 | Lr --> 0.000 | Seconds_per_step --> 1.713 |
|
488 |
+
[2024-08-07 15:01:33,627][Main][INFO] - [train] Step 21600 out of 25000 | Loss --> 0.572 | Grad_l2 --> 0.143 | Weights_l2 --> 47140.972 | Lr --> 0.000 | Seconds_per_step --> 1.772 |
|