2023-10-25 12:56:32,932 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,933 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 12:56:32,933 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
 - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Train: 14465 sentences
2023-10-25 12:56:32,934 (train_with_dev=False, train_with_test=False)
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Training Params:
2023-10-25 12:56:32,934  - learning_rate: "5e-05"
2023-10-25 12:56:32,934  - mini_batch_size: "8"
2023-10-25 12:56:32,934  - max_epochs: "10"
2023-10-25 12:56:32,934  - shuffle: "True"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Plugins:
2023-10-25 12:56:32,934  - TensorboardLogger
2023-10-25 12:56:32,934  - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 12:56:32,934  - metric: "('micro avg', 'f1-score')"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Computation:
2023-10-25 12:56:32,934  - compute on device: cuda:0
2023-10-25 12:56:32,934  - embedding storage: none
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3"
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:32,934 Logging anything other than scalars to TensorBoard is currently not supported.
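[Editor's note: the configuration above can be reproduced, in outline, with Flair's standard fine-tuning API. The following is a minimal sketch only; the exact hmbench training script is not part of this log, so the corpus arguments and tagger settings below are inferred from the base path (bs8, wsFalse, poolingfirst, layers-1, crfFalse) and the parameter dump, and should be treated as assumptions.]

from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# HIPE-2022 "letemps" French corpus; the dataset path above ends in
# "with_doc_seperator", which suggests document separators were enabled.
corpus = NER_HIPE_2022(dataset_name="letemps", language="fr",
                       add_document_separator=True)
label_dict = corpus.make_label_dictionary(label_type="ner")

# Historic multilingual BERT; "layers-1" and "poolingfirst" in the base path
# suggest last-layer embeddings with first-subtoken pooling.
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# Plain linear head on the embeddings ("crfFalse" in the base path); with no
# reprojection, the head is the Linear(768 -> 13) seen in the model dump.
tagger = SequenceTagger(
    hidden_size=256,  # unused when use_rnn=False
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# fine_tune() attaches a linear scheduler with warmup (warmup_fraction 0.1 by
# default), matching the LinearScheduler plugin listed above.
ModelTrainer(tagger, corpus).fine_tune(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3",
    learning_rate=5e-05,
    mini_batch_size=8,
    max_epochs=10,
)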
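[Editor's note: the LinearScheduler warms the learning rate up linearly over the first warmup_fraction of all batch steps and then decays it linearly to zero. With 1809 batches per epoch over 10 epochs, warmup covers 0.1 x 18090 = 1809 steps, i.e. exactly epoch 1, which is why the lr column below climbs to 5e-05 during the first epoch and falls afterwards. An illustrative sketch of that schedule (not Flair's internal code):]

def linear_schedule_lr(step: int, total_steps: int = 18090,
                       warmup_fraction: float = 0.1,
                       peak_lr: float = 5e-05) -> float:
    """Linear warmup to peak_lr, then linear decay to zero."""
    warmup_steps = int(total_steps * warmup_fraction)
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

# e.g. step 180 -> ~0.000005 and step 1800 -> ~0.000050, matching epoch 1 below;
# step 3609 (end of epoch 2) -> ~0.000044, matching the decay phase.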
2023-10-25 12:56:48,346 epoch 1 - iter 180/1809 - loss 1.10006084 - time (sec): 15.41 - samples/sec: 2402.62 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:57:04,474 epoch 1 - iter 360/1809 - loss 0.63052655 - time (sec): 31.54 - samples/sec: 2436.05 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:57:20,154 epoch 1 - iter 540/1809 - loss 0.47100990 - time (sec): 47.22 - samples/sec: 2427.01 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:57:35,999 epoch 1 - iter 720/1809 - loss 0.38400724 - time (sec): 63.06 - samples/sec: 2427.43 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:57:51,832 epoch 1 - iter 900/1809 - loss 0.33257800 - time (sec): 78.90 - samples/sec: 2412.15 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:58:07,695 epoch 1 - iter 1080/1809 - loss 0.29578277 - time (sec): 94.76 - samples/sec: 2405.28 - lr: 0.000030 - momentum: 0.000000
2023-10-25 12:58:23,282 epoch 1 - iter 1260/1809 - loss 0.26811077 - time (sec): 110.35 - samples/sec: 2407.63 - lr: 0.000035 - momentum: 0.000000
2023-10-25 12:58:39,159 epoch 1 - iter 1440/1809 - loss 0.24867204 - time (sec): 126.22 - samples/sec: 2401.15 - lr: 0.000040 - momentum: 0.000000
2023-10-25 12:58:54,972 epoch 1 - iter 1620/1809 - loss 0.23254968 - time (sec): 142.04 - samples/sec: 2394.45 - lr: 0.000045 - momentum: 0.000000
2023-10-25 12:59:10,963 epoch 1 - iter 1800/1809 - loss 0.21944741 - time (sec): 158.03 - samples/sec: 2393.36 - lr: 0.000050 - momentum: 0.000000
2023-10-25 12:59:11,727 ----------------------------------------------------------------------------------------------------
2023-10-25 12:59:11,727 EPOCH 1 done: loss 0.2190 - lr: 0.000050
2023-10-25 12:59:16,227 DEV : loss 0.0940776988863945 - f1-score (micro avg) 0.5283
2023-10-25 12:59:16,250 saving best model
2023-10-25 12:59:16,804 ----------------------------------------------------------------------------------------------------
2023-10-25 12:59:32,156 epoch 2 - iter 180/1809 - loss 0.07972278 - time (sec): 15.35 - samples/sec: 2377.73 - lr: 0.000049 - momentum: 0.000000
2023-10-25 12:59:48,323 epoch 2 - iter 360/1809 - loss 0.08152052 - time (sec): 31.52 - samples/sec: 2340.68 - lr: 0.000049 - momentum: 0.000000
2023-10-25 13:00:04,570 epoch 2 - iter 540/1809 - loss 0.08505170 - time (sec): 47.77 - samples/sec: 2366.58 - lr: 0.000048 - momentum: 0.000000
2023-10-25 13:00:20,330 epoch 2 - iter 720/1809 - loss 0.08594619 - time (sec): 63.53 - samples/sec: 2376.02 - lr: 0.000048 - momentum: 0.000000
2023-10-25 13:00:36,207 epoch 2 - iter 900/1809 - loss 0.08806352 - time (sec): 79.40 - samples/sec: 2380.43 - lr: 0.000047 - momentum: 0.000000
2023-10-25 13:00:51,928 epoch 2 - iter 1080/1809 - loss 0.08857237 - time (sec): 95.12 - samples/sec: 2379.41 - lr: 0.000047 - momentum: 0.000000
2023-10-25 13:01:07,495 epoch 2 - iter 1260/1809 - loss 0.08718027 - time (sec): 110.69 - samples/sec: 2386.51 - lr: 0.000046 - momentum: 0.000000
2023-10-25 13:01:23,189 epoch 2 - iter 1440/1809 - loss 0.08666178 - time (sec): 126.38 - samples/sec: 2392.20 - lr: 0.000046 - momentum: 0.000000
2023-10-25 13:01:38,806 epoch 2 - iter 1620/1809 - loss 0.08537196 - time (sec): 142.00 - samples/sec: 2395.91 - lr: 0.000045 - momentum: 0.000000
2023-10-25 13:01:55,003 epoch 2 - iter 1800/1809 - loss 0.08624945 - time (sec): 158.20 - samples/sec: 2390.41 - lr: 0.000044 - momentum: 0.000000
2023-10-25 13:01:55,826 ----------------------------------------------------------------------------------------------------
2023-10-25 13:01:55,826 EPOCH 2 done: loss 0.0862 - lr: 0.000044
2023-10-25 13:02:01,076 DEV : loss 0.13432565331459045 - f1-score (micro avg) 0.6025
2023-10-25 13:02:01,098 saving best model
2023-10-25 13:02:01,754 ----------------------------------------------------------------------------------------------------
2023-10-25 13:02:17,323 epoch 3 - iter 180/1809 - loss 0.05779259 - time (sec): 15.57 - samples/sec: 2359.77 - lr: 0.000044 - momentum: 0.000000
2023-10-25 13:02:33,718 epoch 3 - iter 360/1809 - loss 0.05644142 - time (sec): 31.96 - samples/sec: 2360.99 - lr: 0.000043 - momentum: 0.000000
2023-10-25 13:02:49,700 epoch 3 - iter 540/1809 - loss 0.06025166 - time (sec): 47.94 - samples/sec: 2376.32 - lr: 0.000043 - momentum: 0.000000
2023-10-25 13:03:05,515 epoch 3 - iter 720/1809 - loss 0.05963981 - time (sec): 63.76 - samples/sec: 2398.85 - lr: 0.000042 - momentum: 0.000000
2023-10-25 13:03:21,210 epoch 3 - iter 900/1809 - loss 0.05988365 - time (sec): 79.45 - samples/sec: 2391.83 - lr: 0.000042 - momentum: 0.000000
2023-10-25 13:03:37,246 epoch 3 - iter 1080/1809 - loss 0.06110576 - time (sec): 95.49 - samples/sec: 2395.55 - lr: 0.000041 - momentum: 0.000000
2023-10-25 13:03:53,081 epoch 3 - iter 1260/1809 - loss 0.05959679 - time (sec): 111.33 - samples/sec: 2397.88 - lr: 0.000041 - momentum: 0.000000
2023-10-25 13:04:08,904 epoch 3 - iter 1440/1809 - loss 0.05990670 - time (sec): 127.15 - samples/sec: 2392.99 - lr: 0.000040 - momentum: 0.000000
2023-10-25 13:04:24,144 epoch 3 - iter 1620/1809 - loss 0.05964992 - time (sec): 142.39 - samples/sec: 2380.45 - lr: 0.000039 - momentum: 0.000000
2023-10-25 13:04:40,347 epoch 3 - iter 1800/1809 - loss 0.06114775 - time (sec): 158.59 - samples/sec: 2382.72 - lr: 0.000039 - momentum: 0.000000
2023-10-25 13:04:41,213 ----------------------------------------------------------------------------------------------------
2023-10-25 13:04:41,213 EPOCH 3 done: loss 0.0611 - lr: 0.000039
2023-10-25 13:04:46,486 DEV : loss 0.1459859311580658 - f1-score (micro avg) 0.6574
2023-10-25 13:04:46,509 saving best model
2023-10-25 13:04:47,256 ----------------------------------------------------------------------------------------------------
2023-10-25 13:05:03,087 epoch 4 - iter 180/1809 - loss 0.03569500 - time (sec): 15.83 - samples/sec: 2388.59 - lr: 0.000038 - momentum: 0.000000
2023-10-25 13:05:18,721 epoch 4 - iter 360/1809 - loss 0.03919591 - time (sec): 31.46 - samples/sec: 2389.76 - lr: 0.000038 - momentum: 0.000000
2023-10-25 13:05:34,881 epoch 4 - iter 540/1809 - loss 0.04102332 - time (sec): 47.62 - samples/sec: 2380.28 - lr: 0.000037 - momentum: 0.000000
2023-10-25 13:05:50,661 epoch 4 - iter 720/1809 - loss 0.04009188 - time (sec): 63.40 - samples/sec: 2377.07 - lr: 0.000037 - momentum: 0.000000
2023-10-25 13:06:06,635 epoch 4 - iter 900/1809 - loss 0.04101052 - time (sec): 79.38 - samples/sec: 2383.76 - lr: 0.000036 - momentum: 0.000000
2023-10-25 13:06:22,324 epoch 4 - iter 1080/1809 - loss 0.04250899 - time (sec): 95.07 - samples/sec: 2389.95 - lr: 0.000036 - momentum: 0.000000
2023-10-25 13:06:37,963 epoch 4 - iter 1260/1809 - loss 0.04333909 - time (sec): 110.71 - samples/sec: 2386.49 - lr: 0.000035 - momentum: 0.000000
2023-10-25 13:06:53,854 epoch 4 - iter 1440/1809 - loss 0.04374822 - time (sec): 126.60 - samples/sec: 2380.16 - lr: 0.000034 - momentum: 0.000000
2023-10-25 13:07:09,575 epoch 4 - iter 1620/1809 - loss 0.04432814 - time (sec): 142.32 - samples/sec: 2380.91 - lr: 0.000034 - momentum: 0.000000
2023-10-25 13:07:25,700 epoch 4 - iter 1800/1809 - loss 0.04524228 - time (sec): 158.44 - samples/sec: 2383.85 - lr: 0.000033 - momentum: 0.000000
2023-10-25 13:07:26,573 ----------------------------------------------------------------------------------------------------
2023-10-25 13:07:26,573 EPOCH 4 done: loss 0.0451 - lr: 0.000033
2023-10-25 13:07:31,848 DEV : loss 0.20192305743694305 - f1-score (micro avg) 0.6289
2023-10-25 13:07:31,871 ----------------------------------------------------------------------------------------------------
2023-10-25 13:07:47,452 epoch 5 - iter 180/1809 - loss 0.02734931 - time (sec): 15.58 - samples/sec: 2412.22 - lr: 0.000033 - momentum: 0.000000
2023-10-25 13:08:03,819 epoch 5 - iter 360/1809 - loss 0.02324021 - time (sec): 31.95 - samples/sec: 2392.25 - lr: 0.000032 - momentum: 0.000000
2023-10-25 13:08:19,580 epoch 5 - iter 540/1809 - loss 0.02650141 - time (sec): 47.71 - samples/sec: 2403.80 - lr: 0.000032 - momentum: 0.000000
2023-10-25 13:08:35,226 epoch 5 - iter 720/1809 - loss 0.02704811 - time (sec): 63.35 - samples/sec: 2417.63 - lr: 0.000031 - momentum: 0.000000
2023-10-25 13:08:51,415 epoch 5 - iter 900/1809 - loss 0.02877765 - time (sec): 79.54 - samples/sec: 2406.90 - lr: 0.000031 - momentum: 0.000000
2023-10-25 13:09:07,224 epoch 5 - iter 1080/1809 - loss 0.02961531 - time (sec): 95.35 - samples/sec: 2400.83 - lr: 0.000030 - momentum: 0.000000
2023-10-25 13:09:23,022 epoch 5 - iter 1260/1809 - loss 0.02938927 - time (sec): 111.15 - samples/sec: 2396.43 - lr: 0.000029 - momentum: 0.000000
2023-10-25 13:09:38,603 epoch 5 - iter 1440/1809 - loss 0.02946213 - time (sec): 126.73 - samples/sec: 2401.26 - lr: 0.000029 - momentum: 0.000000
2023-10-25 13:09:54,292 epoch 5 - iter 1620/1809 - loss 0.02947805 - time (sec): 142.42 - samples/sec: 2396.84 - lr: 0.000028 - momentum: 0.000000
2023-10-25 13:10:10,223 epoch 5 - iter 1800/1809 - loss 0.02955292 - time (sec): 158.35 - samples/sec: 2389.40 - lr: 0.000028 - momentum: 0.000000
2023-10-25 13:10:10,953 ----------------------------------------------------------------------------------------------------
2023-10-25 13:10:10,953 EPOCH 5 done: loss 0.0295 - lr: 0.000028
2023-10-25 13:10:15,727 DEV : loss 0.2949555218219757 - f1-score (micro avg) 0.6355
2023-10-25 13:10:15,750 ----------------------------------------------------------------------------------------------------
2023-10-25 13:10:31,928 epoch 6 - iter 180/1809 - loss 0.01655256 - time (sec): 16.18 - samples/sec: 2405.34 - lr: 0.000027 - momentum: 0.000000
2023-10-25 13:10:47,816 epoch 6 - iter 360/1809 - loss 0.01946603 - time (sec): 32.07 - samples/sec: 2373.06 - lr: 0.000027 - momentum: 0.000000
2023-10-25 13:11:03,540 epoch 6 - iter 540/1809 - loss 0.01771531 - time (sec): 47.79 - samples/sec: 2366.90 - lr: 0.000026 - momentum: 0.000000
2023-10-25 13:11:19,762 epoch 6 - iter 720/1809 - loss 0.01794652 - time (sec): 64.01 - samples/sec: 2376.16 - lr: 0.000026 - momentum: 0.000000
2023-10-25 13:11:35,513 epoch 6 - iter 900/1809 - loss 0.01902434 - time (sec): 79.76 - samples/sec: 2373.75 - lr: 0.000025 - momentum: 0.000000
2023-10-25 13:11:51,427 epoch 6 - iter 1080/1809 - loss 0.01867401 - time (sec): 95.68 - samples/sec: 2377.37 - lr: 0.000024 - momentum: 0.000000
2023-10-25 13:12:07,127 epoch 6 - iter 1260/1809 - loss 0.01897470 - time (sec): 111.38 - samples/sec: 2382.57 - lr: 0.000024 - momentum: 0.000000
2023-10-25 13:12:23,192 epoch 6 - iter 1440/1809 - loss 0.01911851 - time (sec): 127.44 - samples/sec: 2384.79 - lr: 0.000023 - momentum: 0.000000
2023-10-25 13:12:39,025 epoch 6 - iter 1620/1809 - loss 0.01999373 - time (sec): 143.27 - samples/sec: 2382.18 - lr: 0.000023 - momentum: 0.000000
2023-10-25 13:12:54,521 epoch 6 - iter 1800/1809 - loss 0.01982448 - time (sec): 158.77 - samples/sec: 2381.69 - lr: 0.000022 - momentum: 0.000000
2023-10-25 13:12:55,269 ----------------------------------------------------------------------------------------------------
2023-10-25 13:12:55,269 EPOCH 6 done: loss 0.0198 - lr: 0.000022
2023-10-25 13:13:00,034 DEV : loss 0.347699373960495 - f1-score (micro avg) 0.6493
2023-10-25 13:13:00,057 ----------------------------------------------------------------------------------------------------
2023-10-25 13:13:15,623 epoch 7 - iter 180/1809 - loss 0.01012341 - time (sec): 15.57 - samples/sec: 2404.35 - lr: 0.000022 - momentum: 0.000000
2023-10-25 13:13:31,454 epoch 7 - iter 360/1809 - loss 0.01305697 - time (sec): 31.40 - samples/sec: 2373.59 - lr: 0.000021 - momentum: 0.000000
2023-10-25 13:13:47,017 epoch 7 - iter 540/1809 - loss 0.01300877 - time (sec): 46.96 - samples/sec: 2377.11 - lr: 0.000021 - momentum: 0.000000
2023-10-25 13:14:02,889 epoch 7 - iter 720/1809 - loss 0.01389528 - time (sec): 62.83 - samples/sec: 2376.06 - lr: 0.000020 - momentum: 0.000000
2023-10-25 13:14:18,735 epoch 7 - iter 900/1809 - loss 0.01408907 - time (sec): 78.68 - samples/sec: 2375.30 - lr: 0.000019 - momentum: 0.000000
2023-10-25 13:14:34,859 epoch 7 - iter 1080/1809 - loss 0.01370295 - time (sec): 94.80 - samples/sec: 2382.48 - lr: 0.000019 - momentum: 0.000000
2023-10-25 13:14:50,754 epoch 7 - iter 1260/1809 - loss 0.01366540 - time (sec): 110.70 - samples/sec: 2392.99 - lr: 0.000018 - momentum: 0.000000
2023-10-25 13:15:06,865 epoch 7 - iter 1440/1809 - loss 0.01400641 - time (sec): 126.81 - samples/sec: 2390.49 - lr: 0.000018 - momentum: 0.000000
2023-10-25 13:15:22,780 epoch 7 - iter 1620/1809 - loss 0.01385209 - time (sec): 142.72 - samples/sec: 2385.21 - lr: 0.000017 - momentum: 0.000000
2023-10-25 13:15:38,684 epoch 7 - iter 1800/1809 - loss 0.01412168 - time (sec): 158.63 - samples/sec: 2383.30 - lr: 0.000017 - momentum: 0.000000
2023-10-25 13:15:39,407 ----------------------------------------------------------------------------------------------------
2023-10-25 13:15:39,408 EPOCH 7 done: loss 0.0141 - lr: 0.000017
2023-10-25 13:15:44,698 DEV : loss 0.35314860939979553 - f1-score (micro avg) 0.6557
2023-10-25 13:15:44,721 ----------------------------------------------------------------------------------------------------
2023-10-25 13:16:00,867 epoch 8 - iter 180/1809 - loss 0.01022784 - time (sec): 16.14 - samples/sec: 2412.56 - lr: 0.000016 - momentum: 0.000000
2023-10-25 13:16:17,042 epoch 8 - iter 360/1809 - loss 0.01058094 - time (sec): 32.32 - samples/sec: 2352.04 - lr: 0.000016 - momentum: 0.000000
2023-10-25 13:16:33,062 epoch 8 - iter 540/1809 - loss 0.01011424 - time (sec): 48.34 - samples/sec: 2361.84 - lr: 0.000015 - momentum: 0.000000
2023-10-25 13:16:49,047 epoch 8 - iter 720/1809 - loss 0.00966851 - time (sec): 64.32 - samples/sec: 2375.94 - lr: 0.000014 - momentum: 0.000000
2023-10-25 13:17:04,894 epoch 8 - iter 900/1809 - loss 0.00933864 - time (sec): 80.17 - samples/sec: 2376.95 - lr: 0.000014 - momentum: 0.000000
2023-10-25 13:17:20,529 epoch 8 - iter 1080/1809 - loss 0.00951050 - time (sec): 95.81 - samples/sec: 2364.76 - lr: 0.000013 - momentum: 0.000000
2023-10-25 13:17:36,142 epoch 8 - iter 1260/1809 - loss 0.00966767 - time (sec): 111.42 - samples/sec: 2370.92 - lr: 0.000013 - momentum: 0.000000
2023-10-25 13:17:52,143 epoch 8 - iter 1440/1809 - loss 0.00962221 - time (sec): 127.42 - samples/sec: 2379.62 - lr: 0.000012 - momentum: 0.000000
2023-10-25 13:18:07,519 epoch 8 - iter 1620/1809 - loss 0.00957778 - time (sec): 142.80 - samples/sec: 2379.27 - lr: 0.000012 - momentum: 0.000000
2023-10-25 13:18:23,395 epoch 8 - iter 1800/1809 - loss 0.00960260 - time (sec): 158.67 - samples/sec: 2382.90 - lr: 0.000011 - momentum: 0.000000
2023-10-25 13:18:24,169 ----------------------------------------------------------------------------------------------------
2023-10-25 13:18:24,169 EPOCH 8 done: loss 0.0096 - lr: 0.000011
2023-10-25 13:18:29,463 DEV : loss 0.4076786935329437 - f1-score (micro avg) 0.6491
2023-10-25 13:18:29,486 ----------------------------------------------------------------------------------------------------
2023-10-25 13:18:44,946 epoch 9 - iter 180/1809 - loss 0.00346298 - time (sec): 15.46 - samples/sec: 2391.49 - lr: 0.000011 - momentum: 0.000000
2023-10-25 13:19:01,070 epoch 9 - iter 360/1809 - loss 0.00386564 - time (sec): 31.58 - samples/sec: 2399.66 - lr: 0.000010 - momentum: 0.000000
2023-10-25 13:19:16,951 epoch 9 - iter 540/1809 - loss 0.00562997 - time (sec): 47.46 - samples/sec: 2400.68 - lr: 0.000009 - momentum: 0.000000
2023-10-25 13:19:32,422 epoch 9 - iter 720/1809 - loss 0.00546924 - time (sec): 62.94 - samples/sec: 2394.18 - lr: 0.000009 - momentum: 0.000000
2023-10-25 13:19:48,301 epoch 9 - iter 900/1809 - loss 0.00581289 - time (sec): 78.81 - samples/sec: 2392.02 - lr: 0.000008 - momentum: 0.000000
2023-10-25 13:20:04,825 epoch 9 - iter 1080/1809 - loss 0.00611858 - time (sec): 95.34 - samples/sec: 2384.94 - lr: 0.000008 - momentum: 0.000000
2023-10-25 13:20:21,183 epoch 9 - iter 1260/1809 - loss 0.00643895 - time (sec): 111.70 - samples/sec: 2378.78 - lr: 0.000007 - momentum: 0.000000
2023-10-25 13:20:37,254 epoch 9 - iter 1440/1809 - loss 0.00648270 - time (sec): 127.77 - samples/sec: 2383.17 - lr: 0.000007 - momentum: 0.000000
2023-10-25 13:20:52,352 epoch 9 - iter 1620/1809 - loss 0.00637081 - time (sec): 142.86 - samples/sec: 2377.49 - lr: 0.000006 - momentum: 0.000000
2023-10-25 13:21:08,288 epoch 9 - iter 1800/1809 - loss 0.00619420 - time (sec): 158.80 - samples/sec: 2382.08 - lr: 0.000006 - momentum: 0.000000
2023-10-25 13:21:09,058 ----------------------------------------------------------------------------------------------------
2023-10-25 13:21:09,058 EPOCH 9 done: loss 0.0062 - lr: 0.000006
2023-10-25 13:21:14,367 DEV : loss 0.4059355556964874 - f1-score (micro avg) 0.6474
2023-10-25 13:21:14,390 ----------------------------------------------------------------------------------------------------
2023-10-25 13:21:30,210 epoch 10 - iter 180/1809 - loss 0.00238887 - time (sec): 15.82 - samples/sec: 2406.56 - lr: 0.000005 - momentum: 0.000000
2023-10-25 13:21:45,960 epoch 10 - iter 360/1809 - loss 0.00251156 - time (sec): 31.57 - samples/sec: 2405.41 - lr: 0.000004 - momentum: 0.000000
2023-10-25 13:22:01,793 epoch 10 - iter 540/1809 - loss 0.00290608 - time (sec): 47.40 - samples/sec: 2395.50 - lr: 0.000004 - momentum: 0.000000
2023-10-25 13:22:17,530 epoch 10 - iter 720/1809 - loss 0.00315655 - time (sec): 63.14 - samples/sec: 2387.64 - lr: 0.000003 - momentum: 0.000000
2023-10-25 13:22:33,117 epoch 10 - iter 900/1809 - loss 0.00334403 - time (sec): 78.73 - samples/sec: 2379.43 - lr: 0.000003 - momentum: 0.000000
2023-10-25 13:22:48,770 epoch 10 - iter 1080/1809 - loss 0.00312517 - time (sec): 94.38 - samples/sec: 2380.87 - lr: 0.000002 - momentum: 0.000000
2023-10-25 13:23:04,901 epoch 10 - iter 1260/1809 - loss 0.00323274 - time (sec): 110.51 - samples/sec: 2383.07 - lr: 0.000002 - momentum: 0.000000
2023-10-25 13:23:21,257 epoch 10 - iter 1440/1809 - loss 0.00317762 - time (sec): 126.87 - samples/sec: 2385.06 - lr: 0.000001 - momentum: 0.000000
2023-10-25 13:23:37,428 epoch 10 - iter 1620/1809 - loss 0.00332409 - time (sec): 143.04 - samples/sec: 2383.07 - lr: 0.000001 - momentum: 0.000000
2023-10-25 13:23:52,987 epoch 10 - iter 1800/1809 - loss 0.00349659 - time (sec): 158.60 - samples/sec: 2384.70 - lr: 0.000000 - momentum: 0.000000
2023-10-25 13:23:53,696 ----------------------------------------------------------------------------------------------------
2023-10-25 13:23:53,697 EPOCH 10 done: loss 0.0035 - lr: 0.000000
2023-10-25 13:23:59,011 DEV : loss 0.4234822392463684 - f1-score (micro avg) 0.6419
2023-10-25 13:23:59,603 ----------------------------------------------------------------------------------------------------
2023-10-25 13:23:59,604 Loading model from best epoch ...
2023-10-25 13:24:01,370 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
2023-10-25 13:24:07,107 Results:
- F-score (micro) 0.6591
- F-score (macro) 0.4663
- Accuracy 0.5014

By class:
              precision    recall  f1-score   support

         loc     0.6863    0.7479    0.7158       591
        pers     0.5734    0.7115    0.6350       357
         org     0.5000    0.0253    0.0482        79

   micro avg     0.6398    0.6796    0.6591      1027
   macro avg     0.5866    0.4949    0.4663      1027
weighted avg     0.6327    0.6796    0.6364      1027

2023-10-25 13:24:07,107 ----------------------------------------------------------------------------------------------------
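[Editor's note: the summary rows of the table can be re-derived from its per-class entries: micro-F1 is the harmonic mean of the micro precision and recall, and macro-F1 is the unweighted mean of the three class F1 scores. A quick check in Python:]

# micro avg: harmonic mean of micro precision/recall over all 1027 test entities
p, r = 0.6398, 0.6796
print(round(2 * p * r / (p + r), 4))             # 0.6591, the "F-score (micro)" above

# macro avg: unweighted mean of loc/pers/org f1, dragged down by org (support 79)
print(round((0.7158 + 0.6350 + 0.0482) / 3, 4))  # 0.4663, the "F-score (macro)" above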
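[Editor's note: the checkpoint evaluated above is best-model.pt from epoch 3, the last "saving best model" entry (dev micro-F1 0.6574); dev scores never recover after that. Loading it for inference follows the usual Flair pattern; a minimal sketch, with the example sentence chosen purely for illustration:]

from flair.data import Sentence
from flair.models import SequenceTagger

# best-model.pt sits under the training base path shown at the top of this log
tagger = SequenceTagger.load(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-3/best-model.pt"
)

sentence = Sentence("Le Temps paraît à Genève .")  # hypothetical input, not from the corpus
tagger.predict(sentence)

# spans are decoded from the 13-tag BIOES dictionary (S-/B-/E-/I- over loc, pers, org)
for span in sentence.get_spans("ner"):
    print(span.text, span.tag, span.score)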