2023-10-23 20:53:59,183 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,184 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=21, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-23 20:53:59,184 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,184 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences
 - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator
2023-10-23 20:53:59,184 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,184 Train: 3575 sentences
2023-10-23 20:53:59,184 (train_with_dev=False, train_with_test=False)
2023-10-23 20:53:59,184 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,184 Training Params:
2023-10-23 20:53:59,184  - learning_rate: "5e-05"
2023-10-23 20:53:59,184  - mini_batch_size: "4"
2023-10-23 20:53:59,184  - max_epochs: "10"
2023-10-23 20:53:59,184  - shuffle: "True"
2023-10-23 20:53:59,184 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,184 Plugins:
2023-10-23 20:53:59,185  - TensorboardLogger
2023-10-23 20:53:59,185  - LinearScheduler | warmup_fraction: '0.1'
2023-10-23 20:53:59,185 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,185 Final evaluation on model from best epoch (best-model.pt)
2023-10-23 20:53:59,185  - metric: "('micro avg', 'f1-score')"
2023-10-23 20:53:59,185 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,185 Computation:
2023-10-23 20:53:59,185  - compute on device: cuda:0
2023-10-23 20:53:59,185  - embedding storage: none
2023-10-23 20:53:59,185 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,185 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-2"
2023-10-23 20:53:59,185 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,185 ----------------------------------------------------------------------------------------------------
2023-10-23 20:53:59,185 Logging anything other than scalars to TensorBoard is currently not supported.
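Note on reproducing this configuration: the dump above corresponds to fine-tuning a TransformerWordEmbeddings + linear-head SequenceTagger (no CRF, no RNN) on the German HIPE-2020 split of the HIPE-2022 data with Flair. The sketch below is a minimal reconstruction under that assumption, not the original training script; in particular the NER_HIPE_2022 keyword arguments and the make_label_dictionary call are illustrative and may need adjusting to the installed Flair version.

from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# HIPE-2020 German subset of the HIPE-2022 shared-task data (v2.1), matching the
# corpus statistics in the log (3575 train / 1235 dev / 1266 test sentences).
corpus = NER_HIPE_2022(dataset_name="hipe2020", language="de", version="v2.1")

# Transformer embeddings over the historic multilingual BERT checkpoint;
# "first" subtoken pooling and only the last layer, as encoded in the base path.
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# Linear tag head without CRF or RNN (crfFalse in the base path); the label
# dictionary ends up with the 21 BIOES tags listed at the end of the log.
label_dict = corpus.make_label_dictionary(label_type="ner")
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# fine_tune() runs a linear warmup/decay schedule (warmup_fraction 0.1 in this
# run, per the "Plugins" block above) with the parameters reported in the log.
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-"
    "bs4-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-2",
    learning_rate=5e-05,
    mini_batch_size=4,
    max_epochs=10,
)

With 894 mini-batches per epoch and 10 epochs (8,940 optimizer steps), warmup_fraction 0.1 means the learning rate climbs linearly from 0 to 5e-05 over roughly the first 894 steps (hence lr: 0.000005 at iteration 89 of epoch 1) and then decays linearly back to 0, which matches the lr column in the epoch logs below.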
2023-10-23 20:54:04,695 epoch 1 - iter 89/894 - loss 2.05648241 - time (sec): 5.51 - samples/sec: 1445.62 - lr: 0.000005 - momentum: 0.000000
2023-10-23 20:54:10,399 epoch 1 - iter 178/894 - loss 1.21404748 - time (sec): 11.21 - samples/sec: 1485.36 - lr: 0.000010 - momentum: 0.000000
2023-10-23 20:54:16,086 epoch 1 - iter 267/894 - loss 0.90628017 - time (sec): 16.90 - samples/sec: 1488.84 - lr: 0.000015 - momentum: 0.000000
2023-10-23 20:54:21,773 epoch 1 - iter 356/894 - loss 0.76044414 - time (sec): 22.59 - samples/sec: 1492.63 - lr: 0.000020 - momentum: 0.000000
2023-10-23 20:54:27,374 epoch 1 - iter 445/894 - loss 0.66552201 - time (sec): 28.19 - samples/sec: 1501.30 - lr: 0.000025 - momentum: 0.000000
2023-10-23 20:54:32,883 epoch 1 - iter 534/894 - loss 0.60071179 - time (sec): 33.70 - samples/sec: 1495.06 - lr: 0.000030 - momentum: 0.000000
2023-10-23 20:54:38,479 epoch 1 - iter 623/894 - loss 0.54372354 - time (sec): 39.29 - samples/sec: 1500.03 - lr: 0.000035 - momentum: 0.000000
2023-10-23 20:54:44,124 epoch 1 - iter 712/894 - loss 0.50080113 - time (sec): 44.94 - samples/sec: 1504.45 - lr: 0.000040 - momentum: 0.000000
2023-10-23 20:54:50,084 epoch 1 - iter 801/894 - loss 0.46387916 - time (sec): 50.90 - samples/sec: 1517.16 - lr: 0.000045 - momentum: 0.000000
2023-10-23 20:54:55,699 epoch 1 - iter 890/894 - loss 0.43757430 - time (sec): 56.51 - samples/sec: 1526.27 - lr: 0.000050 - momentum: 0.000000
2023-10-23 20:54:55,932 ----------------------------------------------------------------------------------------------------
2023-10-23 20:54:55,932 EPOCH 1 done: loss 0.4371 - lr: 0.000050
2023-10-23 20:55:00,775 DEV : loss 0.1591983586549759 - f1-score (micro avg) 0.6143
2023-10-23 20:55:00,795 saving best model
2023-10-23 20:55:01,267 ----------------------------------------------------------------------------------------------------
2023-10-23 20:55:06,720 epoch 2 - iter 89/894 - loss 0.16974048 - time (sec): 5.45 - samples/sec: 1518.20 - lr: 0.000049 - momentum: 0.000000
2023-10-23 20:55:12,416 epoch 2 - iter 178/894 - loss 0.16143225 - time (sec): 11.15 - samples/sec: 1525.41 - lr: 0.000049 - momentum: 0.000000
2023-10-23 20:55:18,123 epoch 2 - iter 267/894 - loss 0.15180311 - time (sec): 16.85 - samples/sec: 1537.73 - lr: 0.000048 - momentum: 0.000000
2023-10-23 20:55:23,889 epoch 2 - iter 356/894 - loss 0.15236701 - time (sec): 22.62 - samples/sec: 1532.24 - lr: 0.000048 - momentum: 0.000000
2023-10-23 20:55:29,405 epoch 2 - iter 445/894 - loss 0.14334298 - time (sec): 28.14 - samples/sec: 1509.83 - lr: 0.000047 - momentum: 0.000000
2023-10-23 20:55:35,194 epoch 2 - iter 534/894 - loss 0.15247437 - time (sec): 33.93 - samples/sec: 1517.25 - lr: 0.000047 - momentum: 0.000000
2023-10-23 20:55:40,912 epoch 2 - iter 623/894 - loss 0.14975823 - time (sec): 39.64 - samples/sec: 1525.54 - lr: 0.000046 - momentum: 0.000000
2023-10-23 20:55:46,390 epoch 2 - iter 712/894 - loss 0.14876006 - time (sec): 45.12 - samples/sec: 1514.34 - lr: 0.000046 - momentum: 0.000000
2023-10-23 20:55:52,336 epoch 2 - iter 801/894 - loss 0.14878889 - time (sec): 51.07 - samples/sec: 1525.36 - lr: 0.000045 - momentum: 0.000000
2023-10-23 20:55:57,878 epoch 2 - iter 890/894 - loss 0.14578259 - time (sec): 56.61 - samples/sec: 1520.97 - lr: 0.000044 - momentum: 0.000000
2023-10-23 20:55:58,141 ----------------------------------------------------------------------------------------------------
2023-10-23 20:55:58,141 EPOCH 2 done: loss 0.1456 - lr: 0.000044
2023-10-23 20:56:04,638 DEV : loss 0.22977472841739655 - f1-score (micro avg) 0.6806
2023-10-23 20:56:04,659 saving best model
2023-10-23 20:56:05,256 ----------------------------------------------------------------------------------------------------
2023-10-23 20:56:11,024 epoch 3 - iter 89/894 - loss 0.10865051 - time (sec): 5.77 - samples/sec: 1553.30 - lr: 0.000044 - momentum: 0.000000
2023-10-23 20:56:16,764 epoch 3 - iter 178/894 - loss 0.11003605 - time (sec): 11.51 - samples/sec: 1535.17 - lr: 0.000043 - momentum: 0.000000
2023-10-23 20:56:22,515 epoch 3 - iter 267/894 - loss 0.10395964 - time (sec): 17.26 - samples/sec: 1555.50 - lr: 0.000043 - momentum: 0.000000
2023-10-23 20:56:28,133 epoch 3 - iter 356/894 - loss 0.09794376 - time (sec): 22.88 - samples/sec: 1524.17 - lr: 0.000042 - momentum: 0.000000
2023-10-23 20:56:33,749 epoch 3 - iter 445/894 - loss 0.09527822 - time (sec): 28.49 - samples/sec: 1527.06 - lr: 0.000042 - momentum: 0.000000
2023-10-23 20:56:39,381 epoch 3 - iter 534/894 - loss 0.09373453 - time (sec): 34.12 - samples/sec: 1519.68 - lr: 0.000041 - momentum: 0.000000
2023-10-23 20:56:44,925 epoch 3 - iter 623/894 - loss 0.09364125 - time (sec): 39.67 - samples/sec: 1511.34 - lr: 0.000041 - momentum: 0.000000
2023-10-23 20:56:50,828 epoch 3 - iter 712/894 - loss 0.09013864 - time (sec): 45.57 - samples/sec: 1518.87 - lr: 0.000040 - momentum: 0.000000
2023-10-23 20:56:56,436 epoch 3 - iter 801/894 - loss 0.09096854 - time (sec): 51.18 - samples/sec: 1513.59 - lr: 0.000039 - momentum: 0.000000
2023-10-23 20:57:02,199 epoch 3 - iter 890/894 - loss 0.08996061 - time (sec): 56.94 - samples/sec: 1514.41 - lr: 0.000039 - momentum: 0.000000
2023-10-23 20:57:02,431 ----------------------------------------------------------------------------------------------------
2023-10-23 20:57:02,431 EPOCH 3 done: loss 0.0898 - lr: 0.000039
2023-10-23 20:57:08,939 DEV : loss 0.19408421218395233 - f1-score (micro avg) 0.7459
2023-10-23 20:57:08,960 saving best model
2023-10-23 20:57:09,557 ----------------------------------------------------------------------------------------------------
2023-10-23 20:57:15,087 epoch 4 - iter 89/894 - loss 0.07126826 - time (sec): 5.53 - samples/sec: 1467.85 - lr: 0.000038 - momentum: 0.000000
2023-10-23 20:57:20,786 epoch 4 - iter 178/894 - loss 0.06075018 - time (sec): 11.23 - samples/sec: 1492.76 - lr: 0.000038 - momentum: 0.000000
2023-10-23 20:57:26,437 epoch 4 - iter 267/894 - loss 0.05637859 - time (sec): 16.88 - samples/sec: 1508.80 - lr: 0.000037 - momentum: 0.000000
2023-10-23 20:57:32,295 epoch 4 - iter 356/894 - loss 0.05891890 - time (sec): 22.74 - samples/sec: 1518.14 - lr: 0.000037 - momentum: 0.000000
2023-10-23 20:57:38,003 epoch 4 - iter 445/894 - loss 0.06114642 - time (sec): 28.44 - samples/sec: 1512.47 - lr: 0.000036 - momentum: 0.000000
2023-10-23 20:57:43,730 epoch 4 - iter 534/894 - loss 0.06236999 - time (sec): 34.17 - samples/sec: 1514.92 - lr: 0.000036 - momentum: 0.000000
2023-10-23 20:57:49,533 epoch 4 - iter 623/894 - loss 0.06392335 - time (sec): 39.97 - samples/sec: 1525.05 - lr: 0.000035 - momentum: 0.000000
2023-10-23 20:57:55,255 epoch 4 - iter 712/894 - loss 0.06295692 - time (sec): 45.70 - samples/sec: 1526.07 - lr: 0.000034 - momentum: 0.000000
2023-10-23 20:58:00,800 epoch 4 - iter 801/894 - loss 0.06292309 - time (sec): 51.24 - samples/sec: 1518.26 - lr: 0.000034 - momentum: 0.000000
2023-10-23 20:58:06,323 epoch 4 - iter 890/894 - loss 0.06266573 - time (sec): 56.76 - samples/sec: 1518.51 - lr: 0.000033 - momentum: 0.000000
2023-10-23 20:58:06,577 ----------------------------------------------------------------------------------------------------
2023-10-23 20:58:06,577 EPOCH 4 done: loss 0.0631 - lr: 0.000033
2023-10-23 20:58:13,122 DEV : loss 0.22660154104232788 - f1-score (micro avg) 0.7247
2023-10-23 20:58:13,143 ----------------------------------------------------------------------------------------------------
2023-10-23 20:58:18,820 epoch 5 - iter 89/894 - loss 0.03574570 - time (sec): 5.68 - samples/sec: 1548.77 - lr: 0.000033 - momentum: 0.000000
2023-10-23 20:58:24,508 epoch 5 - iter 178/894 - loss 0.03970603 - time (sec): 11.36 - samples/sec: 1500.75 - lr: 0.000032 - momentum: 0.000000
2023-10-23 20:58:30,000 epoch 5 - iter 267/894 - loss 0.04068036 - time (sec): 16.86 - samples/sec: 1487.29 - lr: 0.000032 - momentum: 0.000000
2023-10-23 20:58:35,851 epoch 5 - iter 356/894 - loss 0.04525724 - time (sec): 22.71 - samples/sec: 1521.43 - lr: 0.000031 - momentum: 0.000000
2023-10-23 20:58:41,423 epoch 5 - iter 445/894 - loss 0.04526026 - time (sec): 28.28 - samples/sec: 1507.73 - lr: 0.000031 - momentum: 0.000000
2023-10-23 20:58:46,970 epoch 5 - iter 534/894 - loss 0.04516609 - time (sec): 33.83 - samples/sec: 1503.75 - lr: 0.000030 - momentum: 0.000000
2023-10-23 20:58:52,901 epoch 5 - iter 623/894 - loss 0.04404538 - time (sec): 39.76 - samples/sec: 1518.02 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:58:58,571 epoch 5 - iter 712/894 - loss 0.04347653 - time (sec): 45.43 - samples/sec: 1519.94 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:59:04,172 epoch 5 - iter 801/894 - loss 0.04427952 - time (sec): 51.03 - samples/sec: 1526.89 - lr: 0.000028 - momentum: 0.000000
2023-10-23 20:59:09,706 epoch 5 - iter 890/894 - loss 0.04395567 - time (sec): 56.56 - samples/sec: 1523.08 - lr: 0.000028 - momentum: 0.000000
2023-10-23 20:59:09,959 ----------------------------------------------------------------------------------------------------
2023-10-23 20:59:09,959 EPOCH 5 done: loss 0.0440 - lr: 0.000028
2023-10-23 20:59:16,476 DEV : loss 0.2578391432762146 - f1-score (micro avg) 0.7454
2023-10-23 20:59:16,496 ----------------------------------------------------------------------------------------------------
2023-10-23 20:59:22,046 epoch 6 - iter 89/894 - loss 0.02996543 - time (sec): 5.55 - samples/sec: 1443.43 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:59:27,682 epoch 6 - iter 178/894 - loss 0.02468163 - time (sec): 11.19 - samples/sec: 1441.23 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:59:33,444 epoch 6 - iter 267/894 - loss 0.02950738 - time (sec): 16.95 - samples/sec: 1480.05 - lr: 0.000026 - momentum: 0.000000
2023-10-23 20:59:39,138 epoch 6 - iter 356/894 - loss 0.02848739 - time (sec): 22.64 - samples/sec: 1520.93 - lr: 0.000026 - momentum: 0.000000
2023-10-23 20:59:44,762 epoch 6 - iter 445/894 - loss 0.02765367 - time (sec): 28.27 - samples/sec: 1524.43 - lr: 0.000025 - momentum: 0.000000
2023-10-23 20:59:50,448 epoch 6 - iter 534/894 - loss 0.02635219 - time (sec): 33.95 - samples/sec: 1514.47 - lr: 0.000024 - momentum: 0.000000
2023-10-23 20:59:55,967 epoch 6 - iter 623/894 - loss 0.02640742 - time (sec): 39.47 - samples/sec: 1514.26 - lr: 0.000024 - momentum: 0.000000
2023-10-23 21:00:01,649 epoch 6 - iter 712/894 - loss 0.02966489 - time (sec): 45.15 - samples/sec: 1521.97 - lr: 0.000023 - momentum: 0.000000
2023-10-23 21:00:07,488 epoch 6 - iter 801/894 - loss 0.02907114 - time (sec): 50.99 - samples/sec: 1516.15 - lr: 0.000023 - momentum: 0.000000
2023-10-23 21:00:13,087 epoch 6 - iter 890/894 - loss 0.02979063 - time (sec): 56.59 - samples/sec: 1523.94 - lr: 0.000022 - momentum: 0.000000
2023-10-23 21:00:13,331 ----------------------------------------------------------------------------------------------------
2023-10-23 21:00:13,331 EPOCH 6 done: loss 0.0298 - lr: 0.000022
2023-10-23 21:00:19,831 DEV : loss 0.25447967648506165 - f1-score (micro avg) 0.7468
2023-10-23 21:00:19,852 saving best model
2023-10-23 21:00:20,442 ----------------------------------------------------------------------------------------------------
2023-10-23 21:00:26,004 epoch 7 - iter 89/894 - loss 0.02097532 - time (sec): 5.56 - samples/sec: 1524.38 - lr: 0.000022 - momentum: 0.000000
2023-10-23 21:00:31,783 epoch 7 - iter 178/894 - loss 0.02109630 - time (sec): 11.34 - samples/sec: 1519.21 - lr: 0.000021 - momentum: 0.000000
2023-10-23 21:00:37,788 epoch 7 - iter 267/894 - loss 0.01994670 - time (sec): 17.35 - samples/sec: 1536.45 - lr: 0.000021 - momentum: 0.000000
2023-10-23 21:00:43,398 epoch 7 - iter 356/894 - loss 0.01812653 - time (sec): 22.96 - samples/sec: 1528.07 - lr: 0.000020 - momentum: 0.000000
2023-10-23 21:00:49,036 epoch 7 - iter 445/894 - loss 0.01965635 - time (sec): 28.59 - samples/sec: 1522.92 - lr: 0.000019 - momentum: 0.000000
2023-10-23 21:00:54,701 epoch 7 - iter 534/894 - loss 0.01998244 - time (sec): 34.26 - samples/sec: 1526.54 - lr: 0.000019 - momentum: 0.000000
2023-10-23 21:01:00,374 epoch 7 - iter 623/894 - loss 0.02031292 - time (sec): 39.93 - samples/sec: 1523.49 - lr: 0.000018 - momentum: 0.000000
2023-10-23 21:01:05,941 epoch 7 - iter 712/894 - loss 0.01882461 - time (sec): 45.50 - samples/sec: 1520.45 - lr: 0.000018 - momentum: 0.000000
2023-10-23 21:01:11,509 epoch 7 - iter 801/894 - loss 0.02001490 - time (sec): 51.07 - samples/sec: 1523.08 - lr: 0.000017 - momentum: 0.000000
2023-10-23 21:01:17,109 epoch 7 - iter 890/894 - loss 0.01945576 - time (sec): 56.67 - samples/sec: 1521.90 - lr: 0.000017 - momentum: 0.000000
2023-10-23 21:01:17,349 ----------------------------------------------------------------------------------------------------
2023-10-23 21:01:17,350 EPOCH 7 done: loss 0.0197 - lr: 0.000017
2023-10-23 21:01:23,819 DEV : loss 0.27903473377227783 - f1-score (micro avg) 0.744
2023-10-23 21:01:23,840 ----------------------------------------------------------------------------------------------------
2023-10-23 21:01:29,442 epoch 8 - iter 89/894 - loss 0.01477492 - time (sec): 5.60 - samples/sec: 1514.72 - lr: 0.000016 - momentum: 0.000000
2023-10-23 21:01:35,014 epoch 8 - iter 178/894 - loss 0.01911420 - time (sec): 11.17 - samples/sec: 1524.00 - lr: 0.000016 - momentum: 0.000000
2023-10-23 21:01:40,581 epoch 8 - iter 267/894 - loss 0.01561106 - time (sec): 16.74 - samples/sec: 1490.26 - lr: 0.000015 - momentum: 0.000000
2023-10-23 21:01:46,683 epoch 8 - iter 356/894 - loss 0.01289383 - time (sec): 22.84 - samples/sec: 1535.64 - lr: 0.000014 - momentum: 0.000000
2023-10-23 21:01:52,319 epoch 8 - iter 445/894 - loss 0.01328556 - time (sec): 28.48 - samples/sec: 1539.25 - lr: 0.000014 - momentum: 0.000000
2023-10-23 21:01:57,972 epoch 8 - iter 534/894 - loss 0.01205554 - time (sec): 34.13 - samples/sec: 1518.84 - lr: 0.000013 - momentum: 0.000000
2023-10-23 21:02:03,588 epoch 8 - iter 623/894 - loss 0.01126584 - time (sec): 39.75 - samples/sec: 1517.09 - lr: 0.000013 - momentum: 0.000000
2023-10-23 21:02:09,239 epoch 8 - iter 712/894 - loss 0.01248566 - time (sec): 45.40 - samples/sec: 1515.56 - lr: 0.000012 - momentum: 0.000000
2023-10-23 21:02:15,129 epoch 8 - iter 801/894 - loss 0.01195343 - time (sec): 51.29 - samples/sec: 1520.84 - lr: 0.000012 - momentum: 0.000000
2023-10-23 21:02:20,671 epoch 8 - iter 890/894 - loss 0.01193844 - time (sec): 56.83 - samples/sec: 1517.08 - lr: 0.000011 - momentum: 0.000000
2023-10-23 21:02:20,912 ----------------------------------------------------------------------------------------------------
2023-10-23 21:02:20,912 EPOCH 8 done: loss 0.0123 - lr: 0.000011
2023-10-23 21:02:27,403 DEV : loss 0.31139957904815674 - f1-score (micro avg) 0.7589
2023-10-23 21:02:27,424 saving best model
2023-10-23 21:02:28,018 ----------------------------------------------------------------------------------------------------
2023-10-23 21:02:33,491 epoch 9 - iter 89/894 - loss 0.00404374 - time (sec): 5.47 - samples/sec: 1479.01 - lr: 0.000011 - momentum: 0.000000
2023-10-23 21:02:39,146 epoch 9 - iter 178/894 - loss 0.00808564 - time (sec): 11.13 - samples/sec: 1479.90 - lr: 0.000010 - momentum: 0.000000
2023-10-23 21:02:44,983 epoch 9 - iter 267/894 - loss 0.00904128 - time (sec): 16.96 - samples/sec: 1498.79 - lr: 0.000009 - momentum: 0.000000
2023-10-23 21:02:50,610 epoch 9 - iter 356/894 - loss 0.00830892 - time (sec): 22.59 - samples/sec: 1509.23 - lr: 0.000009 - momentum: 0.000000
2023-10-23 21:02:56,263 epoch 9 - iter 445/894 - loss 0.00809236 - time (sec): 28.24 - samples/sec: 1519.59 - lr: 0.000008 - momentum: 0.000000
2023-10-23 21:03:02,035 epoch 9 - iter 534/894 - loss 0.00805290 - time (sec): 34.02 - samples/sec: 1522.78 - lr: 0.000008 - momentum: 0.000000
2023-10-23 21:03:07,916 epoch 9 - iter 623/894 - loss 0.00765601 - time (sec): 39.90 - samples/sec: 1532.31 - lr: 0.000007 - momentum: 0.000000
2023-10-23 21:03:13,497 epoch 9 - iter 712/894 - loss 0.00744532 - time (sec): 45.48 - samples/sec: 1524.91 - lr: 0.000007 - momentum: 0.000000
2023-10-23 21:03:18,999 epoch 9 - iter 801/894 - loss 0.00757061 - time (sec): 50.98 - samples/sec: 1522.42 - lr: 0.000006 - momentum: 0.000000
2023-10-23 21:03:24,708 epoch 9 - iter 890/894 - loss 0.00716724 - time (sec): 56.69 - samples/sec: 1521.97 - lr: 0.000006 - momentum: 0.000000
2023-10-23 21:03:24,937 ----------------------------------------------------------------------------------------------------
2023-10-23 21:03:24,938 EPOCH 9 done: loss 0.0071 - lr: 0.000006
2023-10-23 21:03:31,158 DEV : loss 0.2947549819946289 - f1-score (micro avg) 0.772
2023-10-23 21:03:31,178 saving best model
2023-10-23 21:03:31,770 ----------------------------------------------------------------------------------------------------
2023-10-23 21:03:37,625 epoch 10 - iter 89/894 - loss 0.00103907 - time (sec): 5.85 - samples/sec: 1472.22 - lr: 0.000005 - momentum: 0.000000
2023-10-23 21:03:43,366 epoch 10 - iter 178/894 - loss 0.00117928 - time (sec): 11.60 - samples/sec: 1525.80 - lr: 0.000004 - momentum: 0.000000
2023-10-23 21:03:48,985 epoch 10 - iter 267/894 - loss 0.00126812 - time (sec): 17.21 - samples/sec: 1511.46 - lr: 0.000004 - momentum: 0.000000
2023-10-23 21:03:54,482 epoch 10 - iter 356/894 - loss 0.00168602 - time (sec): 22.71 - samples/sec: 1511.69 - lr: 0.000003 - momentum: 0.000000
2023-10-23 21:04:00,365 epoch 10 - iter 445/894 - loss 0.00234417 - time (sec): 28.59 - samples/sec: 1530.30 - lr: 0.000003 - momentum: 0.000000
2023-10-23 21:04:05,924 epoch 10 - iter 534/894 - loss 0.00256521 - time (sec): 34.15 - samples/sec: 1513.63 - lr: 0.000002 - momentum: 0.000000
2023-10-23 21:04:11,476 epoch 10 - iter 623/894 - loss 0.00250174 - time (sec): 39.71 - samples/sec: 1518.09 - lr: 0.000002 - momentum: 0.000000
2023-10-23 21:04:17,189 epoch 10 - iter 712/894 - loss 0.00262420 - time (sec): 45.42 - samples/sec: 1515.93 - lr: 0.000001 - momentum: 0.000000
2023-10-23 21:04:23,075 epoch 10 - iter 801/894 - loss 0.00338086 - time (sec): 51.30 - samples/sec: 1513.04 - lr: 0.000001 - momentum: 0.000000
2023-10-23 21:04:28,776 epoch 10 - iter 890/894 - loss 0.00333224 - time (sec): 57.01 - samples/sec: 1511.84 - lr: 0.000000 - momentum: 0.000000
2023-10-23 21:04:29,013 ----------------------------------------------------------------------------------------------------
2023-10-23 21:04:29,013 EPOCH 10 done: loss 0.0033 - lr: 0.000000
2023-10-23 21:04:35,263 DEV : loss 0.3027936816215515 - f1-score (micro avg) 0.7733
2023-10-23 21:04:35,284 saving best model
2023-10-23 21:04:36,353 ----------------------------------------------------------------------------------------------------
2023-10-23 21:04:36,354 Loading model from best epoch ...
2023-10-23 21:04:38,053 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time
2023-10-23 21:04:42,902 Results:
- F-score (micro) 0.7427
- F-score (macro) 0.6605
- Accuracy 0.6064

By class:
              precision    recall  f1-score   support

         loc     0.7984    0.8507    0.8237       596
        pers     0.6863    0.7688    0.7252       333
         org     0.5385    0.4773    0.5060       132
        prod     0.5818    0.4848    0.5289        66
        time     0.6852    0.7551    0.7184        49

   micro avg     0.7253    0.7611    0.7427      1176
   macro avg     0.6580    0.6673    0.6605      1176
weighted avg     0.7206    0.7611    0.7392      1176

2023-10-23 21:04:42,902 ----------------------------------------------------------------------------------------------------
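The final test scores above come from best-model.pt, re-loaded at the end of training. A minimal sketch of using that checkpoint for inference (the path is the training base path from the log; the example sentence is invented, and "ner" as the label type is assumed from the tag set listed above):

from flair.data import Sentence
from flair.models import SequenceTagger

# Path assembled from the "Model training base path" reported in the log.
tagger = SequenceTagger.load(
    "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-"
    "bs4-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-2/best-model.pt"
)

# Tag a sentence and read out the decoded entity spans; the BIOES tags
# (S-loc, B-pers, ...) are merged into spans of type loc, pers, org, prod, time.
sentence = Sentence("Der Kongress tagte im Oktober 1899 in Basel .")
tagger.predict(sentence)
for span in sentence.get_spans("ner"):
    print(span.text, span.tag, round(span.score, 4))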