2023-10-23 20:26:31,123 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,124 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=21, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-23 20:26:31,124 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,124 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences
 - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator
2023-10-23 20:26:31,124 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,124 Train: 3575 sentences
2023-10-23 20:26:31,124 (train_with_dev=False, train_with_test=False)
2023-10-23 20:26:31,124 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,124 Training Params:
2023-10-23 20:26:31,124  - learning_rate: "3e-05"
2023-10-23 20:26:31,124  - mini_batch_size: "8"
2023-10-23 20:26:31,124  - max_epochs: "10"
2023-10-23 20:26:31,124  - shuffle: "True"
2023-10-23 20:26:31,124 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 Plugins:
2023-10-23 20:26:31,125  - TensorboardLogger
2023-10-23 20:26:31,125  - LinearScheduler | warmup_fraction: '0.1'
2023-10-23 20:26:31,125 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 Final evaluation on model from best epoch (best-model.pt)
2023-10-23 20:26:31,125  - metric: "('micro avg', 'f1-score')"
2023-10-23 20:26:31,125 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 Computation:
2023-10-23 20:26:31,125  - compute on device: cuda:0
2023-10-23 20:26:31,125  - embedding storage: none
2023-10-23 20:26:31,125 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1"
2023-10-23 20:26:31,125 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 ----------------------------------------------------------------------------------------------------
2023-10-23 20:26:31,125 Logging anything other than scalars to TensorBoard is currently not supported.
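For context, the header above corresponds to a standard Flair fine-tuning run. The sketch below shows how such a run could be set up; it is an illustration, not the actual hmbench script. The checkpoint name (dbmdz/bert-base-historic-multilingual-64k-td-cased), first-subtoken pooling, last-layer-only embeddings, and the disabled CRF are all inferred from the base path logged above, and the NER_HIPE_2022 loader arguments are assumptions.

    # Sketch: reproducing the configuration logged above with Flair (assumptions noted inline).
    from flair.datasets import NER_HIPE_2022
    from flair.embeddings import TransformerWordEmbeddings
    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer

    # hipe2020/de split: 3575 train / 1235 dev / 1266 test sentences, as logged above
    corpus = NER_HIPE_2022(dataset_name="hipe2020", language="de")
    label_dict = corpus.make_label_dictionary(label_type="ner")

    # "poolingfirst-layers-1" in the base path: first-subtoken pooling, last layer only
    embeddings = TransformerWordEmbeddings(
        model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
        layers="-1",
        subtoken_pooling="first",
        fine_tune=True,
    )

    # "crfFalse": a plain linear head over the 21 BIOES tags, no CRF, no RNN
    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dict,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
    )

    # fine_tune() defaults to AdamW with a linear schedule and warmup, which
    # matches the LinearScheduler plugin and the constant zero momentum in the log
    ModelTrainer(tagger, corpus).fine_tune(
        "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-"
        "bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1",
        learning_rate=3e-5,
        mini_batch_size=8,
        max_epochs=10,
    )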
2023-10-23 20:26:35,069 epoch 1 - iter 44/447 - loss 3.40559275 - time (sec): 3.94 - samples/sec: 2157.26 - lr: 0.000003 - momentum: 0.000000
2023-10-23 20:26:38,774 epoch 1 - iter 88/447 - loss 2.27066127 - time (sec): 7.65 - samples/sec: 2138.15 - lr: 0.000006 - momentum: 0.000000
2023-10-23 20:26:42,689 epoch 1 - iter 132/447 - loss 1.64802661 - time (sec): 11.56 - samples/sec: 2165.17 - lr: 0.000009 - momentum: 0.000000
2023-10-23 20:26:46,722 epoch 1 - iter 176/447 - loss 1.32711622 - time (sec): 15.60 - samples/sec: 2129.40 - lr: 0.000012 - momentum: 0.000000
2023-10-23 20:26:50,516 epoch 1 - iter 220/447 - loss 1.13112757 - time (sec): 19.39 - samples/sec: 2148.76 - lr: 0.000015 - momentum: 0.000000
2023-10-23 20:26:54,391 epoch 1 - iter 264/447 - loss 0.98605246 - time (sec): 23.27 - samples/sec: 2138.84 - lr: 0.000018 - momentum: 0.000000
2023-10-23 20:26:58,431 epoch 1 - iter 308/447 - loss 0.87774702 - time (sec): 27.31 - samples/sec: 2133.43 - lr: 0.000021 - momentum: 0.000000
2023-10-23 20:27:02,860 epoch 1 - iter 352/447 - loss 0.79068129 - time (sec): 31.73 - samples/sec: 2140.14 - lr: 0.000024 - momentum: 0.000000
2023-10-23 20:27:06,854 epoch 1 - iter 396/447 - loss 0.72548100 - time (sec): 35.73 - samples/sec: 2149.02 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:27:10,849 epoch 1 - iter 440/447 - loss 0.67813581 - time (sec): 39.72 - samples/sec: 2149.35 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:27:11,419 ----------------------------------------------------------------------------------------------------
2023-10-23 20:27:11,420 EPOCH 1 done: loss 0.6706 - lr: 0.000029
2023-10-23 20:27:16,228 DEV : loss 0.1458185613155365 - f1-score (micro avg) 0.6581
2023-10-23 20:27:16,248 saving best model
2023-10-23 20:27:16,801 ----------------------------------------------------------------------------------------------------
2023-10-23 20:27:21,278 epoch 2 - iter 44/447 - loss 0.17898902 - time (sec): 4.48 - samples/sec: 2130.94 - lr: 0.000030 - momentum: 0.000000
2023-10-23 20:27:25,147 epoch 2 - iter 88/447 - loss 0.16047620 - time (sec): 8.34 - samples/sec: 2134.89 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:27:29,240 epoch 2 - iter 132/447 - loss 0.15610822 - time (sec): 12.44 - samples/sec: 2100.29 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:27:33,013 epoch 2 - iter 176/447 - loss 0.15494224 - time (sec): 16.21 - samples/sec: 2123.66 - lr: 0.000029 - momentum: 0.000000
2023-10-23 20:27:36,858 epoch 2 - iter 220/447 - loss 0.15138265 - time (sec): 20.06 - samples/sec: 2114.73 - lr: 0.000028 - momentum: 0.000000
2023-10-23 20:27:40,890 epoch 2 - iter 264/447 - loss 0.15125402 - time (sec): 24.09 - samples/sec: 2123.99 - lr: 0.000028 - momentum: 0.000000
2023-10-23 20:27:44,988 epoch 2 - iter 308/447 - loss 0.14586521 - time (sec): 28.19 - samples/sec: 2130.87 - lr: 0.000028 - momentum: 0.000000
2023-10-23 20:27:49,182 epoch 2 - iter 352/447 - loss 0.14365652 - time (sec): 32.38 - samples/sec: 2125.78 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:27:53,057 epoch 2 - iter 396/447 - loss 0.14023517 - time (sec): 36.25 - samples/sec: 2119.73 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:27:56,855 epoch 2 - iter 440/447 - loss 0.13550428 - time (sec): 40.05 - samples/sec: 2126.38 - lr: 0.000027 - momentum: 0.000000
2023-10-23 20:27:57,477 ----------------------------------------------------------------------------------------------------
2023-10-23 20:27:57,477 EPOCH 2 done: loss 0.1349 - lr: 0.000027
2023-10-23 20:28:03,938 DEV : loss 0.12103226780891418 - f1-score (micro avg) 0.7163
2023-10-23 20:28:03,958 saving best model
2023-10-23 20:28:04,777 ----------------------------------------------------------------------------------------------------
2023-10-23 20:28:08,610 epoch 3 - iter 44/447 - loss 0.07460696 - time (sec): 3.83 - samples/sec: 2099.40 - lr: 0.000026 - momentum: 0.000000
2023-10-23 20:28:12,723 epoch 3 - iter 88/447 - loss 0.06922936 - time (sec): 7.94 - samples/sec: 2150.34 - lr: 0.000026 - momentum: 0.000000
2023-10-23 20:28:16,744 epoch 3 - iter 132/447 - loss 0.06704794 - time (sec): 11.97 - samples/sec: 2105.48 - lr: 0.000026 - momentum: 0.000000
2023-10-23 20:28:20,633 epoch 3 - iter 176/447 - loss 0.07184773 - time (sec): 15.86 - samples/sec: 2121.83 - lr: 0.000025 - momentum: 0.000000
2023-10-23 20:28:24,327 epoch 3 - iter 220/447 - loss 0.07620919 - time (sec): 19.55 - samples/sec: 2112.93 - lr: 0.000025 - momentum: 0.000000
2023-10-23 20:28:28,243 epoch 3 - iter 264/447 - loss 0.07384269 - time (sec): 23.46 - samples/sec: 2133.46 - lr: 0.000025 - momentum: 0.000000
2023-10-23 20:28:32,208 epoch 3 - iter 308/447 - loss 0.07454931 - time (sec): 27.43 - samples/sec: 2134.92 - lr: 0.000024 - momentum: 0.000000
2023-10-23 20:28:36,018 epoch 3 - iter 352/447 - loss 0.07264295 - time (sec): 31.24 - samples/sec: 2132.00 - lr: 0.000024 - momentum: 0.000000
2023-10-23 20:28:40,250 epoch 3 - iter 396/447 - loss 0.07353915 - time (sec): 35.47 - samples/sec: 2126.24 - lr: 0.000024 - momentum: 0.000000
2023-10-23 20:28:44,114 epoch 3 - iter 440/447 - loss 0.07338628 - time (sec): 39.34 - samples/sec: 2139.02 - lr: 0.000023 - momentum: 0.000000
2023-10-23 20:28:45,093 ----------------------------------------------------------------------------------------------------
2023-10-23 20:28:45,093 EPOCH 3 done: loss 0.0736 - lr: 0.000023
2023-10-23 20:28:51,584 DEV : loss 0.1441224366426468 - f1-score (micro avg) 0.7494
2023-10-23 20:28:51,604 saving best model
2023-10-23 20:28:52,282 ----------------------------------------------------------------------------------------------------
2023-10-23 20:28:56,486 epoch 4 - iter 44/447 - loss 0.05528108 - time (sec): 4.20 - samples/sec: 2143.25 - lr: 0.000023 - momentum: 0.000000
2023-10-23 20:29:00,275 epoch 4 - iter 88/447 - loss 0.04748136 - time (sec): 7.99 - samples/sec: 2162.51 - lr: 0.000023 - momentum: 0.000000
2023-10-23 20:29:04,126 epoch 4 - iter 132/447 - loss 0.04748733 - time (sec): 11.84 - samples/sec: 2157.90 - lr: 0.000022 - momentum: 0.000000
2023-10-23 20:29:07,812 epoch 4 - iter 176/447 - loss 0.04536813 - time (sec): 15.53 - samples/sec: 2159.73 - lr: 0.000022 - momentum: 0.000000
2023-10-23 20:29:12,061 epoch 4 - iter 220/447 - loss 0.04421550 - time (sec): 19.78 - samples/sec: 2153.18 - lr: 0.000022 - momentum: 0.000000
2023-10-23 20:29:15,847 epoch 4 - iter 264/447 - loss 0.04719171 - time (sec): 23.56 - samples/sec: 2133.66 - lr: 0.000021 - momentum: 0.000000
2023-10-23 20:29:19,658 epoch 4 - iter 308/447 - loss 0.04624824 - time (sec): 27.37 - samples/sec: 2138.24 - lr: 0.000021 - momentum: 0.000000
2023-10-23 20:29:23,597 epoch 4 - iter 352/447 - loss 0.04553819 - time (sec): 31.31 - samples/sec: 2133.66 - lr: 0.000021 - momentum: 0.000000
2023-10-23 20:29:28,135 epoch 4 - iter 396/447 - loss 0.04771921 - time (sec): 35.85 - samples/sec: 2130.52 - lr: 0.000020 - momentum: 0.000000
2023-10-23 20:29:32,088 epoch 4 - iter 440/447 - loss 0.04663221 - time (sec): 39.80 - samples/sec: 2138.66 - lr: 0.000020 - momentum: 0.000000
2023-10-23 20:29:32,771 ----------------------------------------------------------------------------------------------------
2023-10-23 20:29:32,771 EPOCH 4 done: loss 0.0463 - lr: 0.000020
2023-10-23 20:29:39,240 DEV : loss 0.18340256810188293 - f1-score (micro avg) 0.7452
2023-10-23 20:29:39,261 ----------------------------------------------------------------------------------------------------
2023-10-23 20:29:42,916 epoch 5 - iter 44/447 - loss 0.02363584 - time (sec): 3.65 - samples/sec: 2091.42 - lr: 0.000020 - momentum: 0.000000
2023-10-23 20:29:46,763 epoch 5 - iter 88/447 - loss 0.02708204 - time (sec): 7.50 - samples/sec: 2105.34 - lr: 0.000019 - momentum: 0.000000
2023-10-23 20:29:51,070 epoch 5 - iter 132/447 - loss 0.03019743 - time (sec): 11.81 - samples/sec: 2099.11 - lr: 0.000019 - momentum: 0.000000
2023-10-23 20:29:54,880 epoch 5 - iter 176/447 - loss 0.02987305 - time (sec): 15.62 - samples/sec: 2119.68 - lr: 0.000019 - momentum: 0.000000
2023-10-23 20:29:58,600 epoch 5 - iter 220/447 - loss 0.03110977 - time (sec): 19.34 - samples/sec: 2130.38 - lr: 0.000018 - momentum: 0.000000
2023-10-23 20:30:03,050 epoch 5 - iter 264/447 - loss 0.03146956 - time (sec): 23.79 - samples/sec: 2134.65 - lr: 0.000018 - momentum: 0.000000
2023-10-23 20:30:06,749 epoch 5 - iter 308/447 - loss 0.03134889 - time (sec): 27.49 - samples/sec: 2146.21 - lr: 0.000018 - momentum: 0.000000
2023-10-23 20:30:10,791 epoch 5 - iter 352/447 - loss 0.03066796 - time (sec): 31.53 - samples/sec: 2157.24 - lr: 0.000017 - momentum: 0.000000
2023-10-23 20:30:14,570 epoch 5 - iter 396/447 - loss 0.02912833 - time (sec): 35.31 - samples/sec: 2147.67 - lr: 0.000017 - momentum: 0.000000
2023-10-23 20:30:19,140 epoch 5 - iter 440/447 - loss 0.02919446 - time (sec): 39.88 - samples/sec: 2137.98 - lr: 0.000017 - momentum: 0.000000
2023-10-23 20:30:19,753 ----------------------------------------------------------------------------------------------------
2023-10-23 20:30:19,753 EPOCH 5 done: loss 0.0298 - lr: 0.000017
2023-10-23 20:30:26,235 DEV : loss 0.18645448982715607 - f1-score (micro avg) 0.7792
2023-10-23 20:30:26,255 saving best model
2023-10-23 20:30:27,018 ----------------------------------------------------------------------------------------------------
2023-10-23 20:30:30,775 epoch 6 - iter 44/447 - loss 0.01761254 - time (sec): 3.76 - samples/sec: 2202.78 - lr: 0.000016 - momentum: 0.000000
2023-10-23 20:30:34,641 epoch 6 - iter 88/447 - loss 0.01949947 - time (sec): 7.62 - samples/sec: 2184.21 - lr: 0.000016 - momentum: 0.000000
2023-10-23 20:30:39,203 epoch 6 - iter 132/447 - loss 0.01802132 - time (sec): 12.18 - samples/sec: 2166.88 - lr: 0.000016 - momentum: 0.000000
2023-10-23 20:30:43,540 epoch 6 - iter 176/447 - loss 0.01924059 - time (sec): 16.52 - samples/sec: 2136.18 - lr: 0.000015 - momentum: 0.000000
2023-10-23 20:30:47,667 epoch 6 - iter 220/447 - loss 0.01755935 - time (sec): 20.65 - samples/sec: 2133.50 - lr: 0.000015 - momentum: 0.000000
2023-10-23 20:30:51,710 epoch 6 - iter 264/447 - loss 0.01763127 - time (sec): 24.69 - samples/sec: 2136.95 - lr: 0.000015 - momentum: 0.000000
2023-10-23 20:30:55,340 epoch 6 - iter 308/447 - loss 0.01793616 - time (sec): 28.32 - samples/sec: 2125.68 - lr: 0.000014 - momentum: 0.000000
2023-10-23 20:30:59,084 epoch 6 - iter 352/447 - loss 0.02009423 - time (sec): 32.06 - samples/sec: 2133.18 - lr: 0.000014 - momentum: 0.000000
2023-10-23 20:31:03,014 epoch 6 - iter 396/447 - loss 0.02001702 - time (sec): 35.99 - samples/sec: 2130.93 - lr: 0.000014 - momentum: 0.000000
2023-10-23 20:31:06,935 epoch 6 - iter 440/447 - loss 0.01927894 - time (sec): 39.92 - samples/sec: 2138.15 - lr: 0.000013 - momentum: 0.000000
2023-10-23 20:31:07,561 ----------------------------------------------------------------------------------------------------
2023-10-23 20:31:07,561 EPOCH 6 done: loss 0.0193 - lr: 0.000013
2023-10-23 20:31:14,030 DEV : loss 0.20568153262138367 - f1-score (micro avg) 0.773
2023-10-23 20:31:14,050 ----------------------------------------------------------------------------------------------------
2023-10-23 20:31:18,387 epoch 7 - iter 44/447 - loss 0.01648589 - time (sec): 4.34 - samples/sec: 2168.11 - lr: 0.000013 - momentum: 0.000000
2023-10-23 20:31:22,525 epoch 7 - iter 88/447 - loss 0.01190539 - time (sec): 8.47 - samples/sec: 2107.89 - lr: 0.000013 - momentum: 0.000000
2023-10-23 20:31:26,739 epoch 7 - iter 132/447 - loss 0.01255698 - time (sec): 12.69 - samples/sec: 2130.35 - lr: 0.000012 - momentum: 0.000000
2023-10-23 20:31:30,553 epoch 7 - iter 176/447 - loss 0.01161302 - time (sec): 16.50 - samples/sec: 2123.07 - lr: 0.000012 - momentum: 0.000000
2023-10-23 20:31:34,386 epoch 7 - iter 220/447 - loss 0.01153263 - time (sec): 20.34 - samples/sec: 2113.34 - lr: 0.000012 - momentum: 0.000000
2023-10-23 20:31:38,216 epoch 7 - iter 264/447 - loss 0.01117656 - time (sec): 24.16 - samples/sec: 2124.89 - lr: 0.000011 - momentum: 0.000000
2023-10-23 20:31:42,456 epoch 7 - iter 308/447 - loss 0.01123172 - time (sec): 28.41 - samples/sec: 2127.68 - lr: 0.000011 - momentum: 0.000000
2023-10-23 20:31:46,184 epoch 7 - iter 352/447 - loss 0.01157591 - time (sec): 32.13 - samples/sec: 2146.42 - lr: 0.000011 - momentum: 0.000000
2023-10-23 20:31:50,167 epoch 7 - iter 396/447 - loss 0.01235896 - time (sec): 36.12 - samples/sec: 2129.42 - lr: 0.000010 - momentum: 0.000000
2023-10-23 20:31:54,022 epoch 7 - iter 440/447 - loss 0.01199197 - time (sec): 39.97 - samples/sec: 2140.09 - lr: 0.000010 - momentum: 0.000000
2023-10-23 20:31:54,542 ----------------------------------------------------------------------------------------------------
2023-10-23 20:31:54,543 EPOCH 7 done: loss 0.0119 - lr: 0.000010
2023-10-23 20:32:01,006 DEV : loss 0.2467608004808426 - f1-score (micro avg) 0.7781
2023-10-23 20:32:01,026 ----------------------------------------------------------------------------------------------------
2023-10-23 20:32:04,886 epoch 8 - iter 44/447 - loss 0.00812035 - time (sec): 3.86 - samples/sec: 2155.78 - lr: 0.000010 - momentum: 0.000000
2023-10-23 20:32:08,714 epoch 8 - iter 88/447 - loss 0.00675385 - time (sec): 7.69 - samples/sec: 2183.54 - lr: 0.000009 - momentum: 0.000000
2023-10-23 20:32:13,296 epoch 8 - iter 132/447 - loss 0.00807939 - time (sec): 12.27 - samples/sec: 2125.04 - lr: 0.000009 - momentum: 0.000000
2023-10-23 20:32:17,004 epoch 8 - iter 176/447 - loss 0.00904526 - time (sec): 15.98 - samples/sec: 2150.50 - lr: 0.000009 - momentum: 0.000000
2023-10-23 20:32:21,106 epoch 8 - iter 220/447 - loss 0.00833139 - time (sec): 20.08 - samples/sec: 2144.12 - lr: 0.000008 - momentum: 0.000000
2023-10-23 20:32:24,757 epoch 8 - iter 264/447 - loss 0.00759400 - time (sec): 23.73 - samples/sec: 2134.79 - lr: 0.000008 - momentum: 0.000000
2023-10-23 20:32:28,731 epoch 8 - iter 308/447 - loss 0.00798396 - time (sec): 27.70 - samples/sec: 2129.98 - lr: 0.000008 - momentum: 0.000000
2023-10-23 20:32:32,682 epoch 8 - iter 352/447 - loss 0.00793679 - time (sec): 31.65 - samples/sec: 2137.04 - lr: 0.000007 - momentum: 0.000000
2023-10-23 20:32:36,651 epoch 8 - iter 396/447 - loss 0.00768071 - time (sec): 35.62 - samples/sec: 2138.79 - lr: 0.000007 - momentum: 0.000000
2023-10-23 20:32:40,934 epoch 8 - iter 440/447 - loss 0.00822976 - time (sec): 39.91 - samples/sec: 2135.70 - lr: 0.000007 - momentum: 0.000000
2023-10-23 20:32:41,549 ----------------------------------------------------------------------------------------------------
2023-10-23 20:32:41,549 EPOCH 8 done: loss 0.0082 - lr: 0.000007
2023-10-23 20:32:48,045 DEV : loss 0.2412562370300293 - f1-score (micro avg) 0.7825
2023-10-23 20:32:48,065 saving best model
2023-10-23 20:32:48,761 ----------------------------------------------------------------------------------------------------
2023-10-23 20:32:52,934 epoch 9 - iter 44/447 - loss 0.00453444 - time (sec): 4.17 - samples/sec: 2140.52 - lr: 0.000006 - momentum: 0.000000
2023-10-23 20:32:57,201 epoch 9 - iter 88/447 - loss 0.00439378 - time (sec): 8.44 - samples/sec: 2135.90 - lr: 0.000006 - momentum: 0.000000
2023-10-23 20:33:01,259 epoch 9 - iter 132/447 - loss 0.00408738 - time (sec): 12.50 - samples/sec: 2109.13 - lr: 0.000006 - momentum: 0.000000
2023-10-23 20:33:04,933 epoch 9 - iter 176/447 - loss 0.00432207 - time (sec): 16.17 - samples/sec: 2090.21 - lr: 0.000005 - momentum: 0.000000
2023-10-23 20:33:08,650 epoch 9 - iter 220/447 - loss 0.00390841 - time (sec): 19.89 - samples/sec: 2097.07 - lr: 0.000005 - momentum: 0.000000
2023-10-23 20:33:12,736 epoch 9 - iter 264/447 - loss 0.00480594 - time (sec): 23.97 - samples/sec: 2102.34 - lr: 0.000005 - momentum: 0.000000
2023-10-23 20:33:16,421 epoch 9 - iter 308/447 - loss 0.00437718 - time (sec): 27.66 - samples/sec: 2117.89 - lr: 0.000004 - momentum: 0.000000
2023-10-23 20:33:21,027 epoch 9 - iter 352/447 - loss 0.00480072 - time (sec): 32.27 - samples/sec: 2146.99 - lr: 0.000004 - momentum: 0.000000
2023-10-23 20:33:24,776 epoch 9 - iter 396/447 - loss 0.00441195 - time (sec): 36.01 - samples/sec: 2152.58 - lr: 0.000004 - momentum: 0.000000
2023-10-23 20:33:28,519 epoch 9 - iter 440/447 - loss 0.00446100 - time (sec): 39.76 - samples/sec: 2147.70 - lr: 0.000003 - momentum: 0.000000
2023-10-23 20:33:29,100 ----------------------------------------------------------------------------------------------------
2023-10-23 20:33:29,101 EPOCH 9 done: loss 0.0044 - lr: 0.000003
2023-10-23 20:33:35,299 DEV : loss 0.2651752233505249 - f1-score (micro avg) 0.7869
2023-10-23 20:33:35,319 saving best model
2023-10-23 20:33:36,292 ----------------------------------------------------------------------------------------------------
2023-10-23 20:33:40,432 epoch 10 - iter 44/447 - loss 0.00222507 - time (sec): 4.14 - samples/sec: 2060.04 - lr: 0.000003 - momentum: 0.000000
2023-10-23 20:33:44,280 epoch 10 - iter 88/447 - loss 0.00222432 - time (sec): 7.99 - samples/sec: 2140.99 - lr: 0.000003 - momentum: 0.000000
2023-10-23 20:33:48,091 epoch 10 - iter 132/447 - loss 0.00195111 - time (sec): 11.80 - samples/sec: 2135.65 - lr: 0.000002 - momentum: 0.000000
2023-10-23 20:33:51,921 epoch 10 - iter 176/447 - loss 0.00167208 - time (sec): 15.63 - samples/sec: 2132.03 - lr: 0.000002 - momentum: 0.000000
2023-10-23 20:33:55,720 epoch 10 - iter 220/447 - loss 0.00201678 - time (sec): 19.43 - samples/sec: 2129.34 - lr: 0.000002 - momentum: 0.000000
2023-10-23 20:33:59,624 epoch 10 - iter 264/447 - loss 0.00281450 - time (sec): 23.33 - samples/sec: 2129.36 - lr: 0.000001 - momentum: 0.000000
2023-10-23 20:34:03,473 epoch 10 - iter 308/447 - loss 0.00267761 - time (sec): 27.18 - samples/sec: 2122.63 - lr: 0.000001 - momentum: 0.000000
2023-10-23 20:34:07,179 epoch 10 - iter 352/447 - loss 0.00267766 - time (sec): 30.89 - samples/sec: 2136.70 - lr: 0.000001 - momentum: 0.000000
2023-10-23 20:34:11,837 epoch 10 - iter 396/447 - loss 0.00272475 - time (sec): 35.54 - samples/sec: 2151.82 - lr: 0.000000 - momentum: 0.000000
2023-10-23 20:34:15,662 epoch 10 - iter 440/447 - loss 0.00324245 - time (sec): 39.37 - samples/sec: 2146.57 - lr: 0.000000 - momentum: 0.000000
2023-10-23 20:34:16,577 ----------------------------------------------------------------------------------------------------
2023-10-23 20:34:16,577 EPOCH 10 done: loss 0.0032 - lr: 0.000000
2023-10-23 20:34:22,820 DEV : loss 0.2557121813297272 - f1-score (micro avg) 0.7858
2023-10-23 20:34:23,391 ----------------------------------------------------------------------------------------------------
2023-10-23 20:34:23,392 Loading model from best epoch ...
2023-10-23 20:34:25,437 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time
2023-10-23 20:34:29,973 Results:
- F-score (micro) 0.747
- F-score (macro) 0.6687
- Accuracy 0.6144

By class:
              precision    recall  f1-score   support

         loc     0.8336    0.8406    0.8371       596
        pers     0.6838    0.7598    0.7198       333
         org     0.5126    0.4621    0.4861       132
        prod     0.6271    0.5606    0.5920        66
        time     0.7234    0.6939    0.7083        49

   micro avg     0.7408    0.7534    0.7470      1176
   macro avg     0.6761    0.6634    0.6687      1176
weighted avg     0.7390    0.7534    0.7453      1176
2023-10-23 20:34:29,973 ----------------------------------------------------------------------------------------------------
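A note on the summary rows in the table above: the macro average is the unweighted mean of the five per-class F1 scores, (0.8371 + 0.7198 + 0.4861 + 0.5920 + 0.7083) / 5 ≈ 0.6687, the weighted average weights each class by its support (596 + 333 + 132 + 66 + 49 = 1176 test entities), giving ≈ 0.7453, and the micro average pools true/false positives and false negatives across all classes before computing F1, which is why the frequent loc class lifts it to 0.7470. The selection metric ('micro avg', 'f1-score') declared in the header refers to this micro figure, computed on the dev set during training.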
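The tag dictionary printed before the results is a BIOES scheme over the five HIPE entity types (loc, pers, org, prod, time): S- marks a single-token entity, and B-/I-/E- mark the beginning, inside, and end of a multi-token span. Loading the saved best-model.pt for prediction would look roughly like the sketch below; the checkpoint path is the base path from the log header, the example sentence is made up, and the "ner" label type is an assumption carried over from the training sketch above.

    # Sketch: tagging a new sentence with the checkpoint saved at the
    # "saving best model" steps above (illustration only).
    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load(
        "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-"
        "bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1/best-model.pt"
    )

    # Hypothetical (modern) German sentence, purely for illustration.
    sentence = Sentence("Herr Meyer reiste von Zürich nach Basel .")
    tagger.predict(sentence)

    # BIOES tags are merged into labeled spans, e.g. "Zürich" -> loc.
    for entity in sentence.get_spans("ner"):
        print(entity)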