2023-10-23 23:00:28,649 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,650 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=21, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences
 - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Train:  3575 sentences
2023-10-23 23:00:28,651         (train_with_dev=False, train_with_test=False)
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Training Params:
2023-10-23 23:00:28,651  - learning_rate: "3e-05"
2023-10-23 23:00:28,651  - mini_batch_size: "8"
2023-10-23 23:00:28,651  - max_epochs: "10"
2023-10-23 23:00:28,651  - shuffle: "True"
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Plugins:
2023-10-23 23:00:28,651  - TensorboardLogger
2023-10-23 23:00:28,651  - LinearScheduler | warmup_fraction: '0.1'
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Final evaluation on model from best epoch (best-model.pt)
2023-10-23 23:00:28,651  - metric: "('micro avg', 'f1-score')"
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Computation:
2023-10-23 23:00:28,651  - compute on device: cuda:0
2023-10-23 23:00:28,651  - embedding storage: none
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5"
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 ----------------------------------------------------------------------------------------------------
2023-10-23 23:00:28,651 Logging anything other than scalars to TensorBoard is currently not supported.
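[Editor's note: the training script itself is not part of this log. The following is a minimal sketch of how a comparable Flair fine-tuning run could be set up, using the model, corpus and hyperparameters listed in the header above. The NER_HIPE_2022 loader arguments (including add_document_separator) and the SequenceTagger/ModelTrainer keyword choices are assumptions based on Flair's public API, not a copy of the original script; the TensorboardLogger plugin from the log is omitted.]

    # Sketch only - reproduces the logged configuration under the stated assumptions.
    from flair.datasets import NER_HIPE_2022
    from flair.embeddings import TransformerWordEmbeddings
    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer

    # hipe2020 German split of HIPE-2022 (see the MultiCorpus line above);
    # document separators enabled, matching the ".../with_doc_seperator" cache path.
    corpus = NER_HIPE_2022(dataset_name="hipe2020", language="de", add_document_separator=True)
    label_dict = corpus.make_label_dictionary(label_type="ner")

    # Historic multilingual BERT (64k vocab), last layer only, first-subtoken pooling,
    # as encoded in the base path (layers-1, poolingfirst).
    embeddings = TransformerWordEmbeddings(
        model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
        layers="-1",
        subtoken_pooling="first",
        fine_tune=True,
    )

    # Plain linear projection over the embeddings with CrossEntropyLoss, no CRF, no RNN
    # (crfFalse in the base path; the model dump above shows only LockedDropout + Linear).
    tagger = SequenceTagger(
        hidden_size=256,  # unused with use_rnn=False
        embeddings=embeddings,
        tag_dictionary=label_dict,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    trainer = ModelTrainer(tagger, corpus)
    # fine_tune uses a linear schedule with warmup by default (warmup_fraction 0.1 in the log):
    # the lr ramps up to 3e-5 over the first 10% of steps, then decays linearly to 0.
    trainer.fine_tune(
        "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5",
        learning_rate=3e-5,
        mini_batch_size=8,
        max_epochs=10,
    )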
2023-10-23 23:00:32,746 epoch 1 - iter 44/447 - loss 2.52602023 - time (sec): 4.09 - samples/sec: 2195.25 - lr: 0.000003 - momentum: 0.000000
2023-10-23 23:00:36,618 epoch 1 - iter 88/447 - loss 1.70182761 - time (sec): 7.97 - samples/sec: 2171.87 - lr: 0.000006 - momentum: 0.000000
2023-10-23 23:00:40,574 epoch 1 - iter 132/447 - loss 1.31473468 - time (sec): 11.92 - samples/sec: 2185.11 - lr: 0.000009 - momentum: 0.000000
2023-10-23 23:00:44,249 epoch 1 - iter 176/447 - loss 1.09744513 - time (sec): 15.60 - samples/sec: 2207.06 - lr: 0.000012 - momentum: 0.000000
2023-10-23 23:00:48,800 epoch 1 - iter 220/447 - loss 0.92998358 - time (sec): 20.15 - samples/sec: 2172.71 - lr: 0.000015 - momentum: 0.000000
2023-10-23 23:00:52,526 epoch 1 - iter 264/447 - loss 0.83386539 - time (sec): 23.87 - samples/sec: 2166.96 - lr: 0.000018 - momentum: 0.000000
2023-10-23 23:00:56,473 epoch 1 - iter 308/447 - loss 0.75611333 - time (sec): 27.82 - samples/sec: 2152.51 - lr: 0.000021 - momentum: 0.000000
2023-10-23 23:01:00,214 epoch 1 - iter 352/447 - loss 0.69739560 - time (sec): 31.56 - samples/sec: 2134.46 - lr: 0.000024 - momentum: 0.000000
2023-10-23 23:01:04,333 epoch 1 - iter 396/447 - loss 0.64562676 - time (sec): 35.68 - samples/sec: 2144.31 - lr: 0.000027 - momentum: 0.000000
2023-10-23 23:01:08,462 epoch 1 - iter 440/447 - loss 0.59937392 - time (sec): 39.81 - samples/sec: 2143.25 - lr: 0.000029 - momentum: 0.000000
2023-10-23 23:01:09,047 ----------------------------------------------------------------------------------------------------
2023-10-23 23:01:09,048 EPOCH 1 done: loss 0.5937 - lr: 0.000029
2023-10-23 23:01:13,861 DEV : loss 0.19523081183433533 - f1-score (micro avg) 0.602
2023-10-23 23:01:13,882 saving best model
2023-10-23 23:01:14,355 ----------------------------------------------------------------------------------------------------
2023-10-23 23:01:18,559 epoch 2 - iter 44/447 - loss 0.17169884 - time (sec): 4.20 - samples/sec: 2020.76 - lr: 0.000030 - momentum: 0.000000
2023-10-23 23:01:22,581 epoch 2 - iter 88/447 - loss 0.17647433 - time (sec): 8.22 - samples/sec: 2101.90 - lr: 0.000029 - momentum: 0.000000
2023-10-23 23:01:26,617 epoch 2 - iter 132/447 - loss 0.16565784 - time (sec): 12.26 - samples/sec: 2086.54 - lr: 0.000029 - momentum: 0.000000
2023-10-23 23:01:30,763 epoch 2 - iter 176/447 - loss 0.16430118 - time (sec): 16.41 - samples/sec: 2107.26 - lr: 0.000029 - momentum: 0.000000
2023-10-23 23:01:34,549 epoch 2 - iter 220/447 - loss 0.15912237 - time (sec): 20.19 - samples/sec: 2116.34 - lr: 0.000028 - momentum: 0.000000
2023-10-23 23:01:38,548 epoch 2 - iter 264/447 - loss 0.15555409 - time (sec): 24.19 - samples/sec: 2112.73 - lr: 0.000028 - momentum: 0.000000
2023-10-23 23:01:42,387 epoch 2 - iter 308/447 - loss 0.15167650 - time (sec): 28.03 - samples/sec: 2126.29 - lr: 0.000028 - momentum: 0.000000
2023-10-23 23:01:46,396 epoch 2 - iter 352/447 - loss 0.14567258 - time (sec): 32.04 - samples/sec: 2135.14 - lr: 0.000027 - momentum: 0.000000
2023-10-23 23:01:50,342 epoch 2 - iter 396/447 - loss 0.14264947 - time (sec): 35.99 - samples/sec: 2127.62 - lr: 0.000027 - momentum: 0.000000
2023-10-23 23:01:54,245 epoch 2 - iter 440/447 - loss 0.13821798 - time (sec): 39.89 - samples/sec: 2136.97 - lr: 0.000027 - momentum: 0.000000
2023-10-23 23:01:54,864 ----------------------------------------------------------------------------------------------------
2023-10-23 23:01:54,864 EPOCH 2 done: loss 0.1395 - lr: 0.000027
2023-10-23 23:02:01,346 DEV : loss 0.12295468896627426 - f1-score (micro avg) 0.7103
2023-10-23 23:02:01,366 saving best model
2023-10-23 23:02:01,953 ----------------------------------------------------------------------------------------------------
2023-10-23 23:02:06,309 epoch 3 - iter 44/447 - loss 0.09549111 - time (sec): 4.35 - samples/sec: 2195.36 - lr: 0.000026 - momentum: 0.000000
2023-10-23 23:02:10,254 epoch 3 - iter 88/447 - loss 0.07868123 - time (sec): 8.30 - samples/sec: 2242.07 - lr: 0.000026 - momentum: 0.000000
2023-10-23 23:02:14,277 epoch 3 - iter 132/447 - loss 0.08318832 - time (sec): 12.32 - samples/sec: 2209.09 - lr: 0.000026 - momentum: 0.000000
2023-10-23 23:02:17,984 epoch 3 - iter 176/447 - loss 0.07733080 - time (sec): 16.03 - samples/sec: 2198.38 - lr: 0.000025 - momentum: 0.000000
2023-10-23 23:02:22,352 epoch 3 - iter 220/447 - loss 0.07500939 - time (sec): 20.40 - samples/sec: 2181.80 - lr: 0.000025 - momentum: 0.000000
2023-10-23 23:02:26,654 epoch 3 - iter 264/447 - loss 0.07437472 - time (sec): 24.70 - samples/sec: 2171.11 - lr: 0.000025 - momentum: 0.000000
2023-10-23 23:02:30,577 epoch 3 - iter 308/447 - loss 0.07459318 - time (sec): 28.62 - samples/sec: 2157.01 - lr: 0.000024 - momentum: 0.000000
2023-10-23 23:02:34,258 epoch 3 - iter 352/447 - loss 0.07470733 - time (sec): 32.30 - samples/sec: 2137.56 - lr: 0.000024 - momentum: 0.000000
2023-10-23 23:02:38,173 epoch 3 - iter 396/447 - loss 0.07388045 - time (sec): 36.22 - samples/sec: 2148.46 - lr: 0.000024 - momentum: 0.000000
2023-10-23 23:02:41,938 epoch 3 - iter 440/447 - loss 0.07304330 - time (sec): 39.98 - samples/sec: 2136.97 - lr: 0.000023 - momentum: 0.000000
2023-10-23 23:02:42,478 ----------------------------------------------------------------------------------------------------
2023-10-23 23:02:42,479 EPOCH 3 done: loss 0.0728 - lr: 0.000023
2023-10-23 23:02:48,963 DEV : loss 0.15343354642391205 - f1-score (micro avg) 0.7489
2023-10-23 23:02:48,983 saving best model
2023-10-23 23:02:49,570 ----------------------------------------------------------------------------------------------------
2023-10-23 23:02:53,380 epoch 4 - iter 44/447 - loss 0.03471509 - time (sec): 3.81 - samples/sec: 2180.41 - lr: 0.000023 - momentum: 0.000000
2023-10-23 23:02:57,700 epoch 4 - iter 88/447 - loss 0.04095950 - time (sec): 8.13 - samples/sec: 2166.29 - lr: 0.000023 - momentum: 0.000000
2023-10-23 23:03:01,896 epoch 4 - iter 132/447 - loss 0.04313707 - time (sec): 12.33 - samples/sec: 2145.87 - lr: 0.000022 - momentum: 0.000000
2023-10-23 23:03:05,622 epoch 4 - iter 176/447 - loss 0.04217202 - time (sec): 16.05 - samples/sec: 2123.73 - lr: 0.000022 - momentum: 0.000000
2023-10-23 23:03:10,203 epoch 4 - iter 220/447 - loss 0.04425905 - time (sec): 20.63 - samples/sec: 2113.31 - lr: 0.000022 - momentum: 0.000000
2023-10-23 23:03:14,002 epoch 4 - iter 264/447 - loss 0.04327538 - time (sec): 24.43 - samples/sec: 2107.43 - lr: 0.000021 - momentum: 0.000000
2023-10-23 23:03:18,050 epoch 4 - iter 308/447 - loss 0.04558314 - time (sec): 28.48 - samples/sec: 2129.10 - lr: 0.000021 - momentum: 0.000000
2023-10-23 23:03:22,001 epoch 4 - iter 352/447 - loss 0.04373608 - time (sec): 32.43 - samples/sec: 2127.73 - lr: 0.000021 - momentum: 0.000000
2023-10-23 23:03:25,841 epoch 4 - iter 396/447 - loss 0.04253316 - time (sec): 36.27 - samples/sec: 2127.56 - lr: 0.000020 - momentum: 0.000000
2023-10-23 23:03:29,679 epoch 4 - iter 440/447 - loss 0.04399254 - time (sec): 40.11 - samples/sec: 2125.07 - lr: 0.000020 - momentum: 0.000000
2023-10-23 23:03:30,294 ----------------------------------------------------------------------------------------------------
2023-10-23 23:03:30,294 EPOCH 4 done: loss 0.0437 - lr: 0.000020
2023-10-23 23:03:36,776 DEV : loss 0.16440840065479279 - f1-score (micro avg) 0.7465
2023-10-23 23:03:36,796 ----------------------------------------------------------------------------------------------------
2023-10-23 23:03:40,718 epoch 5 - iter 44/447 - loss 0.03153993 - time (sec): 3.92 - samples/sec: 2183.07 - lr: 0.000020 - momentum: 0.000000
2023-10-23 23:03:45,031 epoch 5 - iter 88/447 - loss 0.03158561 - time (sec): 8.23 - samples/sec: 2117.98 - lr: 0.000019 - momentum: 0.000000
2023-10-23 23:03:48,851 epoch 5 - iter 132/447 - loss 0.03154475 - time (sec): 12.05 - samples/sec: 2134.16 - lr: 0.000019 - momentum: 0.000000
2023-10-23 23:03:52,601 epoch 5 - iter 176/447 - loss 0.02896412 - time (sec): 15.80 - samples/sec: 2134.56 - lr: 0.000019 - momentum: 0.000000
2023-10-23 23:03:56,655 epoch 5 - iter 220/447 - loss 0.02964248 - time (sec): 19.86 - samples/sec: 2123.94 - lr: 0.000018 - momentum: 0.000000
2023-10-23 23:04:00,713 epoch 5 - iter 264/447 - loss 0.02934625 - time (sec): 23.92 - samples/sec: 2115.14 - lr: 0.000018 - momentum: 0.000000
2023-10-23 23:04:04,358 epoch 5 - iter 308/447 - loss 0.02819574 - time (sec): 27.56 - samples/sec: 2124.76 - lr: 0.000018 - momentum: 0.000000
2023-10-23 23:04:08,764 epoch 5 - iter 352/447 - loss 0.03063596 - time (sec): 31.97 - samples/sec: 2126.05 - lr: 0.000017 - momentum: 0.000000
2023-10-23 23:04:12,599 epoch 5 - iter 396/447 - loss 0.02978320 - time (sec): 35.80 - samples/sec: 2137.51 - lr: 0.000017 - momentum: 0.000000
2023-10-23 23:04:16,554 epoch 5 - iter 440/447 - loss 0.02930160 - time (sec): 39.76 - samples/sec: 2141.01 - lr: 0.000017 - momentum: 0.000000
2023-10-23 23:04:17,157 ----------------------------------------------------------------------------------------------------
2023-10-23 23:04:17,157 EPOCH 5 done: loss 0.0292 - lr: 0.000017
2023-10-23 23:04:23,636 DEV : loss 0.18590261042118073 - f1-score (micro avg) 0.771
2023-10-23 23:04:23,656 saving best model
2023-10-23 23:04:24,245 ----------------------------------------------------------------------------------------------------
2023-10-23 23:04:28,359 epoch 6 - iter 44/447 - loss 0.02168377 - time (sec): 4.11 - samples/sec: 2005.67 - lr: 0.000016 - momentum: 0.000000
2023-10-23 23:04:32,535 epoch 6 - iter 88/447 - loss 0.02664472 - time (sec): 8.29 - samples/sec: 2046.54 - lr: 0.000016 - momentum: 0.000000
2023-10-23 23:04:37,101 epoch 6 - iter 132/447 - loss 0.02379201 - time (sec): 12.85 - samples/sec: 2069.01 - lr: 0.000016 - momentum: 0.000000
2023-10-23 23:04:40,881 epoch 6 - iter 176/447 - loss 0.02219099 - time (sec): 16.64 - samples/sec: 2088.60 - lr: 0.000015 - momentum: 0.000000
2023-10-23 23:04:44,787 epoch 6 - iter 220/447 - loss 0.02183849 - time (sec): 20.54 - samples/sec: 2102.02 - lr: 0.000015 - momentum: 0.000000
2023-10-23 23:04:48,657 epoch 6 - iter 264/447 - loss 0.02049396 - time (sec): 24.41 - samples/sec: 2109.29 - lr: 0.000015 - momentum: 0.000000
2023-10-23 23:04:52,413 epoch 6 - iter 308/447 - loss 0.02101667 - time (sec): 28.17 - samples/sec: 2106.25 - lr: 0.000014 - momentum: 0.000000
2023-10-23 23:04:56,108 epoch 6 - iter 352/447 - loss 0.01998352 - time (sec): 31.86 - samples/sec: 2107.46 - lr: 0.000014 - momentum: 0.000000
2023-10-23 23:05:00,274 epoch 6 - iter 396/447 - loss 0.02022647 - time (sec): 36.03 - samples/sec: 2113.02 - lr: 0.000014 - momentum: 0.000000
2023-10-23 23:05:04,211 epoch 6 - iter 440/447 - loss 0.02038921 - time (sec): 39.96 - samples/sec: 2131.51 - lr: 0.000013 - momentum: 0.000000
2023-10-23 23:05:04,873 ----------------------------------------------------------------------------------------------------
2023-10-23 23:05:04,874 EPOCH 6 done: loss 0.0206 - lr: 0.000013
2023-10-23 23:05:11,385 DEV : loss 0.2045108526945114 - f1-score (micro avg) 0.7576
2023-10-23 23:05:11,406 ----------------------------------------------------------------------------------------------------
2023-10-23 23:05:15,710 epoch 7 - iter 44/447 - loss 0.02004284 - time (sec): 4.30 - samples/sec: 2123.05 - lr: 0.000013 - momentum: 0.000000
2023-10-23 23:05:19,740 epoch 7 - iter 88/447 - loss 0.01541434 - time (sec): 8.33 - samples/sec: 2121.61 - lr: 0.000013 - momentum: 0.000000
2023-10-23 23:05:23,460 epoch 7 - iter 132/447 - loss 0.01587152 - time (sec): 12.05 - samples/sec: 2141.90 - lr: 0.000012 - momentum: 0.000000
2023-10-23 23:05:27,657 epoch 7 - iter 176/447 - loss 0.01550813 - time (sec): 16.25 - samples/sec: 2171.27 - lr: 0.000012 - momentum: 0.000000
2023-10-23 23:05:31,665 epoch 7 - iter 220/447 - loss 0.01526874 - time (sec): 20.26 - samples/sec: 2146.13 - lr: 0.000012 - momentum: 0.000000
2023-10-23 23:05:35,806 epoch 7 - iter 264/447 - loss 0.01545565 - time (sec): 24.40 - samples/sec: 2129.96 - lr: 0.000011 - momentum: 0.000000
2023-10-23 23:05:39,700 epoch 7 - iter 308/447 - loss 0.01438074 - time (sec): 28.29 - samples/sec: 2135.49 - lr: 0.000011 - momentum: 0.000000
2023-10-23 23:05:43,906 epoch 7 - iter 352/447 - loss 0.01391057 - time (sec): 32.50 - samples/sec: 2137.46 - lr: 0.000011 - momentum: 0.000000
2023-10-23 23:05:47,968 epoch 7 - iter 396/447 - loss 0.01352129 - time (sec): 36.56 - samples/sec: 2134.04 - lr: 0.000010 - momentum: 0.000000
2023-10-23 23:05:51,529 epoch 7 - iter 440/447 - loss 0.01375727 - time (sec): 40.12 - samples/sec: 2126.85 - lr: 0.000010 - momentum: 0.000000
2023-10-23 23:05:52,094 ----------------------------------------------------------------------------------------------------
2023-10-23 23:05:52,095 EPOCH 7 done: loss 0.0138 - lr: 0.000010
2023-10-23 23:05:58,309 DEV : loss 0.22162960469722748 - f1-score (micro avg) 0.7836
2023-10-23 23:05:58,329 saving best model
2023-10-23 23:05:59,227 ----------------------------------------------------------------------------------------------------
2023-10-23 23:06:03,130 epoch 8 - iter 44/447 - loss 0.00338610 - time (sec): 3.90 - samples/sec: 2177.22 - lr: 0.000010 - momentum: 0.000000
2023-10-23 23:06:07,503 epoch 8 - iter 88/447 - loss 0.00649872 - time (sec): 8.28 - samples/sec: 2124.18 - lr: 0.000009 - momentum: 0.000000
2023-10-23 23:06:11,262 epoch 8 - iter 132/447 - loss 0.00844255 - time (sec): 12.03 - samples/sec: 2139.75 - lr: 0.000009 - momentum: 0.000000
2023-10-23 23:06:15,220 epoch 8 - iter 176/447 - loss 0.00810306 - time (sec): 15.99 - samples/sec: 2115.00 - lr: 0.000009 - momentum: 0.000000
2023-10-23 23:06:19,137 epoch 8 - iter 220/447 - loss 0.00775609 - time (sec): 19.91 - samples/sec: 2119.19 - lr: 0.000008 - momentum: 0.000000
2023-10-23 23:06:22,765 epoch 8 - iter 264/447 - loss 0.00753910 - time (sec): 23.54 - samples/sec: 2132.64 - lr: 0.000008 - momentum: 0.000000
2023-10-23 23:06:26,708 epoch 8 - iter 308/447 - loss 0.00742374 - time (sec): 27.48 - samples/sec: 2137.96 - lr: 0.000008 - momentum: 0.000000
2023-10-23 23:06:31,358 epoch 8 - iter 352/447 - loss 0.00757262 - time (sec): 32.13 - samples/sec: 2125.85 - lr: 0.000007 - momentum: 0.000000
2023-10-23 23:06:35,232 epoch 8 - iter 396/447 - loss 0.00893999 - time (sec): 36.00 - samples/sec: 2144.09 - lr: 0.000007 - momentum: 0.000000
2023-10-23 23:06:39,044 epoch 8 - iter 440/447 - loss 0.00911259 - time (sec): 39.82 - samples/sec: 2143.63 - lr: 0.000007 - momentum: 0.000000
2023-10-23 23:06:39,653 ----------------------------------------------------------------------------------------------------
2023-10-23 23:06:39,653 EPOCH 8 done: loss 0.0094 - lr: 0.000007
2023-10-23 23:06:45,867 DEV : loss 0.2305288016796112 - f1-score (micro avg) 0.7771
2023-10-23 23:06:45,887 ----------------------------------------------------------------------------------------------------
2023-10-23 23:06:49,797 epoch 9 - iter 44/447 - loss 0.00749318 - time (sec): 3.91 - samples/sec: 2132.77 - lr: 0.000006 - momentum: 0.000000
2023-10-23 23:06:54,055 epoch 9 - iter 88/447 - loss 0.00578018 - time (sec): 8.17 - samples/sec: 2106.92 - lr: 0.000006 - momentum: 0.000000
2023-10-23 23:06:58,041 epoch 9 - iter 132/447 - loss 0.00513309 - time (sec): 12.15 - samples/sec: 2160.03 - lr: 0.000006 - momentum: 0.000000
2023-10-23 23:07:02,056 epoch 9 - iter 176/447 - loss 0.00514188 - time (sec): 16.17 - samples/sec: 2121.05 - lr: 0.000005 - momentum: 0.000000
2023-10-23 23:07:05,872 epoch 9 - iter 220/447 - loss 0.00441923 - time (sec): 19.98 - samples/sec: 2141.34 - lr: 0.000005 - momentum: 0.000000
2023-10-23 23:07:09,433 epoch 9 - iter 264/447 - loss 0.00540403 - time (sec): 23.54 - samples/sec: 2134.74 - lr: 0.000005 - momentum: 0.000000
2023-10-23 23:07:13,624 epoch 9 - iter 308/447 - loss 0.00549172 - time (sec): 27.74 - samples/sec: 2130.38 - lr: 0.000004 - momentum: 0.000000
2023-10-23 23:07:17,938 epoch 9 - iter 352/447 - loss 0.00557974 - time (sec): 32.05 - samples/sec: 2144.11 - lr: 0.000004 - momentum: 0.000000
2023-10-23 23:07:21,891 epoch 9 - iter 396/447 - loss 0.00615603 - time (sec): 36.00 - samples/sec: 2130.07 - lr: 0.000004 - momentum: 0.000000
2023-10-23 23:07:25,987 epoch 9 - iter 440/447 - loss 0.00622433 - time (sec): 40.10 - samples/sec: 2123.91 - lr: 0.000003 - momentum: 0.000000
2023-10-23 23:07:26,557 ----------------------------------------------------------------------------------------------------
2023-10-23 23:07:26,557 EPOCH 9 done: loss 0.0061 - lr: 0.000003
2023-10-23 23:07:32,804 DEV : loss 0.24391140043735504 - f1-score (micro avg) 0.7819
2023-10-23 23:07:32,824 ----------------------------------------------------------------------------------------------------
2023-10-23 23:07:36,558 epoch 10 - iter 44/447 - loss 0.00893320 - time (sec): 3.73 - samples/sec: 2136.89 - lr: 0.000003 - momentum: 0.000000
2023-10-23 23:07:40,288 epoch 10 - iter 88/447 - loss 0.00533915 - time (sec): 7.46 - samples/sec: 2130.68 - lr: 0.000003 - momentum: 0.000000
2023-10-23 23:07:44,363 epoch 10 - iter 132/447 - loss 0.00466202 - time (sec): 11.54 - samples/sec: 2152.54 - lr: 0.000002 - momentum: 0.000000
2023-10-23 23:07:48,688 epoch 10 - iter 176/447 - loss 0.00398884 - time (sec): 15.86 - samples/sec: 2135.03 - lr: 0.000002 - momentum: 0.000000
2023-10-23 23:07:52,909 epoch 10 - iter 220/447 - loss 0.00318742 - time (sec): 20.08 - samples/sec: 2136.60 - lr: 0.000002 - momentum: 0.000000
2023-10-23 23:07:57,023 epoch 10 - iter 264/447 - loss 0.00288008 - time (sec): 24.20 - samples/sec: 2115.28 - lr: 0.000001 - momentum: 0.000000
2023-10-23 23:08:00,749 epoch 10 - iter 308/447 - loss 0.00336446 - time (sec): 27.92 - samples/sec: 2121.73 - lr: 0.000001 - momentum: 0.000000
2023-10-23 23:08:04,576 epoch 10 - iter 352/447 - loss 0.00326997 - time (sec): 31.75 - samples/sec: 2122.19 - lr: 0.000001 - momentum: 0.000000
2023-10-23 23:08:08,617 epoch 10 - iter 396/447 - loss 0.00380379 - time (sec): 35.79 - samples/sec: 2130.28 - lr: 0.000000 - momentum: 0.000000
2023-10-23 23:08:12,964 epoch 10 - iter 440/447 - loss 0.00399744 - time (sec): 40.14 - samples/sec: 2118.14 - lr: 0.000000 - momentum: 0.000000
2023-10-23 23:08:13,600 ----------------------------------------------------------------------------------------------------
2023-10-23 23:08:13,601 EPOCH 10 done: loss 0.0039 - lr: 0.000000
2023-10-23 23:08:19,844 DEV : loss 0.24326062202453613 - f1-score (micro avg) 0.7825
2023-10-23 23:08:20,336 ----------------------------------------------------------------------------------------------------
2023-10-23 23:08:20,337 Loading model from best epoch ...
2023-10-23 23:08:22,007 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time
2023-10-23 23:08:26,848 Results:
- F-score (micro) 0.7342
- F-score (macro) 0.6411
- Accuracy 0.5989

By class:
              precision    recall  f1-score   support

         loc     0.7823    0.8624    0.8204       596
        pers     0.6778    0.7327    0.7042       333
         org     0.5625    0.4773    0.5164       132
        prod     0.5510    0.4091    0.4696        66
        time     0.7174    0.6735    0.6947        49

   micro avg     0.7198    0.7491    0.7342      1176
   macro avg     0.6582    0.6310    0.6411      1176
weighted avg     0.7124    0.7491    0.7285      1176

2023-10-23 23:08:26,848 ----------------------------------------------------------------------------------------------------
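[Editor's note: the reported micro F-score follows from the micro precision/recall above, F1 = 2PR/(P+R) = 2 * 0.7198 * 0.7491 / (0.7198 + 0.7491) ≈ 0.7342. The following is a minimal inference sketch for the saved checkpoint, not part of the original run; the best-model.pt path combines the training base path and checkpoint name from this log, the example sentence is hypothetical, and the load/predict/get_spans calls are Flair's standard SequenceTagger inference API.]

    # Sketch only - load the fine-tuned tagger and tag a sentence.
    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load(
        "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5/best-model.pt"
    )

    # Hypothetical historic German input sentence.
    sentence = Sentence("Der Gemeinderat von Zürich tagte gestern im Rathaus .")
    tagger.predict(sentence)

    # Print predicted loc/pers/org/prod/time spans decoded from the 21-tag BIOES dictionary above.
    for span in sentence.get_spans("ner"):
        print(span.text, span.tag, span.score)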