2023-10-23 20:04:33,691 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,692 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=21, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-23 20:04:33,692 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,692 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator 2023-10-23 20:04:33,692 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,692 Train: 3575 sentences 2023-10-23 20:04:33,692 (train_with_dev=False, train_with_test=False) 2023-10-23 20:04:33,692 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Training Params: 2023-10-23 20:04:33,693 - learning_rate: "3e-05" 2023-10-23 20:04:33,693 - mini_batch_size: "4" 2023-10-23 20:04:33,693 - max_epochs: "10" 2023-10-23 20:04:33,693 - shuffle: "True" 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Plugins: 2023-10-23 20:04:33,693 - TensorboardLogger 2023-10-23 20:04:33,693 - LinearScheduler | warmup_fraction: '0.1' 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Final evaluation on model from best epoch (best-model.pt) 2023-10-23 20:04:33,693 - metric: "('micro avg', 'f1-score')" 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Computation: 2023-10-23 20:04:33,693 - compute on device: cuda:0 2023-10-23 20:04:33,693 - embedding storage: none 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1" 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:04:33,693 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-23 20:04:39,937 epoch 1 - iter 89/894 - loss 3.05316532 - time (sec): 6.24 - samples/sec: 1380.68 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:04:45,315 epoch 1 - iter 178/894 - loss 1.89496396 - time (sec): 11.62 - samples/sec: 1415.74 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:04:50,827 epoch 1 - iter 267/894 - loss 1.38213089 - time (sec): 17.13 - samples/sec: 1476.52 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:04:56,382 epoch 1 - iter 356/894 - loss 1.12023789 - time (sec): 22.69 - samples/sec: 1474.59 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:05:01,849 epoch 1 - iter 445/894 - loss 0.95174384 - time (sec): 28.15 - samples/sec: 1501.52 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:05:07,270 epoch 1 - iter 534/894 - loss 0.83523472 - time (sec): 33.58 - samples/sec: 1498.51 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:05:12,852 epoch 1 - iter 623/894 - loss 0.74784062 - time (sec): 39.16 - samples/sec: 1507.54 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:05:18,709 epoch 1 - iter 712/894 - loss 0.67591015 - time (sec): 45.02 - samples/sec: 1526.07 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:05:24,294 epoch 1 - iter 801/894 - loss 0.62253117 - time (sec): 50.60 - samples/sec: 1534.13 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:05:29,887 epoch 1 - iter 890/894 - loss 0.58384563 - time (sec): 56.19 - samples/sec: 1534.13 - lr: 0.000030 - momentum: 0.000000 2023-10-23 20:05:30,126 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:05:30,126 EPOCH 1 done: loss 0.5819 - lr: 0.000030 2023-10-23 20:05:34,598 DEV : loss 0.1846158355474472 - f1-score (micro avg) 0.6478 2023-10-23 20:05:34,617 saving best model 2023-10-23 20:05:35,169 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:05:41,041 epoch 2 - iter 89/894 - loss 0.17039932 - time (sec): 5.87 - samples/sec: 1640.40 - lr: 0.000030 - momentum: 0.000000 2023-10-23 20:05:46,577 epoch 2 - iter 178/894 - loss 0.15101709 - time (sec): 11.41 - samples/sec: 1588.09 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:05:52,191 epoch 2 - iter 267/894 - loss 0.15228742 - time (sec): 17.02 - samples/sec: 1549.12 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:05:57,658 epoch 2 - iter 356/894 - loss 0.14911380 - time (sec): 22.49 - samples/sec: 1546.34 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:06:03,176 epoch 2 - iter 445/894 - loss 0.14248969 - time (sec): 28.01 - samples/sec: 1535.66 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:06:08,785 epoch 2 - iter 534/894 - loss 0.14210291 - time (sec): 33.61 - samples/sec: 1543.64 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:06:14,394 epoch 2 - iter 623/894 - loss 0.13697959 - time (sec): 39.22 - samples/sec: 1546.81 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:06:20,110 epoch 2 - iter 712/894 - loss 0.13739609 - time (sec): 44.94 - samples/sec: 1548.84 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:06:25,618 epoch 2 - iter 801/894 - loss 0.13435609 - time (sec): 50.45 - samples/sec: 1539.46 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:06:31,150 epoch 2 - iter 890/894 - loss 0.13262331 - time (sec): 55.98 - samples/sec: 1541.50 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:06:31,379 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:06:31,379 EPOCH 2 done: loss 0.1324 - lr: 0.000027 2023-10-23 20:06:37,751 DEV : loss 0.15661019086837769 - f1-score (micro avg) 0.7258 2023-10-23 20:06:37,770 saving best model 2023-10-23 20:06:38,516 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:06:44,028 epoch 3 - iter 89/894 - loss 0.07081377 - time (sec): 5.51 - samples/sec: 1467.63 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:06:49,733 epoch 3 - iter 178/894 - loss 0.06445548 - time (sec): 11.22 - samples/sec: 1539.23 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:06:55,369 epoch 3 - iter 267/894 - loss 0.07421548 - time (sec): 16.85 - samples/sec: 1520.71 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:07:00,871 epoch 3 - iter 356/894 - loss 0.07828797 - time (sec): 22.35 - samples/sec: 1516.78 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:07:06,328 epoch 3 - iter 445/894 - loss 0.08369600 - time (sec): 27.81 - samples/sec: 1503.21 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:07:11,900 epoch 3 - iter 534/894 - loss 0.08222038 - time (sec): 33.38 - samples/sec: 1511.91 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:07:17,546 epoch 3 - iter 623/894 - loss 0.08020079 - time (sec): 39.03 - samples/sec: 1519.33 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:07:23,017 epoch 3 - iter 712/894 - loss 0.08212380 - time (sec): 44.50 - samples/sec: 1513.26 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:07:28,867 epoch 3 - iter 801/894 - loss 0.08182659 - time (sec): 50.35 - samples/sec: 1513.21 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:07:34,426 epoch 3 - iter 890/894 - loss 0.08063802 - time (sec): 55.91 - samples/sec: 1523.62 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:07:34,955 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:07:34,956 EPOCH 3 done: loss 0.0810 - lr: 0.000023 2023-10-23 20:07:41,362 DEV : loss 0.1884111911058426 - f1-score (micro avg) 0.7402 2023-10-23 20:07:41,381 saving best model 2023-10-23 20:07:42,172 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:07:47,935 epoch 4 - iter 89/894 - loss 0.06146136 - time (sec): 5.76 - samples/sec: 1573.87 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:07:53,490 epoch 4 - iter 178/894 - loss 0.05192372 - time (sec): 11.32 - samples/sec: 1550.68 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:07:59,004 epoch 4 - iter 267/894 - loss 0.05030131 - time (sec): 16.83 - samples/sec: 1531.00 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:08:04,492 epoch 4 - iter 356/894 - loss 0.04602569 - time (sec): 22.32 - samples/sec: 1521.68 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:08:10,309 epoch 4 - iter 445/894 - loss 0.04674359 - time (sec): 28.14 - samples/sec: 1528.69 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:08:15,817 epoch 4 - iter 534/894 - loss 0.04899786 - time (sec): 33.64 - samples/sec: 1511.87 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:08:21,368 epoch 4 - iter 623/894 - loss 0.04915412 - time (sec): 39.19 - samples/sec: 1508.74 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:08:26,930 epoch 4 - iter 712/894 - loss 0.05054854 - time (sec): 44.76 - samples/sec: 1511.23 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:08:32,927 epoch 4 - iter 801/894 - loss 0.05139540 - time (sec): 50.75 - samples/sec: 1523.11 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:08:38,584 epoch 4 - iter 890/894 - loss 0.05180747 - time (sec): 56.41 - samples/sec: 1528.34 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:08:38,820 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:08:38,820 EPOCH 4 done: loss 0.0520 - lr: 0.000020 2023-10-23 20:08:45,244 DEV : loss 0.2043798565864563 - f1-score (micro avg) 0.7489 2023-10-23 20:08:45,262 saving best model 2023-10-23 20:08:45,958 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:08:51,390 epoch 5 - iter 89/894 - loss 0.03670468 - time (sec): 5.43 - samples/sec: 1419.82 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:08:56,933 epoch 5 - iter 178/894 - loss 0.03045295 - time (sec): 10.97 - samples/sec: 1455.41 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:09:02,754 epoch 5 - iter 267/894 - loss 0.03134785 - time (sec): 16.80 - samples/sec: 1492.41 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:09:08,318 epoch 5 - iter 356/894 - loss 0.03213895 - time (sec): 22.36 - samples/sec: 1495.96 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:09:13,823 epoch 5 - iter 445/894 - loss 0.03446495 - time (sec): 27.86 - samples/sec: 1498.94 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:09:19,744 epoch 5 - iter 534/894 - loss 0.03314168 - time (sec): 33.78 - samples/sec: 1522.79 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:09:25,190 epoch 5 - iter 623/894 - loss 0.03507241 - time (sec): 39.23 - samples/sec: 1516.98 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:09:30,956 epoch 5 - iter 712/894 - loss 0.03379707 - time (sec): 45.00 - samples/sec: 1527.51 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:09:36,428 epoch 5 - iter 801/894 - loss 0.03498774 - time (sec): 50.47 - samples/sec: 1518.25 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:09:42,441 epoch 5 - iter 890/894 - loss 0.03520890 - time (sec): 56.48 - samples/sec: 1526.03 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:09:42,682 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:09:42,682 EPOCH 5 done: loss 0.0351 - lr: 0.000017 2023-10-23 20:09:49,126 DEV : loss 0.21067775785923004 - f1-score (micro avg) 0.7814 2023-10-23 20:09:49,144 saving best model 2023-10-23 20:09:49,860 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:09:55,405 epoch 6 - iter 89/894 - loss 0.02804411 - time (sec): 5.54 - samples/sec: 1519.87 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:10:00,924 epoch 6 - iter 178/894 - loss 0.02290773 - time (sec): 11.06 - samples/sec: 1518.53 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:10:06,965 epoch 6 - iter 267/894 - loss 0.02282067 - time (sec): 17.10 - samples/sec: 1561.00 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:10:12,796 epoch 6 - iter 356/894 - loss 0.02842765 - time (sec): 22.94 - samples/sec: 1557.26 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:10:18,543 epoch 6 - iter 445/894 - loss 0.02595776 - time (sec): 28.68 - samples/sec: 1554.29 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:10:24,274 epoch 6 - iter 534/894 - loss 0.02372176 - time (sec): 34.41 - samples/sec: 1548.41 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:10:29,688 epoch 6 - iter 623/894 - loss 0.02367998 - time (sec): 39.83 - samples/sec: 1523.75 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:10:35,272 epoch 6 - iter 712/894 - loss 0.02452133 - time (sec): 45.41 - samples/sec: 1525.84 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:10:40,796 epoch 6 - iter 801/894 - loss 0.02538859 - time (sec): 50.93 - samples/sec: 1522.16 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:10:46,436 epoch 6 - iter 890/894 - loss 0.02556957 - time (sec): 56.57 - samples/sec: 1525.68 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:10:46,670 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:10:46,670 EPOCH 6 done: loss 0.0255 - lr: 0.000013 2023-10-23 20:10:53,124 DEV : loss 0.24338190257549286 - f1-score (micro avg) 0.7716 2023-10-23 20:10:53,143 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:10:59,007 epoch 7 - iter 89/894 - loss 0.01776813 - time (sec): 5.86 - samples/sec: 1619.74 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:11:04,738 epoch 7 - iter 178/894 - loss 0.01191667 - time (sec): 11.59 - samples/sec: 1556.36 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:11:10,506 epoch 7 - iter 267/894 - loss 0.01290126 - time (sec): 17.36 - samples/sec: 1574.21 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:11:16,057 epoch 7 - iter 356/894 - loss 0.01279215 - time (sec): 22.91 - samples/sec: 1548.53 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:11:21,537 epoch 7 - iter 445/894 - loss 0.01358427 - time (sec): 28.39 - samples/sec: 1528.44 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:11:27,070 epoch 7 - iter 534/894 - loss 0.01257668 - time (sec): 33.93 - samples/sec: 1523.57 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:11:32,940 epoch 7 - iter 623/894 - loss 0.01268051 - time (sec): 39.80 - samples/sec: 1533.14 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:11:38,478 epoch 7 - iter 712/894 - loss 0.01210361 - time (sec): 45.33 - samples/sec: 1536.64 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:11:44,123 epoch 7 - iter 801/894 - loss 0.01229478 - time (sec): 50.98 - samples/sec: 1528.30 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:11:49,672 epoch 7 - iter 890/894 - loss 0.01180895 - time (sec): 56.53 - samples/sec: 1527.31 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:11:49,898 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:11:49,898 EPOCH 7 done: loss 0.0119 - lr: 0.000010 2023-10-23 20:11:56,335 DEV : loss 0.232055202126503 - f1-score (micro avg) 0.7753 2023-10-23 20:11:56,354 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:12:01,939 epoch 8 - iter 89/894 - loss 0.01217064 - time (sec): 5.58 - samples/sec: 1505.52 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:12:07,516 epoch 8 - iter 178/894 - loss 0.01152484 - time (sec): 11.16 - samples/sec: 1523.30 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:12:13,555 epoch 8 - iter 267/894 - loss 0.01348119 - time (sec): 17.20 - samples/sec: 1535.72 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:12:19,074 epoch 8 - iter 356/894 - loss 0.01256261 - time (sec): 22.72 - samples/sec: 1527.13 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:12:24,803 epoch 8 - iter 445/894 - loss 0.01250572 - time (sec): 28.45 - samples/sec: 1528.89 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:12:30,242 epoch 8 - iter 534/894 - loss 0.01172981 - time (sec): 33.89 - samples/sec: 1512.17 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:12:35,910 epoch 8 - iter 623/894 - loss 0.01075653 - time (sec): 39.56 - samples/sec: 1509.37 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:12:41,530 epoch 8 - iter 712/894 - loss 0.01092404 - time (sec): 45.18 - samples/sec: 1513.05 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:12:47,163 epoch 8 - iter 801/894 - loss 0.01050008 - time (sec): 50.81 - samples/sec: 1513.09 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:12:52,984 epoch 8 - iter 890/894 - loss 0.00971863 - time (sec): 56.63 - samples/sec: 1519.97 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:12:53,272 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:12:53,272 EPOCH 8 done: loss 0.0097 - lr: 0.000007 2023-10-23 20:12:59,742 DEV : loss 0.2622121274471283 - f1-score (micro avg) 0.7694 2023-10-23 20:12:59,761 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:13:05,573 epoch 9 - iter 89/894 - loss 0.00784636 - time (sec): 5.81 - samples/sec: 1564.08 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:13:11,386 epoch 9 - iter 178/894 - loss 0.00575954 - time (sec): 11.62 - samples/sec: 1569.07 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:13:17,114 epoch 9 - iter 267/894 - loss 0.00627537 - time (sec): 17.35 - samples/sec: 1531.04 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:13:22,587 epoch 9 - iter 356/894 - loss 0.00648185 - time (sec): 22.83 - samples/sec: 1501.26 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:13:28,127 epoch 9 - iter 445/894 - loss 0.00697479 - time (sec): 28.36 - samples/sec: 1492.02 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:13:33,726 epoch 9 - iter 534/894 - loss 0.00667577 - time (sec): 33.96 - samples/sec: 1496.38 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:13:39,256 epoch 9 - iter 623/894 - loss 0.00602441 - time (sec): 39.49 - samples/sec: 1500.84 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:13:45,253 epoch 9 - iter 712/894 - loss 0.00609223 - time (sec): 45.49 - samples/sec: 1535.76 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:13:50,815 epoch 9 - iter 801/894 - loss 0.00575853 - time (sec): 51.05 - samples/sec: 1532.88 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:13:56,349 epoch 9 - iter 890/894 - loss 0.00559841 - time (sec): 56.59 - samples/sec: 1524.82 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:13:56,586 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:13:56,586 EPOCH 9 done: loss 0.0056 - lr: 0.000003 2023-10-23 20:14:03,087 DEV : loss 0.2699427008628845 - f1-score (micro avg) 0.7751 2023-10-23 20:14:03,106 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:14:08,868 epoch 10 - iter 89/894 - loss 0.00285995 - time (sec): 5.76 - samples/sec: 1522.41 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:14:14,429 epoch 10 - iter 178/894 - loss 0.00175943 - time (sec): 11.32 - samples/sec: 1526.78 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:14:19,974 epoch 10 - iter 267/894 - loss 0.00142868 - time (sec): 16.87 - samples/sec: 1506.26 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:14:25,556 epoch 10 - iter 356/894 - loss 0.00227438 - time (sec): 22.45 - samples/sec: 1496.90 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:14:31,119 epoch 10 - iter 445/894 - loss 0.00225331 - time (sec): 28.01 - samples/sec: 1492.79 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:14:36,772 epoch 10 - iter 534/894 - loss 0.00274936 - time (sec): 33.67 - samples/sec: 1492.95 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:14:42,348 epoch 10 - iter 623/894 - loss 0.00267072 - time (sec): 39.24 - samples/sec: 1488.28 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:14:47,855 epoch 10 - iter 712/894 - loss 0.00255903 - time (sec): 44.75 - samples/sec: 1490.55 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:14:54,025 epoch 10 - iter 801/894 - loss 0.00256285 - time (sec): 50.92 - samples/sec: 1520.63 - lr: 0.000000 - momentum: 0.000000 2023-10-23 20:14:59,611 epoch 10 - iter 890/894 - loss 0.00299517 - time (sec): 56.50 - samples/sec: 1516.23 - lr: 0.000000 - momentum: 0.000000 2023-10-23 20:15:00,029 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:15:00,029 EPOCH 10 done: loss 0.0030 - lr: 0.000000 2023-10-23 20:15:06,498 DEV : loss 0.2668047845363617 - f1-score (micro avg) 0.7798 2023-10-23 20:15:07,075 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:15:07,076 Loading model from best epoch ... 2023-10-23 20:15:09,025 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time 2023-10-23 20:15:13,561 Results: - F-score (micro) 0.7372 - F-score (macro) 0.6555 - Accuracy 0.6024 By class: precision recall f1-score support loc 0.8000 0.8389 0.8190 596 pers 0.6692 0.7898 0.7245 333 org 0.5000 0.4167 0.4545 132 prod 0.6731 0.5303 0.5932 66 time 0.6604 0.7143 0.6863 49 micro avg 0.7202 0.7551 0.7372 1176 macro avg 0.6605 0.6580 0.6555 1176 weighted avg 0.7164 0.7551 0.7331 1176 2023-10-23 20:15:13,561 ----------------------------------------------------------------------------------------------------