2023-10-23 20:42:58,126 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,127 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=21, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-23 20:42:58,127 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator 2023-10-23 20:42:58,128 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 Train: 3575 sentences 2023-10-23 20:42:58,128 (train_with_dev=False, train_with_test=False) 2023-10-23 20:42:58,128 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 Training Params: 2023-10-23 20:42:58,128 - learning_rate: "3e-05" 2023-10-23 20:42:58,128 - mini_batch_size: "4" 2023-10-23 20:42:58,128 - max_epochs: "10" 2023-10-23 20:42:58,128 - shuffle: "True" 2023-10-23 20:42:58,128 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 Plugins: 2023-10-23 20:42:58,128 - TensorboardLogger 2023-10-23 20:42:58,128 - LinearScheduler | warmup_fraction: '0.1' 2023-10-23 20:42:58,128 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 Final evaluation on model from best epoch (best-model.pt) 2023-10-23 20:42:58,128 - metric: "('micro avg', 'f1-score')" 2023-10-23 20:42:58,128 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,128 Computation: 2023-10-23 20:42:58,129 - compute on device: cuda:0 2023-10-23 20:42:58,129 - embedding storage: none 2023-10-23 20:42:58,129 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,129 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-2" 2023-10-23 20:42:58,129 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,129 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:42:58,129 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-23 20:43:03,640 epoch 1 - iter 89/894 - loss 2.48080723 - time (sec): 5.51 - samples/sec: 1445.17 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:43:09,345 epoch 1 - iter 178/894 - loss 1.45885269 - time (sec): 11.22 - samples/sec: 1485.05 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:43:15,029 epoch 1 - iter 267/894 - loss 1.08472181 - time (sec): 16.90 - samples/sec: 1488.89 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:43:20,705 epoch 1 - iter 356/894 - loss 0.90566478 - time (sec): 22.58 - samples/sec: 1493.43 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:43:26,321 epoch 1 - iter 445/894 - loss 0.78795357 - time (sec): 28.19 - samples/sec: 1501.14 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:43:31,849 epoch 1 - iter 534/894 - loss 0.70900774 - time (sec): 33.72 - samples/sec: 1494.09 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:43:37,466 epoch 1 - iter 623/894 - loss 0.63979138 - time (sec): 39.34 - samples/sec: 1498.39 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:43:43,114 epoch 1 - iter 712/894 - loss 0.58612567 - time (sec): 44.98 - samples/sec: 1502.92 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:43:49,081 epoch 1 - iter 801/894 - loss 0.54155740 - time (sec): 50.95 - samples/sec: 1515.59 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:43:54,698 epoch 1 - iter 890/894 - loss 0.50622355 - time (sec): 56.57 - samples/sec: 1524.76 - lr: 0.000030 - momentum: 0.000000 2023-10-23 20:43:54,933 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:43:54,933 EPOCH 1 done: loss 0.5053 - lr: 0.000030 2023-10-23 20:43:59,782 DEV : loss 0.1566276252269745 - f1-score (micro avg) 0.6176 2023-10-23 20:43:59,803 saving best model 2023-10-23 20:44:00,374 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:44:05,850 epoch 2 - iter 89/894 - loss 0.16450987 - time (sec): 5.47 - samples/sec: 1511.62 - lr: 0.000030 - momentum: 0.000000 2023-10-23 20:44:11,563 epoch 2 - iter 178/894 - loss 0.15748841 - time (sec): 11.19 - samples/sec: 1519.78 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:44:17,274 epoch 2 - iter 267/894 - loss 0.15283243 - time (sec): 16.90 - samples/sec: 1533.58 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:44:23,049 epoch 2 - iter 356/894 - loss 0.15237744 - time (sec): 22.67 - samples/sec: 1528.63 - lr: 0.000029 - momentum: 0.000000 2023-10-23 20:44:28,594 epoch 2 - iter 445/894 - loss 0.14585054 - time (sec): 28.22 - samples/sec: 1505.44 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:44:34,379 epoch 2 - iter 534/894 - loss 0.15087654 - time (sec): 34.00 - samples/sec: 1513.75 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:44:40,096 epoch 2 - iter 623/894 - loss 0.14971394 - time (sec): 39.72 - samples/sec: 1522.56 - lr: 0.000028 - momentum: 0.000000 2023-10-23 20:44:45,566 epoch 2 - iter 712/894 - loss 0.15039161 - time (sec): 45.19 - samples/sec: 1511.99 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:44:51,516 epoch 2 - iter 801/894 - loss 0.14951375 - time (sec): 51.14 - samples/sec: 1523.17 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:44:57,059 epoch 2 - iter 890/894 - loss 0.14581873 - time (sec): 56.68 - samples/sec: 1518.95 - lr: 0.000027 - momentum: 0.000000 2023-10-23 20:44:57,322 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:44:57,322 EPOCH 2 done: loss 0.1455 - lr: 0.000027 2023-10-23 20:45:03,843 DEV : loss 0.16595827043056488 - f1-score (micro avg) 0.717 2023-10-23 20:45:03,863 saving best model 2023-10-23 20:45:04,578 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:45:10,343 epoch 3 - iter 89/894 - loss 0.09957272 - time (sec): 5.76 - samples/sec: 1554.16 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:45:16,075 epoch 3 - iter 178/894 - loss 0.09503941 - time (sec): 11.50 - samples/sec: 1536.73 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:45:21,827 epoch 3 - iter 267/894 - loss 0.08946847 - time (sec): 17.25 - samples/sec: 1556.44 - lr: 0.000026 - momentum: 0.000000 2023-10-23 20:45:27,441 epoch 3 - iter 356/894 - loss 0.08424858 - time (sec): 22.86 - samples/sec: 1525.11 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:45:33,050 epoch 3 - iter 445/894 - loss 0.08506845 - time (sec): 28.47 - samples/sec: 1528.17 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:45:38,640 epoch 3 - iter 534/894 - loss 0.08475705 - time (sec): 34.06 - samples/sec: 1522.51 - lr: 0.000025 - momentum: 0.000000 2023-10-23 20:45:44,182 epoch 3 - iter 623/894 - loss 0.08369846 - time (sec): 39.60 - samples/sec: 1513.84 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:45:50,076 epoch 3 - iter 712/894 - loss 0.08163382 - time (sec): 45.50 - samples/sec: 1521.35 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:45:55,687 epoch 3 - iter 801/894 - loss 0.08201642 - time (sec): 51.11 - samples/sec: 1515.67 - lr: 0.000024 - momentum: 0.000000 2023-10-23 20:46:01,448 epoch 3 - iter 890/894 - loss 0.08247904 - time (sec): 56.87 - samples/sec: 1516.36 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:46:01,681 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:46:01,681 EPOCH 3 done: loss 0.0827 - lr: 0.000023 2023-10-23 20:46:08,204 DEV : loss 0.17428651452064514 - f1-score (micro avg) 0.7264 2023-10-23 20:46:08,224 saving best model 2023-10-23 20:46:08,976 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:46:14,510 epoch 4 - iter 89/894 - loss 0.05262825 - time (sec): 5.53 - samples/sec: 1466.68 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:46:20,231 epoch 4 - iter 178/894 - loss 0.04468521 - time (sec): 11.25 - samples/sec: 1489.29 - lr: 0.000023 - momentum: 0.000000 2023-10-23 20:46:25,885 epoch 4 - iter 267/894 - loss 0.04312714 - time (sec): 16.91 - samples/sec: 1506.15 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:46:31,747 epoch 4 - iter 356/894 - loss 0.04469493 - time (sec): 22.77 - samples/sec: 1515.98 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:46:37,451 epoch 4 - iter 445/894 - loss 0.04758485 - time (sec): 28.47 - samples/sec: 1510.93 - lr: 0.000022 - momentum: 0.000000 2023-10-23 20:46:43,175 epoch 4 - iter 534/894 - loss 0.04927085 - time (sec): 34.20 - samples/sec: 1513.77 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:46:48,977 epoch 4 - iter 623/894 - loss 0.05000260 - time (sec): 40.00 - samples/sec: 1524.08 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:46:54,699 epoch 4 - iter 712/894 - loss 0.05005085 - time (sec): 45.72 - samples/sec: 1525.25 - lr: 0.000021 - momentum: 0.000000 2023-10-23 20:47:00,250 epoch 4 - iter 801/894 - loss 0.04974125 - time (sec): 51.27 - samples/sec: 1517.36 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:47:05,779 epoch 4 - iter 890/894 - loss 0.05150310 - time (sec): 56.80 - samples/sec: 1517.54 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:47:06,031 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:47:06,032 EPOCH 4 done: loss 0.0525 - lr: 0.000020 2023-10-23 20:47:12,556 DEV : loss 0.20298461616039276 - f1-score (micro avg) 0.7392 2023-10-23 20:47:12,576 saving best model 2023-10-23 20:47:13,313 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:47:18,999 epoch 5 - iter 89/894 - loss 0.03232253 - time (sec): 5.68 - samples/sec: 1546.64 - lr: 0.000020 - momentum: 0.000000 2023-10-23 20:47:24,694 epoch 5 - iter 178/894 - loss 0.03349854 - time (sec): 11.38 - samples/sec: 1498.69 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:47:30,205 epoch 5 - iter 267/894 - loss 0.03340786 - time (sec): 16.89 - samples/sec: 1484.26 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:47:36,073 epoch 5 - iter 356/894 - loss 0.03460467 - time (sec): 22.76 - samples/sec: 1518.03 - lr: 0.000019 - momentum: 0.000000 2023-10-23 20:47:41,657 epoch 5 - iter 445/894 - loss 0.03216847 - time (sec): 28.34 - samples/sec: 1504.36 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:47:47,217 epoch 5 - iter 534/894 - loss 0.03332744 - time (sec): 33.90 - samples/sec: 1500.36 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:47:53,166 epoch 5 - iter 623/894 - loss 0.03407386 - time (sec): 39.85 - samples/sec: 1514.43 - lr: 0.000018 - momentum: 0.000000 2023-10-23 20:47:58,849 epoch 5 - iter 712/894 - loss 0.03376706 - time (sec): 45.53 - samples/sec: 1516.40 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:48:04,460 epoch 5 - iter 801/894 - loss 0.03450533 - time (sec): 51.15 - samples/sec: 1523.39 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:48:10,007 epoch 5 - iter 890/894 - loss 0.03436077 - time (sec): 56.69 - samples/sec: 1519.59 - lr: 0.000017 - momentum: 0.000000 2023-10-23 20:48:10,262 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:48:10,262 EPOCH 5 done: loss 0.0344 - lr: 0.000017 2023-10-23 20:48:16,766 DEV : loss 0.24649250507354736 - f1-score (micro avg) 0.7686 2023-10-23 20:48:16,786 saving best model 2023-10-23 20:48:17,554 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:48:23,117 epoch 6 - iter 89/894 - loss 0.02315835 - time (sec): 5.56 - samples/sec: 1440.11 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:48:28,761 epoch 6 - iter 178/894 - loss 0.01840372 - time (sec): 11.21 - samples/sec: 1438.66 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:48:34,527 epoch 6 - iter 267/894 - loss 0.01933114 - time (sec): 16.97 - samples/sec: 1477.92 - lr: 0.000016 - momentum: 0.000000 2023-10-23 20:48:40,231 epoch 6 - iter 356/894 - loss 0.01924240 - time (sec): 22.68 - samples/sec: 1518.61 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:48:45,870 epoch 6 - iter 445/894 - loss 0.01840968 - time (sec): 28.31 - samples/sec: 1521.80 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:48:51,569 epoch 6 - iter 534/894 - loss 0.01732048 - time (sec): 34.01 - samples/sec: 1511.68 - lr: 0.000015 - momentum: 0.000000 2023-10-23 20:48:57,092 epoch 6 - iter 623/894 - loss 0.01729568 - time (sec): 39.54 - samples/sec: 1511.72 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:49:02,783 epoch 6 - iter 712/894 - loss 0.02022187 - time (sec): 45.23 - samples/sec: 1519.40 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:49:08,622 epoch 6 - iter 801/894 - loss 0.02054773 - time (sec): 51.07 - samples/sec: 1513.92 - lr: 0.000014 - momentum: 0.000000 2023-10-23 20:49:14,229 epoch 6 - iter 890/894 - loss 0.02089905 - time (sec): 56.67 - samples/sec: 1521.70 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:49:14,474 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:49:14,474 EPOCH 6 done: loss 0.0210 - lr: 0.000013 2023-10-23 20:49:21,009 DEV : loss 0.22474254667758942 - f1-score (micro avg) 0.7671 2023-10-23 20:49:21,030 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:49:26,585 epoch 7 - iter 89/894 - loss 0.00932732 - time (sec): 5.55 - samples/sec: 1525.93 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:49:32,387 epoch 7 - iter 178/894 - loss 0.01432918 - time (sec): 11.36 - samples/sec: 1517.03 - lr: 0.000013 - momentum: 0.000000 2023-10-23 20:49:38,406 epoch 7 - iter 267/894 - loss 0.01700073 - time (sec): 17.38 - samples/sec: 1533.79 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:49:44,028 epoch 7 - iter 356/894 - loss 0.01537951 - time (sec): 23.00 - samples/sec: 1525.27 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:49:49,679 epoch 7 - iter 445/894 - loss 0.01506681 - time (sec): 28.65 - samples/sec: 1519.99 - lr: 0.000012 - momentum: 0.000000 2023-10-23 20:49:55,356 epoch 7 - iter 534/894 - loss 0.01557746 - time (sec): 34.33 - samples/sec: 1523.54 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:50:01,041 epoch 7 - iter 623/894 - loss 0.01611775 - time (sec): 40.01 - samples/sec: 1520.46 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:50:06,612 epoch 7 - iter 712/894 - loss 0.01564183 - time (sec): 45.58 - samples/sec: 1517.64 - lr: 0.000011 - momentum: 0.000000 2023-10-23 20:50:12,188 epoch 7 - iter 801/894 - loss 0.01618850 - time (sec): 51.16 - samples/sec: 1520.34 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:50:17,797 epoch 7 - iter 890/894 - loss 0.01525803 - time (sec): 56.77 - samples/sec: 1519.19 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:50:18,038 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:50:18,038 EPOCH 7 done: loss 0.0152 - lr: 0.000010 2023-10-23 20:50:24,588 DEV : loss 0.25017356872558594 - f1-score (micro avg) 0.7642 2023-10-23 20:50:24,609 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:50:30,199 epoch 8 - iter 89/894 - loss 0.01612093 - time (sec): 5.59 - samples/sec: 1518.14 - lr: 0.000010 - momentum: 0.000000 2023-10-23 20:50:35,787 epoch 8 - iter 178/894 - loss 0.01639135 - time (sec): 11.18 - samples/sec: 1523.56 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:50:41,367 epoch 8 - iter 267/894 - loss 0.01252323 - time (sec): 16.76 - samples/sec: 1488.68 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:50:47,473 epoch 8 - iter 356/894 - loss 0.01262541 - time (sec): 22.86 - samples/sec: 1534.17 - lr: 0.000009 - momentum: 0.000000 2023-10-23 20:50:53,120 epoch 8 - iter 445/894 - loss 0.01149455 - time (sec): 28.51 - samples/sec: 1537.53 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:50:58,782 epoch 8 - iter 534/894 - loss 0.01152579 - time (sec): 34.17 - samples/sec: 1517.00 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:51:04,387 epoch 8 - iter 623/894 - loss 0.01078718 - time (sec): 39.78 - samples/sec: 1515.93 - lr: 0.000008 - momentum: 0.000000 2023-10-23 20:51:10,043 epoch 8 - iter 712/894 - loss 0.01121208 - time (sec): 45.43 - samples/sec: 1514.39 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:51:15,934 epoch 8 - iter 801/894 - loss 0.01081237 - time (sec): 51.32 - samples/sec: 1519.75 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:51:21,479 epoch 8 - iter 890/894 - loss 0.01053265 - time (sec): 56.87 - samples/sec: 1516.06 - lr: 0.000007 - momentum: 0.000000 2023-10-23 20:51:21,720 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:51:21,720 EPOCH 8 done: loss 0.0106 - lr: 0.000007 2023-10-23 20:51:27,969 DEV : loss 0.29330340027809143 - f1-score (micro avg) 0.7596 2023-10-23 20:51:27,990 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:51:33,729 epoch 9 - iter 89/894 - loss 0.00425727 - time (sec): 5.74 - samples/sec: 1410.23 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:51:39,392 epoch 9 - iter 178/894 - loss 0.00701630 - time (sec): 11.40 - samples/sec: 1444.35 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:51:45,246 epoch 9 - iter 267/894 - loss 0.00717471 - time (sec): 17.26 - samples/sec: 1473.49 - lr: 0.000006 - momentum: 0.000000 2023-10-23 20:51:50,879 epoch 9 - iter 356/894 - loss 0.00646948 - time (sec): 22.89 - samples/sec: 1489.65 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:51:56,536 epoch 9 - iter 445/894 - loss 0.00613541 - time (sec): 28.54 - samples/sec: 1503.56 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:52:02,312 epoch 9 - iter 534/894 - loss 0.00574876 - time (sec): 34.32 - samples/sec: 1509.25 - lr: 0.000005 - momentum: 0.000000 2023-10-23 20:52:08,201 epoch 9 - iter 623/894 - loss 0.00558351 - time (sec): 40.21 - samples/sec: 1520.38 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:52:13,789 epoch 9 - iter 712/894 - loss 0.00596474 - time (sec): 45.80 - samples/sec: 1514.27 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:52:19,295 epoch 9 - iter 801/894 - loss 0.00595036 - time (sec): 51.30 - samples/sec: 1512.79 - lr: 0.000004 - momentum: 0.000000 2023-10-23 20:52:25,015 epoch 9 - iter 890/894 - loss 0.00559881 - time (sec): 57.02 - samples/sec: 1513.03 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:52:25,245 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:52:25,245 EPOCH 9 done: loss 0.0056 - lr: 0.000003 2023-10-23 20:52:31,486 DEV : loss 0.27675920724868774 - f1-score (micro avg) 0.7733 2023-10-23 20:52:31,507 saving best model 2023-10-23 20:52:32,180 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:52:37,758 epoch 10 - iter 89/894 - loss 0.00098253 - time (sec): 5.58 - samples/sec: 1545.49 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:52:43,517 epoch 10 - iter 178/894 - loss 0.00095969 - time (sec): 11.34 - samples/sec: 1560.78 - lr: 0.000003 - momentum: 0.000000 2023-10-23 20:52:49,428 epoch 10 - iter 267/894 - loss 0.00128812 - time (sec): 17.25 - samples/sec: 1508.63 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:52:54,932 epoch 10 - iter 356/894 - loss 0.00278304 - time (sec): 22.75 - samples/sec: 1509.09 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:53:00,826 epoch 10 - iter 445/894 - loss 0.00345471 - time (sec): 28.64 - samples/sec: 1527.64 - lr: 0.000002 - momentum: 0.000000 2023-10-23 20:53:06,392 epoch 10 - iter 534/894 - loss 0.00389630 - time (sec): 34.21 - samples/sec: 1511.09 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:53:11,957 epoch 10 - iter 623/894 - loss 0.00368630 - time (sec): 39.78 - samples/sec: 1515.44 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:53:17,683 epoch 10 - iter 712/894 - loss 0.00344338 - time (sec): 45.50 - samples/sec: 1513.16 - lr: 0.000001 - momentum: 0.000000 2023-10-23 20:53:23,567 epoch 10 - iter 801/894 - loss 0.00365553 - time (sec): 51.39 - samples/sec: 1510.67 - lr: 0.000000 - momentum: 0.000000 2023-10-23 20:53:29,280 epoch 10 - iter 890/894 - loss 0.00368854 - time (sec): 57.10 - samples/sec: 1509.37 - lr: 0.000000 - momentum: 0.000000 2023-10-23 20:53:29,517 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:53:29,517 EPOCH 10 done: loss 0.0039 - lr: 0.000000 2023-10-23 20:53:35,745 DEV : loss 0.27931466698646545 - f1-score (micro avg) 0.7767 2023-10-23 20:53:35,765 saving best model 2023-10-23 20:53:37,037 ---------------------------------------------------------------------------------------------------- 2023-10-23 20:53:37,038 Loading model from best epoch ... 2023-10-23 20:53:38,841 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time 2023-10-23 20:53:43,691 Results: - F-score (micro) 0.7553 - F-score (macro) 0.6752 - Accuracy 0.6225 By class: precision recall f1-score support loc 0.8232 0.8674 0.8448 596 pers 0.6800 0.7658 0.7203 333 org 0.5379 0.5379 0.5379 132 prod 0.6316 0.5455 0.5854 66 time 0.7021 0.6735 0.6875 49 micro avg 0.7361 0.7755 0.7553 1176 macro avg 0.6750 0.6780 0.6752 1176 weighted avg 0.7349 0.7755 0.7540 1176 2023-10-23 20:53:43,691 ----------------------------------------------------------------------------------------------------