Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
72 kB
11
Indexable
Never
bert.embeddings.position_embeddings.weight tensor(1.1588e-06, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(1.8445e-07, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(8.4774e-09, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(1.1889e-08, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(1.4740e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(3.1704e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(1.2670e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(6.7105e-16, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(1.7050e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(7.8360e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(1.4715e-06, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(5.8703e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(5.3118e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(6.2903e-09, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(3.2329e-06, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(5.5261e-09, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(2.4818e-06, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(5.7146e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(7.1411e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(7.5921e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(6.6811e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(1.5937e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(1.0009e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(5.5861e-16, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(1.9682e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(4.5316e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(2.7057e-06, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(3.7811e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(7.9559e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(9.6859e-09, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(3.8880e-06, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(6.6563e-09, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(2.5543e-06, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(5.3525e-09, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(7.3478e-09, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(8.3271e-09, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(1.7251e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(3.2010e-10, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(3.0160e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(4.2932e-16, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(2.8566e-06, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(4.7832e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(4.6379e-06, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(4.1408e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(1.6676e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(2.0664e-08, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(9.3455e-06, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(1.7762e-08, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(8.4283e-06, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(1.0403e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(1.4313e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(2.0126e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(2.0579e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(3.8859e-10, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(3.0668e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(6.7737e-16, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(6.4303e-06, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(1.2249e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(1.0247e-05, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(1.0575e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(3.3744e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(4.2328e-08, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(2.2581e-05, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(4.5520e-08, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(5.6610e-05, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(8.6102e-09, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(1.5533e-07, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(2.2770e-07, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(1.2620e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(2.9750e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(1.5365e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(6.9888e-15, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(4.9111e-05, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(1.1677e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(7.2133e-05, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(8.1416e-08, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(2.3272e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(3.0464e-07, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(7.4678e-05, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(1.5446e-07, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(6.8329e-05, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(1.9183e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(1.3947e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(2.3662e-07, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(2.0211e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(4.5610e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(3.2816e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(5.2912e-15, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(6.1106e-05, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(1.4031e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(7.8256e-05, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(1.2408e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(2.1698e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(3.0219e-07, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(5.1993e-05, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(1.0316e-07, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(4.6040e-05, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(2.6161e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(2.1305e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(3.1628e-07, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(2.1874e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(4.3873e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(4.1755e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(9.6625e-15, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(8.5159e-05, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(1.7203e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(1.5073e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(3.7643e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(4.9511e-07, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(2.2250e-07, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(3.8046e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(3.8512e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(5.4132e-07, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(3.4016e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(7.0772e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(6.1396e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(1.7816e-14, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(3.3415e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(2.9793e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(8.5811e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(1.1214e-06, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(1.0108e-06, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(0.0007, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(2.8344e-07, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(1.5319e-06, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(1.9551e-06, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(2.0501e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(3.7951e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(2.2500e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(4.9655e-14, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(0.0006, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(1.1425e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(0.0010, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(1.0118e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(3.3544e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(4.4642e-06, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(0.0019, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(4.0361e-06, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(0.0026, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(9.2833e-07, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(6.4274e-06, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(8.7411e-06, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(9.8629e-05, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(1.7416e-07, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(6.5745e-13, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(0.0025, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(4.5539e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(0.0025, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(2.9058e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(6.5745e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(8.6606e-06, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(0.0017, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(3.5803e-06, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(0.0046, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(7.1144e-07, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(1.9800e-05, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(2.9991e-05, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(9.2731e-05, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(3.4591e-07, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.2536e-12, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(0.0035, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(1.6338e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(0.0046, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(1.6165e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(1.9848e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(2.0172e-05, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(0.0208, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(3.7108e-05, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(0.0628, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(9.7485e-06, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.0003, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(0.0174, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(3.7122e-05, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(0.0284, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(1.1561e-10, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(0.1047, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0002, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(0.4069, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0002, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.0026, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.0027, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(0.3203, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.0006, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(0.8494, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.0035, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.0330, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.0605, device='cuda:0')
bert.pooler.dense.weight tensor(11.8950, device='cuda:0')
bert.pooler.dense.bias tensor(0.0759, device='cuda:0')
classifier.weight tensor(7.8701, device='cuda:0')
classifier.bias tensor(0.0326, device='cuda:0')
**************************************************
LOSS  0.691978060109312
Accuracy  0.5327521090191335
Precision  0.5204369787006929
Recall  0.7899984235658365
F1  0.6274928043424327
Curr loss  tensor(0.6915, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.5483870967741935
Curr recall  1.0
tensor([[-7.0982e-02,  1.0516e-03],
        [-9.7615e-02,  9.8082e-03],
        [-2.7816e-02, -1.7428e-02],
        [-1.3663e-01, -9.7756e-02],
        [-1.1752e-01, -8.0760e-02],
        [-2.8542e-02,  1.0348e-02],
        [-1.5535e-01, -3.2614e-02],
        [-7.9404e-02, -1.9052e-05],
        [-1.0809e-01, -4.8252e-02],
        [-9.9692e-02, -8.0526e-02],
        [-8.8274e-02, -6.8140e-03],
        [-1.0458e-01, -5.1546e-02],
        [-1.1755e-01, -5.0124e-02],
        [-9.6589e-02, -3.5836e-02],
        [-9.9853e-02, -2.5502e-02],
        [-5.2290e-02, -2.3037e-02],
        [-8.9579e-02, -4.4084e-02],
        [-1.3163e-01, -2.8857e-02],
        [-8.7402e-02, -4.5070e-02],
        [-6.8125e-02,  1.4991e-02],
        [-1.0723e-01, -3.1207e-02],
        [-3.3310e-02, -3.6438e-02],
        [-5.4107e-02, -5.1426e-02],
        [-6.3494e-02, -3.1380e-02],
        [-1.5325e-01, -4.6887e-02],
        [-9.9490e-02, -9.0833e-03],
        [-6.9115e-02, -1.8127e-02],
        [-8.9678e-02, -4.5013e-02],
        [-1.1092e-01, -5.5380e-02],
        [-1.0891e-01,  9.0615e-03],
        [-6.4325e-02, -1.0487e-02],
        [-7.8275e-02, -1.3940e-02]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  7%|████████▏                                                                                                            | 43951/625000 [3:43:05<49:25:05,  3.27it/s]bert.embeddings.word_embeddings.weight tensor(2.0180e-06, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(1.1236e-06, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(2.5901e-07, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(8.8284e-09, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(1.6307e-08, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(1.5196e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(3.5589e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(1.2540e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(6.9152e-16, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(1.9770e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(1.1215e-08, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(1.9781e-06, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(1.0574e-08, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(7.5659e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(1.1096e-08, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(3.8937e-06, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(7.5863e-09, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(3.4205e-06, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(1.0554e-08, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(1.1393e-08, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(1.4707e-08, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(7.8359e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(2.0024e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(1.1268e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(7.1621e-16, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(3.4510e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(9.8691e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(5.5274e-06, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(8.8978e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(1.7881e-08, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(2.4973e-08, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(8.2954e-06, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(1.5835e-08, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(6.7176e-06, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(1.9910e-08, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(2.5430e-08, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(3.2197e-08, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(2.9215e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(5.6387e-10, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(4.6456e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(4.4703e-16, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(9.9494e-06, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(1.9368e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(1.8129e-05, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(1.6728e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(6.8215e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(8.8497e-08, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(3.6571e-05, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(7.1303e-08, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(3.3712e-05, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(4.4325e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(6.4575e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(9.0513e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(3.7806e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(7.2476e-10, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(4.3573e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(8.3481e-16, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(2.8824e-05, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(5.5198e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(4.8206e-05, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(4.7401e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(1.6436e-07, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(2.0471e-07, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(2.1761e-07, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(4.0612e-08, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(7.4521e-07, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(1.1022e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(3.3381e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(7.8811e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(2.5164e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(8.4820e-15, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(5.6407e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(3.9306e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(1.1285e-06, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(1.4716e-06, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(0.0004, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(7.5739e-07, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(9.1445e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(6.6128e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(1.1275e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(4.0232e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(9.1115e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(5.6277e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(7.1360e-15, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(6.6870e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(0.0004, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(5.8389e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(1.0363e-06, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(1.4168e-06, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(4.8222e-07, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(1.2120e-06, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(9.9647e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(1.4647e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(3.0760e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(6.1842e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(5.3572e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(1.2447e-14, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(0.0004, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(7.8635e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(6.6911e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(1.6701e-06, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(2.1865e-06, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(9.6968e-07, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(1.6290e-06, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(1.6339e-06, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(2.3173e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(5.4006e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(1.1269e-08, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(8.1149e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(1.5044e-14, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(0.0007, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(1.4168e-06, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(0.0012, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(1.2724e-06, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(3.6309e-06, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(4.7589e-06, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(0.0021, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(4.2765e-06, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(0.0030, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(1.2009e-06, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(6.4847e-06, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(8.2845e-06, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(3.2442e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(6.0337e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(3.1166e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(6.2870e-14, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(0.0026, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(4.8286e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(0.0045, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(4.3253e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(1.4222e-05, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(1.9065e-05, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(0.0078, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(1.6339e-05, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(0.0103, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(3.7242e-06, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(2.5669e-05, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(3.4860e-05, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(2.6200e-07, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(4.9533e-13, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(0.0098, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(1.7562e-05, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(0.0095, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(1.1129e-05, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(2.5510e-05, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(3.4430e-05, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(0.0071, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(1.5203e-05, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(0.0186, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(2.8834e-06, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(7.8395e-05, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(0.0001, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(1.2517e-06, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(0.0006, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(2.3525e-12, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(0.0124, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(6.1162e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(0.0115, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(4.3201e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(5.4873e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(5.9264e-05, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(0.0354, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(6.9792e-05, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(0.1116, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(1.8312e-05, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.0008, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(0.0205, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(4.5306e-05, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(0.0306, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(2.3444e-10, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(0.2223, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0005, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(0.9185, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0005, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.0060, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.0079, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(0.5883, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.0012, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(1.9760, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.0081, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.0724, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.2231, device='cuda:0')
bert.pooler.dense.weight tensor(20.1261, device='cuda:0')
bert.pooler.dense.bias tensor(0.2008, device='cuda:0')
classifier.weight tensor(11.2534, device='cuda:0')
classifier.bias tensor(0.1290, device='cuda:0')
**************************************************
LOSS  0.6927994961848397
Accuracy  0.5301323486528507
Precision  0.5298313392951595
Recall  0.9989176990818753
F1  0.6924065220175902
Curr loss  tensor(0.6827, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.59375
Curr recall  1.0
tensor([[-0.1301, -0.0172],
        [-0.0880, -0.0253],
        [-0.1446,  0.0079],
        [-0.1295,  0.0165],
        [-0.0959, -0.0526],
        [-0.0982,  0.0413],
        [-0.0832, -0.0681],
        [-0.1393, -0.0696],
        [-0.1067,  0.0168],
        [-0.1530, -0.0094],
        [-0.1051,  0.0026],
        [-0.1311, -0.0426],
        [-0.1726, -0.0322],
        [-0.1543, -0.0345],
        [-0.0559,  0.0279],
        [-0.1281, -0.0049],
        [-0.1193, -0.0214],
        [-0.1304,  0.0218],
        [-0.1040, -0.0077],
        [-0.1951,  0.0167],
        [-0.1838, -0.0217],
        [-0.1435, -0.0267],
        [-0.0943,  0.0423],
        [-0.1429,  0.0513],
        [-0.0866,  0.0004],
        [-0.1613, -0.0355],
        [-0.1150, -0.0297],
        [-0.0984, -0.0066],
        [-0.1502,  0.0075],
        [-0.1716, -0.0523],
        [-0.0867,  0.0068],
        [-0.1094,  0.0369]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  7%|████████▏                                                                                                            | 44001/625000 [3:43:21<49:34:36,  3.26it/s]bert.embeddings.word_embeddings.weight tensor(1.4011e-06, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(8.3822e-07, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(1.2133e-07, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(5.4858e-09, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(7.8875e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(1.1684e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(2.0674e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(1.0020e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(5.6959e-16, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(1.1448e-06, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(4.7313e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(9.8494e-07, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(4.0361e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(3.8655e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(4.4111e-09, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(2.2721e-06, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(3.7624e-09, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(1.6912e-06, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(3.7716e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(4.8371e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(5.0337e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(5.0297e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(1.0954e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(7.1423e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(5.7358e-16, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(1.4579e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(3.4347e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(2.2556e-06, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(3.2618e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(7.4703e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(1.0126e-08, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(3.6198e-06, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(6.8360e-09, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(2.9964e-06, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(8.4890e-09, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(1.1304e-08, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(1.3945e-08, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(1.6654e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(3.1914e-10, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(2.8946e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(2.7869e-16, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(4.4343e-06, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(8.5151e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(8.0335e-06, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(7.3584e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(3.0001e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(3.9121e-08, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(1.5906e-05, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(3.1077e-08, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(1.4756e-05, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(1.9300e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(2.8378e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(3.9465e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(2.2199e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(4.2320e-10, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(2.7700e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(4.0624e-16, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(1.2593e-05, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(2.3931e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(2.1007e-05, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(2.0580e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(7.1996e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(8.9475e-08, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(4.7261e-05, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(9.5706e-08, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(1.7931e-08, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(3.2752e-07, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(4.8604e-07, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(1.4282e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(3.3624e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(1.4711e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(5.9671e-15, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(2.4873e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(1.7377e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(4.9797e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(6.5034e-07, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(3.3501e-07, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(4.0632e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(2.9221e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(5.0275e-07, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(1.9285e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(4.3656e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(3.0947e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(5.3492e-15, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(2.9822e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(2.5796e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(4.5685e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(6.2712e-07, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(2.1132e-07, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(9.3442e-05, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(5.3699e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(4.3754e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(6.4662e-07, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(1.7654e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(3.5406e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(3.4180e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(6.7176e-15, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(3.4912e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(2.9449e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(7.3400e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(9.6040e-07, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(4.2817e-07, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(7.2363e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(7.2848e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(1.0285e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(3.0341e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(6.3140e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(5.1268e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(1.2095e-14, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(6.2854e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(5.6130e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(1.5902e-06, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(2.0918e-06, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(0.0009, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(1.8520e-06, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(0.0013, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(5.1116e-07, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(2.7696e-06, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(3.4930e-06, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(1.8848e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(3.4986e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(1.9574e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(5.2807e-14, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(0.0011, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(2.0142e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(0.0019, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(1.7893e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(5.8991e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(7.9079e-06, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(0.0031, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(6.5446e-06, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(0.0040, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(1.4847e-06, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(1.0091e-05, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(1.3642e-05, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(9.1486e-05, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(1.6050e-07, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(5.5271e-13, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(0.0040, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(7.1938e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(0.0040, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(4.6607e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(1.0740e-05, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(1.4503e-05, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(0.0039, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(8.2944e-06, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(0.0117, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(1.9711e-06, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(3.9204e-05, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(7.8404e-05, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(8.4153e-07, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(0.0004, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.7551e-12, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(0.0087, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(3.9746e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(0.0134, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(4.3959e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(5.5353e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(7.0766e-05, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(0.0740, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(0.0001, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(0.2793, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(4.0696e-05, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.0013, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.0018, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(0.0242, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(4.8491e-05, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(0.0401, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(1.7276e-10, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(0.5148, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0011, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(2.3830, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0012, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.0164, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.0221, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(1.0410, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.0022, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(4.8531, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.0206, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.2346, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.6819, device='cuda:0')
bert.pooler.dense.weight tensor(58.2710, device='cuda:0')
bert.pooler.dense.bias tensor(0.6667, device='cuda:0')
classifier.weight tensor(28.4203, device='cuda:0')
classifier.bias tensor(0.4387, device='cuda:0')
**************************************************
LOSS  0.6923227575309502
Accuracy  0.5349072026888616
Precision  0.5783890263308371
Recall  0.6757602403188026
F1  0.6232947190970587
Curr loss  tensor(0.6917, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.7857142857142857
Curr recall  0.4782608695652174
tensor([[-0.0776, -0.0719],
        [-0.0223,  0.0058],
        [-0.0220, -0.0146],
        [-0.0522, -0.0406],
        [-0.0502, -0.0603],
        [-0.0667, -0.0471],
        [ 0.0125, -0.0395],
        [-0.0180, -0.0443],
        [-0.0703, -0.0797],
        [-0.0603, -0.1405],
        [-0.1061, -0.1173],
        [-0.0418, -0.0395],
        [-0.0496, -0.0084],
        [-0.0868, -0.0608],
        [-0.0552, -0.0644],
        [-0.0221, -0.0778],
        [-0.0678, -0.0785],
        [-0.1537, -0.0728],
        [-0.0550, -0.0412],
        [ 0.0369, -0.0021],
        [-0.0397, -0.0815],
        [-0.0739, -0.0464],
        [-0.0508, -0.0309],
        [-0.0450, -0.0487],
        [-0.1010, -0.0637],
        [-0.0626, -0.0645],
        [-0.0269, -0.0411],
        [-0.0599, -0.1174],
        [-0.0629, -0.0853],
        [-0.1173,  0.0205],
        [-0.0504, -0.0672],
        [ 0.0032, -0.0681]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  7%|████████▏                                                                                                            | 44051/625000 [3:43:36<49:39:55,  3.25it/s]bert.embeddings.word_embeddings.weight tensor(1.1034e-06, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(7.1425e-07, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(9.7976e-08, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(4.9382e-09, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(6.1959e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(9.4768e-07, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(1.7409e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(8.3158e-07, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(4.4994e-16, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(9.8400e-07, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(3.5061e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(8.3080e-07, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(3.1714e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(3.2438e-09, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(3.2384e-09, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(1.9489e-06, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(3.0066e-09, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(1.3846e-06, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(2.9768e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(3.8177e-09, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(3.7016e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(4.1039e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(9.3520e-10, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(6.0064e-07, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(3.9973e-16, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(1.0449e-06, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(2.2399e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(1.5279e-06, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(2.0548e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(4.7120e-09, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(6.2157e-09, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(2.3926e-06, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(4.4450e-09, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(1.9096e-06, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(5.1577e-09, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(6.9388e-09, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(8.4543e-09, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(1.1708e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(2.2191e-10, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(2.2123e-07, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(3.0048e-16, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(2.6912e-06, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(5.1339e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(4.7907e-06, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(4.4215e-09, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(1.7978e-08, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(2.3338e-08, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(9.6948e-06, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(1.8845e-08, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(8.9487e-06, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(1.1693e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(1.7173e-08, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(2.3806e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(1.6030e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(3.0423e-10, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(2.2344e-07, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(6.3818e-16, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(7.5796e-06, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(1.4403e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(1.2674e-05, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(1.2458e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(4.3216e-08, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(5.3711e-08, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(2.8312e-05, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(5.7361e-08, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(7.2899e-05, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(1.0739e-08, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(1.9722e-07, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(2.9285e-07, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(9.6870e-07, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(2.2875e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(1.1045e-06, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(5.0453e-15, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(6.2945e-05, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(1.4980e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(9.1679e-05, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(1.0364e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(2.9594e-07, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(3.8703e-07, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(9.5935e-05, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(1.9921e-07, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(8.5558e-05, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(2.3848e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(1.7152e-07, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(2.9482e-07, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(1.5645e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(3.5345e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(2.5239e-06, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(4.6411e-15, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(7.6601e-05, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(1.7527e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(9.5816e-05, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(1.5238e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(2.7065e-07, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(3.6783e-07, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(6.3363e-05, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(1.2615e-07, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(5.5398e-05, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(3.1691e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(2.6387e-07, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(3.8313e-07, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(1.4211e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(2.8450e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(2.9816e-06, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(5.8139e-15, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(2.0252e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(1.7143e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(4.2760e-07, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(5.5699e-07, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(2.4565e-07, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(4.0815e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(4.1014e-07, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(5.7377e-07, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(2.4003e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(4.9890e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(4.2276e-06, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(1.0349e-14, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(3.5460e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(3.1797e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(8.8643e-07, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(1.1704e-06, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(0.0005, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(1.1189e-06, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(0.0008, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(3.1466e-07, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(1.6933e-06, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(2.1348e-06, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(1.3564e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(2.5027e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(1.5201e-05, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(4.5039e-14, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(0.0007, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(1.2431e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(0.0012, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(1.1015e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(3.6423e-06, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(4.8673e-06, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(0.0017, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(3.5880e-06, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(0.0022, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(8.1504e-07, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(5.4510e-06, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(7.3663e-06, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(6.6262e-05, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(1.1491e-07, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(3.3052e-13, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(0.0021, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(3.7066e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(0.0021, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(2.4781e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(5.7695e-06, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(7.6009e-06, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(0.0022, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(4.5012e-06, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(0.0053, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(9.4088e-07, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(1.7367e-05, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(3.4794e-05, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(0.0001, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(3.7341e-07, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(0.0002, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.2207e-12, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(0.0046, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(2.0909e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(0.0057, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(1.7961e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(2.3922e-05, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(2.6320e-05, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(0.0224, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(3.9044e-05, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(0.0745, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(1.0431e-05, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.0003, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.0004, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(0.0217, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(4.4426e-05, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(0.0382, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(2.3180e-10, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(0.1262, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0003, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(0.4725, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0002, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.0031, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.0040, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(0.3188, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.0006, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(1.0130, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.0044, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.0425, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.1036, device='cuda:0')
bert.pooler.dense.weight tensor(14.3008, device='cuda:0')
bert.pooler.dense.bias tensor(0.1004, device='cuda:0')
classifier.weight tensor(8.7110, device='cuda:0')
classifier.bias tensor(0.0629, device='cuda:0')
**************************************************
LOSS  0.6937012624952182
Accuracy  0.49333952187914965
Precision  0.48487885593353475
Recall  0.47835146865328054
F1  0.48159304568037187
Curr loss  tensor(0.6917, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.47058823529411764
Curr recall  0.5333333333333333
tensor([[-0.0454, -0.0973],
        [-0.0304, -0.0183],
        [-0.0691, -0.0757],
        [-0.0820, -0.0854],
        [-0.0466, -0.0228],
        [-0.1134, -0.1112],
        [-0.0220,  0.0098],
        [-0.0303, -0.0256],
        [-0.0513, -0.0378],
        [-0.0565, -0.0442],
        [-0.0575, -0.0144],
        [-0.0544, -0.0465],
        [-0.0273, -0.0674],
        [-0.0791, -0.0830],
        [-0.0366, -0.1034],
        [-0.0699, -0.0776],
        [-0.0318, -0.0407],
        [-0.0555, -0.0251],
        [-0.0318, -0.0628],
        [-0.0885, -0.0712],
        [-0.0541, -0.0669],
        [-0.0784, -0.0586],
        [-0.0296, -0.0588],
        [-0.0988, -0.1119],
        [-0.0129, -0.0056],
        [-0.0624, -0.0512],
        [-0.0915, -0.0648],
        [-0.0738, -0.1031],
        [-0.0798, -0.0526],
        [-0.0875, -0.0933],
        [ 0.0126, -0.0050],
        [-0.1375, -0.0774]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  7%|████████▎                                                                                                            | 44100/625000 [3:43:51<49:33:22,  3.26it/s]