Untitled

        [ 1.5725, -0.6790],
        [-1.3115, -0.0069],
        [ 1.7840, -1.7826],
        [-0.2045, -0.1986],
        [ 1.6201, -0.8250],
        [ 0.5591, -0.2189],
        [ 1.3421, -0.0735],
        [-0.8824,  0.9068],
        [-1.6484,  0.8212],
        [ 1.7785, -1.1927],
        [-0.5752,  0.3603],
        [-1.3456,  0.0284],
        [ 1.2525, -0.2723],
        [ 1.7901, -1.9431],
        [-0.2109, -0.2015],
        [ 1.7683, -1.9970],
        [ 0.0723, -0.2200],
        [ 1.7841, -1.7831],
        [-1.3460,  0.0283],
        [-0.7937,  0.1962],
        [ 1.8140, -1.7714],
        [ 1.8493, -1.8307]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  0%|                                                                                                                         | 301/625000 [01:30<51:18:41,  3.38it/s]bert.embeddings.word_embeddings.weight tensor(24.7451, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(15.2141, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(4.3310, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(0.1969, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(0.2861, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(18.1732, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(0.0338, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(16.3999, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(1.0863e-08, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(40.7080, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(0.2034, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(44.1957, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(0.2389, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(0.1982, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(0.2119, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(141.6650, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(0.2205, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(116.6934, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(0.1967, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(0.2096, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(0.2571, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(17.8390, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(0.0301, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(13.2472, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(9.0544e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(26.3012, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(0.1775, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(26.1824, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(0.2220, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(0.1897, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(0.2180, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(120.5428, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(0.1977, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(94.2822, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(0.2031, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(0.2153, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(0.2713, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(21.0443, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(0.0364, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(18.0043, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(1.5472e-08, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(42.6026, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(0.1976, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(37.6613, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(0.1942, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(0.1985, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(0.1983, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(114.6776, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(0.1954, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(94.6725, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(0.1940, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(0.2098, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(0.2408, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(26.0952, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(0.0502, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(18.5284, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(1.5159e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(32.6520, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(0.1638, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(34.9530, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(0.1878, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(0.1871, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(0.2006, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(115.0171, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(0.1868, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(96.3679, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(0.1705, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(0.1735, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(0.2591, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(13.0529, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(0.0215, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(10.6706, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(8.5438e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(22.3710, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(0.1888, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(25.1846, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(0.1950, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(0.1864, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(0.2044, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(123.3252, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(0.2196, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(111.1157, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(0.2230, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(0.2257, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(0.2725, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(25.3206, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(0.0474, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(14.7598, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(9.6708e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(19.7363, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(0.1297, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(21.8111, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(0.1498, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(0.1597, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(0.1638, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(92.8944, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(0.1601, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(80.8499, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(0.1465, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(0.1765, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(0.2115, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(14.0620, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(0.0290, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(9.8539, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(4.3723e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(20.8664, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(0.1118, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(27.6132, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(0.1344, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(0.1524, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(0.1539, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(86.6879, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(0.1428, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(76.1514, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(0.1344, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(0.1398, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(0.2193, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(4.8259, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(0.0078, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(4.1931, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(4.4222e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(16.8241, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(0.1338, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(17.7709, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(0.1604, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(0.1562, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(0.1642, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(94.5619, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(0.1554, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(73.7401, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(0.1251, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(0.1475, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(0.2323, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(16.0987, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(0.0276, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(16.9376, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(1.6752e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(35.1003, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(0.1271, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(34.8184, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(0.1319, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(0.1465, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(0.1389, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(86.8601, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(0.1462, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(75.1832, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(0.1276, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(0.1391, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(0.1769, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(17.6393, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(0.0314, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(18.4342, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(1.6335e-08, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(35.4549, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(0.1125, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(35.8832, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(0.1150, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(0.1223, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(0.1206, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(77.4020, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(0.1244, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(67.6554, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(0.1014, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(0.1135, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(0.1126, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(11.2162, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(0.0159, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(11.9806, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.3006e-08, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(42.5185, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(0.0579, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(42.7435, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(0.0549, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(0.1091, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(0.0988, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(77.6458, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(0.1205, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(66.0054, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(0.0814, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.0989, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.0837, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(6.8811, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(0.0098, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(8.0425, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(1.0565e-08, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(38.6155, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0467, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(40.1608, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0413, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.1105, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.0918, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(81.1769, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.1125, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(54.5777, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.0664, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.1318, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.1419, device='cuda:0')
bert.pooler.dense.weight tensor(111.0366, device='cuda:0')
bert.pooler.dense.bias tensor(0.1533, device='cuda:0')
classifier.weight tensor(23.4280, device='cuda:0')
classifier.bias tensor(0.0300, device='cuda:0')
**************************************************
LOSS  0.4168481940792195
Accuracy  0.8214221845314406
Precision  0.8431657568101438
Recall  0.7925892012182459
F1  0.8170955807344478
Curr loss  tensor(0.4152, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.8636363636363636
Curr recall  0.9047619047619048
tensor([[ 1.5689, -1.2057],
        [-1.4648,  0.1516],
        [-1.7428,  1.7798],
        [ 0.6639, -0.8222],
        [ 1.4268, -1.3504],
        [-1.8149,  1.1485],
        [ 0.1138, -0.1722],
        [-1.8363,  1.0708],
        [ 1.5528, -1.3493],
        [-1.0350,  0.4415],
        [-0.0123, -0.0644],
        [-1.4242,  0.2860],
        [-1.8176,  1.1574],
        [-1.8342,  1.0726],
        [-1.4371,  0.2809],
        [ 0.3801, -0.6509],
        [ 0.0165, -0.1892],
        [ 1.3001, -1.2953],
        [-1.8004,  1.1572],
        [-1.9237,  1.6450],
        [-1.8128,  1.1507],
        [-1.9006,  1.1677],
        [-0.9735,  0.4517],
        [-1.3426,  0.1390],
        [-1.8139,  1.1574],
        [-1.4659,  0.1531],
        [-1.4446,  0.1093],
        [ 0.8565, -1.0648],
        [-1.4611,  0.1902],
        [-0.5737, -0.0354],
        [-1.4430,  0.1484],
        [-1.4443,  0.1112]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  0%|                                                                                                                         | 351/625000 [01:45<54:22:35,  3.19it/s]bert.embeddings.word_embeddings.weight tensor(66.6292, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(43.7594, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(13.9821, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(0.3890, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(0.9199, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(34.1214, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(0.0966, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(31.1542, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(1.7356e-08, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(99.1070, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(0.6440, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(106.4143, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(0.7171, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(0.3649, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(0.6170, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(268.6403, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(0.5079, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(258.0770, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(0.5939, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(0.4315, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(0.8439, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(41.5297, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(0.0933, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(33.0769, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(1.8144e-08, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(81.8984, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(0.6464, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(72.8722, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(0.7226, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(0.3903, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(0.6980, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(288.6067, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(0.5700, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(255.2820, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(0.6759, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(0.4897, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(0.9619, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(50.7929, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(0.0962, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(49.1473, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(3.7834e-08, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(156.6502, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(0.6431, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(150.3043, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(0.6569, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(0.4589, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(0.7348, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(281.3867, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(0.5560, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(279.1976, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(0.6511, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(0.4969, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(0.9101, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(41.5746, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(0.0770, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(31.9106, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(3.0986e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(97.4103, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(0.6933, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(118.5880, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(0.7711, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(0.4825, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(0.7897, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(313.9943, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(0.6500, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(289.5966, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(0.8194, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(0.5462, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(1.0666, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(57.9713, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(0.1124, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(48.8412, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(2.8087e-08, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(158.8138, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(0.6527, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(152.7917, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(0.6563, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(0.5158, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(0.8080, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(312.9994, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(0.6722, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(292.5331, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(0.7028, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(0.5275, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(1.0101, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(45.3713, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(0.0941, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(35.1204, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(1.8529e-08, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(113.0423, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(0.5806, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(109.2490, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(0.6473, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(0.5203, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(0.7495, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(313.8066, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(0.6948, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(305.8079, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(0.7685, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(0.6394, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(0.9999, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(58.9353, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(0.1411, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(49.4978, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(1.9345e-08, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(90.3160, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(0.5659, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(115.3695, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(0.6154, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(0.5218, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(0.7385, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(265.7958, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(0.5419, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(258.4232, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(0.6353, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(0.5157, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(0.9313, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(15.7118, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(0.0307, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(13.2979, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(9.6515e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(50.6340, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(0.6342, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(42.4814, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(0.6821, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(0.5544, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(0.7227, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(304.6202, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(0.6201, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(294.1937, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(0.6624, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(0.5419, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(1.0494, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(64.8774, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(0.1352, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(54.9962, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(2.7096e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(105.1277, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(0.6592, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(97.4736, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(0.6397, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(0.4994, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(0.6256, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(286.8506, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(0.5672, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(261.1052, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(0.5595, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(0.5610, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(0.9060, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(65.3141, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(0.2015, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(62.6821, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(3.2864e-08, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(93.0076, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(0.5826, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(86.3393, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(0.5711, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(0.4106, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(0.5623, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(238.6763, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(0.4550, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(232.8978, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(0.5126, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(0.4804, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(0.5488, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(40.5091, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(0.0725, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(41.8124, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(3.3915e-08, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(144.6175, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(0.2718, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(161.7161, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(0.2899, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(0.4272, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(0.5004, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(184.2863, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(0.3239, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(183.2498, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(0.3695, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.4348, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.4306, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(41.5822, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(0.0696, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(56.6298, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(2.1605e-08, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(78.1726, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.2004, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(82.0728, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.1791, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.2505, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.3292, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(114.9406, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.1945, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(100.8047, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.2434, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.3416, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.5040, device='cuda:0')
bert.pooler.dense.weight tensor(378.6334, device='cuda:0')
bert.pooler.dense.bias tensor(1.2261, device='cuda:0')
classifier.weight tensor(77.5201, device='cuda:0')
classifier.bias tensor(0.1378, device='cuda:0')
**************************************************
LOSS  0.43332445663058
Accuracy  0.8207475580608263
Precision  0.8377197824878886
Recall  0.7800668945162665
F1  0.8078660536508933
Curr loss  tensor(0.5228, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.8
Curr recall  0.5714285714285714
tensor([[-0.8950, -0.1621],
        [-1.7041,  1.0061],
        [ 1.5411, -1.2270],
        [-1.8449,  1.5523],
        [ 1.4275, -1.0633],
        [ 1.4347, -0.9471],
        [ 1.0490, -0.6547],
        [-1.6530,  0.7902],
        [-1.2712,  0.1828],
        [ 0.8679, -0.3974],
        [ 1.1045, -0.6844],
        [ 1.5202, -1.1976],
        [ 1.4624, -1.0804],
        [-1.3098, -0.0180],
        [ 1.0143, -0.5790],
        [ 1.0038, -0.5482],
        [-1.2616,  0.1627],
        [ 0.4722, -0.5168],
        [ 1.4284, -1.0045],
        [ 0.0166, -0.5282],
        [ 1.1127, -0.7038],
        [ 1.3737, -0.1391],
        [-1.6591,  1.0156],
        [ 1.0517, -0.6819],
        [ 1.4555, -0.9948],
        [ 0.8637, -0.4128],
        [ 1.3870, -1.0342],
        [-1.6869,  0.9709],
        [ 1.4113, -0.2278],
        [ 1.5562, -0.8979],
        [-1.5961,  1.1085],
        [ 1.5352, -1.2111]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  0%|                                                                                                                         | 401/625000 [02:00<51:20:43,  3.38it/s]bert.embeddings.word_embeddings.weight tensor(28.0925, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(20.2427, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(4.5485, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(0.2115, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(0.3013, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(21.2585, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(0.0353, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(18.2769, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(1.4548e-08, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(40.0721, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(0.1845, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(44.0747, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(0.2520, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(0.2105, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(0.2197, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(140.5123, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(0.2168, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(118.3095, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(0.2025, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(0.2280, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(0.2694, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(16.2608, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(0.0253, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(11.8869, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(1.1701e-08, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(26.6388, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(0.1823, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(27.2695, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(0.2600, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(0.2253, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(0.2458, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(138.2182, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(0.2221, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(107.1008, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(0.2446, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(0.2534, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(0.3395, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(22.6142, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(0.0371, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(18.3893, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(1.8917e-08, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(48.0829, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(0.1931, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(45.3107, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(0.2410, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(0.2384, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(0.2383, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(137.2563, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(0.2326, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(105.6363, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(0.2178, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(0.2348, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(0.2849, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(19.6784, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(0.0321, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(17.6620, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(1.7434e-08, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(32.4908, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(0.1821, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(33.6750, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(0.2225, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(0.2255, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(0.2291, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(132.4422, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(0.2199, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(105.6641, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(0.2213, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(0.2310, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(0.3143, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(24.1372, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(0.0450, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(18.0792, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(1.3588e-08, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(29.7420, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(0.1933, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(34.4273, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(0.2211, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(0.2229, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(0.2421, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(135.7883, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(0.2435, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(111.5789, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(0.2044, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(0.2166, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(0.2712, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(16.1258, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(0.0319, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(15.6757, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(1.1713e-08, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(33.1878, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(0.1709, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(31.3978, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(0.2046, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(0.2018, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(0.2217, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(111.7968, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(0.2019, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(91.5077, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(0.2017, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(0.1960, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(0.2830, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(14.9815, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(0.0277, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(12.6666, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(9.3356e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(25.4020, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(0.1611, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(23.6568, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(0.1950, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(0.1748, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(0.2045, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(98.8611, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(0.1823, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(82.5787, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(0.1851, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(0.1840, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(0.2668, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(13.8417, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(0.0258, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(11.3025, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(9.6991e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(17.4882, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(0.1502, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(17.5282, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(0.1828, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(0.1524, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(0.1798, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(93.1890, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(0.1673, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(78.5857, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(0.1606, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(0.1380, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(0.2925, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(14.7524, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(0.0344, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(13.3906, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(9.3483e-09, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(26.1446, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(0.1534, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(27.1302, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(0.1726, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(0.1451, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(0.1779, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(79.1290, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(0.1477, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(70.9561, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(0.1681, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(0.1388, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(0.2393, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(12.0747, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(0.0308, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(13.1225, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(1.1989e-08, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(30.1239, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(0.1629, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(29.5183, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(0.1768, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(0.1423, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(0.1754, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(75.9073, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(0.1350, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(70.4477, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(0.1608, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(0.1419, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(0.1841, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(11.7213, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(0.0211, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(13.6403, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.3161e-08, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(45.3326, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(0.0961, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(44.5629, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(0.0931, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(0.1295, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(0.1510, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(67.5245, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(0.1177, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(66.0245, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(0.1426, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.1230, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.1616, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(8.7569, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(0.0134, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(11.3525, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(1.2270e-08, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(44.1181, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0914, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(49.2792, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0905, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.1483, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.1959, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(92.5384, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.1646, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(81.0368, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.1465, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.2321, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.5168, device='cuda:0')
bert.pooler.dense.weight tensor(249.3680, device='cuda:0')
bert.pooler.dense.bias tensor(1.1529, device='cuda:0')
classifier.weight tensor(36.1855, device='cuda:0')
classifier.bias tensor(0.1476, device='cuda:0')
**************************************************
LOSS  0.3885661152521114
Accuracy  0.8077865153097975
Precision  0.7915777643886241
Recall  0.8392196477891941
F1  0.8147028044897883
Curr loss  tensor(0.3084, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.7
Curr recall  0.9333333333333333
tensor([[ 1.5087, -1.1546],
        [ 1.3974, -1.4844],
        [-1.3808,  0.0726],
        [-0.3487,  0.0029],
        [ 1.5298, -0.8675],
        [ 1.4218, -1.4835],
        [-1.2835,  1.5581],
        [-1.6167,  1.5148],
        [ 1.4233, -1.4759],
        [-0.0561,  0.1153],
        [-0.3262, -0.0984],
        [-1.8161,  1.8029],
        [-0.1102,  0.1116],
        [ 0.9265, -0.0500],
        [-0.5737,  0.2956],
        [ 1.4127, -0.3919],
        [-1.6100,  1.5249],
        [-0.3298, -0.1098],
        [-0.5998, -0.0231],
        [ 0.8487, -0.0280],
        [-1.8129,  1.8100],
        [ 1.3963, -0.8711],
        [-1.3579,  0.0226],
        [-1.3323,  1.5763],
        [-1.8099,  1.8072],
        [-1.3424,  1.5764],
        [-1.6131,  1.5257],
        [ 1.3917, -0.8643],
        [ 0.0897,  0.1104],
        [ 1.3582, -0.1236],
        [ 1.1298, -0.7401],
        [-1.2809,  1.5545]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  0%|                                                                                                                         | 451/625000 [02:15<51:29:35,  3.37it/s]bert.embeddings.word_embeddings.weight tensor(17.4793, device='cuda:0')
bert.embeddings.position_embeddings.weight tensor(9.1764, device='cuda:0')
bert.embeddings.token_type_embeddings.weight tensor(2.9199, device='cuda:0')
bert.embeddings.LayerNorm.weight tensor(0.1342, device='cuda:0')
bert.embeddings.LayerNorm.bias tensor(0.1916, device='cuda:0')
bert.encoder.layer.0.attention.self.query.weight tensor(11.7002, device='cuda:0')
bert.encoder.layer.0.attention.self.query.bias tensor(0.0226, device='cuda:0')
bert.encoder.layer.0.attention.self.key.weight tensor(9.9816, device='cuda:0')
bert.encoder.layer.0.attention.self.key.bias tensor(7.4143e-09, device='cuda:0')
bert.encoder.layer.0.attention.self.value.weight tensor(26.0789, device='cuda:0')
bert.encoder.layer.0.attention.self.value.bias tensor(0.1280, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.weight tensor(28.4402, device='cuda:0')
bert.encoder.layer.0.attention.output.dense.bias tensor(0.1588, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.weight tensor(0.1386, device='cuda:0')
bert.encoder.layer.0.attention.output.LayerNorm.bias tensor(0.1396, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.weight tensor(93.2712, device='cuda:0')
bert.encoder.layer.0.intermediate.dense.bias tensor(0.1477, device='cuda:0')
bert.encoder.layer.0.output.dense.weight tensor(79.5027, device='cuda:0')
bert.encoder.layer.0.output.dense.bias tensor(0.1252, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.weight tensor(0.1438, device='cuda:0')
bert.encoder.layer.0.output.LayerNorm.bias tensor(0.1749, device='cuda:0')
bert.encoder.layer.1.attention.self.query.weight tensor(11.0379, device='cuda:0')
bert.encoder.layer.1.attention.self.query.bias tensor(0.0185, device='cuda:0')
bert.encoder.layer.1.attention.self.key.weight tensor(7.4828, device='cuda:0')
bert.encoder.layer.1.attention.self.key.bias tensor(5.8352e-09, device='cuda:0')
bert.encoder.layer.1.attention.self.value.weight tensor(17.8052, device='cuda:0')
bert.encoder.layer.1.attention.self.value.bias tensor(0.1219, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.weight tensor(19.0720, device='cuda:0')
bert.encoder.layer.1.attention.output.dense.bias tensor(0.1604, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.weight tensor(0.1397, device='cuda:0')
bert.encoder.layer.1.attention.output.LayerNorm.bias tensor(0.1558, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.weight tensor(94.8505, device='cuda:0')
bert.encoder.layer.1.intermediate.dense.bias tensor(0.1565, device='cuda:0')
bert.encoder.layer.1.output.dense.weight tensor(75.0994, device='cuda:0')
bert.encoder.layer.1.output.dense.bias tensor(0.1454, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.weight tensor(0.1600, device='cuda:0')
bert.encoder.layer.1.output.LayerNorm.bias tensor(0.2051, device='cuda:0')
bert.encoder.layer.2.attention.self.query.weight tensor(13.9662, device='cuda:0')
bert.encoder.layer.2.attention.self.query.bias tensor(0.0243, device='cuda:0')
bert.encoder.layer.2.attention.self.key.weight tensor(12.3453, device='cuda:0')
bert.encoder.layer.2.attention.self.key.bias tensor(9.2157e-09, device='cuda:0')
bert.encoder.layer.2.attention.self.value.weight tensor(31.7516, device='cuda:0')
bert.encoder.layer.2.attention.self.value.bias tensor(0.1247, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.weight tensor(31.6063, device='cuda:0')
bert.encoder.layer.2.attention.output.dense.bias tensor(0.1352, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.weight tensor(0.1456, device='cuda:0')
bert.encoder.layer.2.attention.output.LayerNorm.bias tensor(0.1444, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.weight tensor(93.5080, device='cuda:0')
bert.encoder.layer.2.intermediate.dense.bias tensor(0.1530, device='cuda:0')
bert.encoder.layer.2.output.dense.weight tensor(79.3329, device='cuda:0')
bert.encoder.layer.2.output.dense.bias tensor(0.1204, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.weight tensor(0.1424, device='cuda:0')
bert.encoder.layer.2.output.LayerNorm.bias tensor(0.1599, device='cuda:0')
bert.encoder.layer.3.attention.self.query.weight tensor(14.1629, device='cuda:0')
bert.encoder.layer.3.attention.self.query.bias tensor(0.0197, device='cuda:0')
bert.encoder.layer.3.attention.self.key.weight tensor(9.5624, device='cuda:0')
bert.encoder.layer.3.attention.self.key.bias tensor(7.0355e-09, device='cuda:0')
bert.encoder.layer.3.attention.self.value.weight tensor(23.9484, device='cuda:0')
bert.encoder.layer.3.attention.self.value.bias tensor(0.1044, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.weight tensor(24.9842, device='cuda:0')
bert.encoder.layer.3.attention.output.dense.bias tensor(0.1318, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.weight tensor(0.1548, device='cuda:0')
bert.encoder.layer.3.attention.output.LayerNorm.bias tensor(0.1395, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.weight tensor(86.5375, device='cuda:0')
bert.encoder.layer.3.intermediate.dense.bias tensor(0.1373, device='cuda:0')
bert.encoder.layer.3.output.dense.weight tensor(70.4075, device='cuda:0')
bert.encoder.layer.3.output.dense.bias tensor(0.1190, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.weight tensor(0.1444, device='cuda:0')
bert.encoder.layer.3.output.LayerNorm.bias tensor(0.2019, device='cuda:0')
bert.encoder.layer.4.attention.self.query.weight tensor(8.6387, device='cuda:0')
bert.encoder.layer.4.attention.self.query.bias tensor(0.0131, device='cuda:0')
bert.encoder.layer.4.attention.self.key.weight tensor(5.6277, device='cuda:0')
bert.encoder.layer.4.attention.self.key.bias tensor(4.3422e-09, device='cuda:0')
bert.encoder.layer.4.attention.self.value.weight tensor(14.9848, device='cuda:0')
bert.encoder.layer.4.attention.self.value.bias tensor(0.1224, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.weight tensor(14.6355, device='cuda:0')
bert.encoder.layer.4.attention.output.dense.bias tensor(0.1345, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.weight tensor(0.1542, device='cuda:0')
bert.encoder.layer.4.attention.output.LayerNorm.bias tensor(0.1388, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.weight tensor(84.3296, device='cuda:0')
bert.encoder.layer.4.intermediate.dense.bias tensor(0.1455, device='cuda:0')
bert.encoder.layer.4.output.dense.weight tensor(70.6442, device='cuda:0')
bert.encoder.layer.4.output.dense.bias tensor(0.1222, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.weight tensor(0.1259, device='cuda:0')
bert.encoder.layer.4.output.LayerNorm.bias tensor(0.1781, device='cuda:0')
bert.encoder.layer.5.attention.self.query.weight tensor(6.0606, device='cuda:0')
bert.encoder.layer.5.attention.self.query.bias tensor(0.0105, device='cuda:0')
bert.encoder.layer.5.attention.self.key.weight tensor(4.7746, device='cuda:0')
bert.encoder.layer.5.attention.self.key.bias tensor(3.7308e-09, device='cuda:0')
bert.encoder.layer.5.attention.self.value.weight tensor(15.7823, device='cuda:0')
bert.encoder.layer.5.attention.self.value.bias tensor(0.1032, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.weight tensor(15.9067, device='cuda:0')
bert.encoder.layer.5.attention.output.dense.bias tensor(0.1281, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.weight tensor(0.1480, device='cuda:0')
bert.encoder.layer.5.attention.output.LayerNorm.bias tensor(0.1380, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.weight tensor(80.5676, device='cuda:0')
bert.encoder.layer.5.intermediate.dense.bias tensor(0.1362, device='cuda:0')
bert.encoder.layer.5.output.dense.weight tensor(68.7134, device='cuda:0')
bert.encoder.layer.5.output.dense.bias tensor(0.1248, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.weight tensor(0.1316, device='cuda:0')
bert.encoder.layer.5.output.LayerNorm.bias tensor(0.1894, device='cuda:0')
bert.encoder.layer.6.attention.self.query.weight tensor(6.0792, device='cuda:0')
bert.encoder.layer.6.attention.self.query.bias tensor(0.0101, device='cuda:0')
bert.encoder.layer.6.attention.self.key.weight tensor(4.7600, device='cuda:0')
bert.encoder.layer.6.attention.self.key.bias tensor(4.9517e-09, device='cuda:0')
bert.encoder.layer.6.attention.self.value.weight tensor(14.0997, device='cuda:0')
bert.encoder.layer.6.attention.self.value.bias tensor(0.1116, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.weight tensor(15.4074, device='cuda:0')
bert.encoder.layer.6.attention.output.dense.bias tensor(0.1283, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.weight tensor(0.1447, device='cuda:0')
bert.encoder.layer.6.attention.output.LayerNorm.bias tensor(0.1397, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.weight tensor(83.5143, device='cuda:0')
bert.encoder.layer.6.intermediate.dense.bias tensor(0.1468, device='cuda:0')
bert.encoder.layer.6.output.dense.weight tensor(70.4314, device='cuda:0')
bert.encoder.layer.6.output.dense.bias tensor(0.1286, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.weight tensor(0.1362, device='cuda:0')
bert.encoder.layer.6.output.LayerNorm.bias tensor(0.2084, device='cuda:0')
bert.encoder.layer.7.attention.self.query.weight tensor(4.3351, device='cuda:0')
bert.encoder.layer.7.attention.self.query.bias tensor(0.0079, device='cuda:0')
bert.encoder.layer.7.attention.self.key.weight tensor(3.4415, device='cuda:0')
bert.encoder.layer.7.attention.self.key.bias tensor(3.0804e-09, device='cuda:0')
bert.encoder.layer.7.attention.self.value.weight tensor(11.6877, device='cuda:0')
bert.encoder.layer.7.attention.self.value.bias tensor(0.1276, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.weight tensor(10.5466, device='cuda:0')
bert.encoder.layer.7.attention.output.dense.bias tensor(0.1427, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.weight tensor(0.1500, device='cuda:0')
bert.encoder.layer.7.attention.output.LayerNorm.bias tensor(0.1504, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.weight tensor(95.0092, device='cuda:0')
bert.encoder.layer.7.intermediate.dense.bias tensor(0.1632, device='cuda:0')
bert.encoder.layer.7.output.dense.weight tensor(78.2826, device='cuda:0')
bert.encoder.layer.7.output.dense.bias tensor(0.1320, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.weight tensor(0.1503, device='cuda:0')
bert.encoder.layer.7.output.LayerNorm.bias tensor(0.2371, device='cuda:0')
bert.encoder.layer.8.attention.self.query.weight tensor(15.7346, device='cuda:0')
bert.encoder.layer.8.attention.self.query.bias tensor(0.0329, device='cuda:0')
bert.encoder.layer.8.attention.self.key.weight tensor(15.5269, device='cuda:0')
bert.encoder.layer.8.attention.self.key.bias tensor(1.3843e-08, device='cuda:0')
bert.encoder.layer.8.attention.self.value.weight tensor(34.7959, device='cuda:0')
bert.encoder.layer.8.attention.self.value.bias tensor(0.1323, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.weight tensor(35.0558, device='cuda:0')
bert.encoder.layer.8.attention.output.dense.bias tensor(0.1404, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.weight tensor(0.1424, device='cuda:0')
bert.encoder.layer.8.attention.output.LayerNorm.bias tensor(0.1605, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.weight tensor(83.9862, device='cuda:0')
bert.encoder.layer.8.intermediate.dense.bias tensor(0.1510, device='cuda:0')
bert.encoder.layer.8.output.dense.weight tensor(68.5795, device='cuda:0')
bert.encoder.layer.8.output.dense.bias tensor(0.1306, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.weight tensor(0.1480, device='cuda:0')
bert.encoder.layer.8.output.LayerNorm.bias tensor(0.1853, device='cuda:0')
bert.encoder.layer.9.attention.self.query.weight tensor(13.9491, device='cuda:0')
bert.encoder.layer.9.attention.self.query.bias tensor(0.0310, device='cuda:0')
bert.encoder.layer.9.attention.self.key.weight tensor(13.5772, device='cuda:0')
bert.encoder.layer.9.attention.self.key.bias tensor(2.2793e-08, device='cuda:0')
bert.encoder.layer.9.attention.self.value.weight tensor(33.1030, device='cuda:0')
bert.encoder.layer.9.attention.self.value.bias tensor(0.1172, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.weight tensor(37.5317, device='cuda:0')
bert.encoder.layer.9.attention.output.dense.bias tensor(0.1235, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.weight tensor(0.1315, device='cuda:0')
bert.encoder.layer.9.attention.output.LayerNorm.bias tensor(0.1464, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.weight tensor(74.7203, device='cuda:0')
bert.encoder.layer.9.intermediate.dense.bias tensor(0.1355, device='cuda:0')
bert.encoder.layer.9.output.dense.weight tensor(67.5768, device='cuda:0')
bert.encoder.layer.9.output.dense.bias tensor(0.1236, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.weight tensor(0.1172, device='cuda:0')
bert.encoder.layer.9.output.LayerNorm.bias tensor(0.1433, device='cuda:0')
bert.encoder.layer.10.attention.self.query.weight tensor(6.6017, device='cuda:0')
bert.encoder.layer.10.attention.self.query.bias tensor(0.0115, device='cuda:0')
bert.encoder.layer.10.attention.self.key.weight tensor(7.2444, device='cuda:0')
bert.encoder.layer.10.attention.self.key.bias tensor(1.1841e-08, device='cuda:0')
bert.encoder.layer.10.attention.self.value.weight tensor(44.1672, device='cuda:0')
bert.encoder.layer.10.attention.self.value.bias tensor(0.0722, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.weight tensor(43.1118, device='cuda:0')
bert.encoder.layer.10.attention.output.dense.bias tensor(0.0682, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.weight tensor(0.1288, device='cuda:0')
bert.encoder.layer.10.attention.output.LayerNorm.bias tensor(0.1329, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.weight tensor(72.8822, device='cuda:0')
bert.encoder.layer.10.intermediate.dense.bias tensor(0.1238, device='cuda:0')
bert.encoder.layer.10.output.dense.weight tensor(68.7833, device='cuda:0')
bert.encoder.layer.10.output.dense.bias tensor(0.1134, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.weight tensor(0.1109, device='cuda:0')
bert.encoder.layer.10.output.LayerNorm.bias tensor(0.1275, device='cuda:0')
bert.encoder.layer.11.attention.self.query.weight tensor(3.3185, device='cuda:0')
bert.encoder.layer.11.attention.self.query.bias tensor(0.0060, device='cuda:0')
bert.encoder.layer.11.attention.self.key.weight tensor(3.8181, device='cuda:0')
bert.encoder.layer.11.attention.self.key.bias tensor(2.1562e-08, device='cuda:0')
bert.encoder.layer.11.attention.self.value.weight tensor(43.8110, device='cuda:0')
bert.encoder.layer.11.attention.self.value.bias tensor(0.0737, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.weight tensor(44.1227, device='cuda:0')
bert.encoder.layer.11.attention.output.dense.bias tensor(0.0637, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.weight tensor(0.1388, device='cuda:0')
bert.encoder.layer.11.attention.output.LayerNorm.bias tensor(0.1560, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.weight tensor(72.2097, device='cuda:0')
bert.encoder.layer.11.intermediate.dense.bias tensor(0.1132, device='cuda:0')
bert.encoder.layer.11.output.dense.weight tensor(62.6669, device='cuda:0')
bert.encoder.layer.11.output.dense.bias tensor(0.1309, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.weight tensor(0.1594, device='cuda:0')
bert.encoder.layer.11.output.LayerNorm.bias tensor(0.3865, device='cuda:0')
bert.pooler.dense.weight tensor(175.6582, device='cuda:0')
bert.pooler.dense.bias tensor(0.9905, device='cuda:0')
classifier.weight tensor(45.8561, device='cuda:0')
classifier.bias tensor(0.1358, device='cuda:0')
**************************************************
LOSS  0.4316439567238949
Accuracy  0.796412551375369
Precision  0.8385019497884718
Recall  0.7310235237356315
F1  0.781082767159246
Curr loss  tensor(0.4742, device='cuda:0', grad_fn=<DivBackward1>)
Curr prec  0.875
Curr recall  0.5
tensor([[ 0.2654, -0.1378],
        [ 0.9180, -0.5306],
        [-1.7092,  0.8196],
        [ 0.9984, -0.2409],
        [ 1.4210, -1.0747],
        [-1.6072,  0.7065],
        [ 0.9372, -0.6805],
        [ 1.0694, -0.5836],
        [ 1.4177, -1.0436],
        [-1.4985,  0.9520],
        [-1.4981,  0.9490],
        [ 1.3372, -0.9041],
        [ 0.7565, -0.2676],
        [-1.6472,  0.7422],
        [-0.1877, -0.2250],
        [ 1.2888, -0.8637],
        [ 1.2996, -0.8973],
        [ 1.4154, -1.0634],
        [ 1.3192, -0.8872],
        [-1.3911,  0.0498],
        [ 1.3856, -1.0438],
        [ 1.3408, -0.8725],
        [-0.0583, -0.1265],
        [ 0.9287, -0.5834],
        [-1.3586,  0.0572],
        [ 1.4021, -1.0749],
        [-1.6052,  0.7088],
        [ 0.9372, -0.5794],
        [ 1.4407, -0.9688],
        [ 1.0100, -0.2244],
        [ 0.4818, -0.3996],
        [-0.0539, -0.1256]], device='cuda:0', grad_fn=<AddmmBackward0>)
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
  0%|                                                                                                                         | 499/625000 [02:29<51:12:37,  3.39it/s]
Editor is loading...