opus-em-deberta-3-large-v2 / trainer_state.json
Kerem P
End of training
f18fc82
raw
history blame
22.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 3590,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11,
"learning_rate": 2e-05,
"loss": 95.2205,
"step": 20
},
{
"epoch": 0.22,
"learning_rate": 2e-05,
"loss": 48.3233,
"step": 40
},
{
"epoch": 0.33,
"learning_rate": 2e-05,
"loss": 13.9954,
"step": 60
},
{
"epoch": 0.45,
"learning_rate": 2e-05,
"loss": 2.0884,
"step": 80
},
{
"epoch": 0.56,
"learning_rate": 2e-05,
"loss": 1.5549,
"step": 100
},
{
"epoch": 0.67,
"learning_rate": 2e-05,
"loss": 2.0528,
"step": 120
},
{
"epoch": 0.78,
"learning_rate": 2e-05,
"loss": 3.102,
"step": 140
},
{
"epoch": 0.89,
"learning_rate": 2e-05,
"loss": 1.2929,
"step": 160
},
{
"epoch": 1.0,
"eval_f1": 0.1941564561734213,
"eval_loss": 13.452169418334961,
"eval_runtime": 23.1921,
"eval_samples_per_second": 82.614,
"eval_steps_per_second": 1.294,
"step": 179
},
{
"epoch": 1.0,
"learning_rate": 2e-05,
"loss": 0.5906,
"step": 180
},
{
"epoch": 1.11,
"learning_rate": 2e-05,
"loss": 0.3674,
"step": 200
},
{
"epoch": 1.23,
"learning_rate": 2e-05,
"loss": 0.5531,
"step": 220
},
{
"epoch": 1.34,
"learning_rate": 2e-05,
"loss": 0.2448,
"step": 240
},
{
"epoch": 1.45,
"learning_rate": 2e-05,
"loss": 0.1894,
"step": 260
},
{
"epoch": 1.56,
"learning_rate": 2e-05,
"loss": 0.2385,
"step": 280
},
{
"epoch": 1.67,
"learning_rate": 2e-05,
"loss": 0.2773,
"step": 300
},
{
"epoch": 1.78,
"learning_rate": 2e-05,
"loss": 0.1879,
"step": 320
},
{
"epoch": 1.89,
"learning_rate": 2e-05,
"loss": 0.1541,
"step": 340
},
{
"epoch": 2.0,
"eval_f1": 0.1941564561734213,
"eval_loss": 8.468379020690918,
"eval_runtime": 22.9961,
"eval_samples_per_second": 83.318,
"eval_steps_per_second": 1.305,
"step": 359
},
{
"epoch": 2.01,
"learning_rate": 2e-05,
"loss": 0.1381,
"step": 360
},
{
"epoch": 2.12,
"learning_rate": 2e-05,
"loss": 0.1464,
"step": 380
},
{
"epoch": 2.23,
"learning_rate": 2e-05,
"loss": 0.2856,
"step": 400
},
{
"epoch": 2.34,
"learning_rate": 2e-05,
"loss": 0.3727,
"step": 420
},
{
"epoch": 2.45,
"learning_rate": 2e-05,
"loss": 0.2145,
"step": 440
},
{
"epoch": 2.56,
"learning_rate": 2e-05,
"loss": 0.0867,
"step": 460
},
{
"epoch": 2.67,
"learning_rate": 2e-05,
"loss": 0.232,
"step": 480
},
{
"epoch": 2.79,
"learning_rate": 2e-05,
"loss": 0.1407,
"step": 500
},
{
"epoch": 2.9,
"learning_rate": 2e-05,
"loss": 0.1257,
"step": 520
},
{
"epoch": 3.0,
"eval_f1": 0.1941564561734213,
"eval_loss": 7.637044906616211,
"eval_runtime": 22.6323,
"eval_samples_per_second": 84.658,
"eval_steps_per_second": 1.326,
"step": 538
},
{
"epoch": 3.01,
"learning_rate": 2e-05,
"loss": 0.1963,
"step": 540
},
{
"epoch": 3.12,
"learning_rate": 2e-05,
"loss": 0.1485,
"step": 560
},
{
"epoch": 3.23,
"learning_rate": 2e-05,
"loss": 0.1373,
"step": 580
},
{
"epoch": 3.34,
"learning_rate": 2e-05,
"loss": 0.0971,
"step": 600
},
{
"epoch": 3.45,
"learning_rate": 2e-05,
"loss": 0.2036,
"step": 620
},
{
"epoch": 3.57,
"learning_rate": 2e-05,
"loss": 0.2205,
"step": 640
},
{
"epoch": 3.68,
"learning_rate": 2e-05,
"loss": 0.0547,
"step": 660
},
{
"epoch": 3.79,
"learning_rate": 2e-05,
"loss": 0.0957,
"step": 680
},
{
"epoch": 3.9,
"learning_rate": 2e-05,
"loss": 0.1684,
"step": 700
},
{
"epoch": 4.0,
"eval_f1": 0.6376360808709176,
"eval_loss": 0.7054294943809509,
"eval_runtime": 22.968,
"eval_samples_per_second": 83.42,
"eval_steps_per_second": 1.306,
"step": 718
},
{
"epoch": 4.01,
"learning_rate": 2e-05,
"loss": 0.145,
"step": 720
},
{
"epoch": 4.12,
"learning_rate": 2e-05,
"loss": 0.1186,
"step": 740
},
{
"epoch": 4.23,
"learning_rate": 2e-05,
"loss": 0.0227,
"step": 760
},
{
"epoch": 4.35,
"learning_rate": 2e-05,
"loss": 0.0556,
"step": 780
},
{
"epoch": 4.46,
"learning_rate": 2e-05,
"loss": 0.141,
"step": 800
},
{
"epoch": 4.57,
"learning_rate": 2e-05,
"loss": 0.1328,
"step": 820
},
{
"epoch": 4.68,
"learning_rate": 2e-05,
"loss": 0.0992,
"step": 840
},
{
"epoch": 4.79,
"learning_rate": 2e-05,
"loss": 0.1691,
"step": 860
},
{
"epoch": 4.9,
"learning_rate": 2e-05,
"loss": 0.0911,
"step": 880
},
{
"epoch": 5.0,
"eval_f1": 0.1941564561734213,
"eval_loss": 5.119464874267578,
"eval_runtime": 22.7812,
"eval_samples_per_second": 84.104,
"eval_steps_per_second": 1.317,
"step": 897
},
{
"epoch": 5.01,
"learning_rate": 2e-05,
"loss": 0.0993,
"step": 900
},
{
"epoch": 5.13,
"learning_rate": 2e-05,
"loss": 0.0972,
"step": 920
},
{
"epoch": 5.24,
"learning_rate": 2e-05,
"loss": 0.0389,
"step": 940
},
{
"epoch": 5.35,
"learning_rate": 2e-05,
"loss": 0.1366,
"step": 960
},
{
"epoch": 5.46,
"learning_rate": 2e-05,
"loss": 0.0833,
"step": 980
},
{
"epoch": 5.57,
"learning_rate": 2e-05,
"loss": 0.1634,
"step": 1000
},
{
"epoch": 5.68,
"learning_rate": 2e-05,
"loss": 0.0691,
"step": 1020
},
{
"epoch": 5.79,
"learning_rate": 2e-05,
"loss": 0.1487,
"step": 1040
},
{
"epoch": 5.91,
"learning_rate": 2e-05,
"loss": 0.145,
"step": 1060
},
{
"epoch": 6.0,
"eval_f1": 0.7984031936127745,
"eval_loss": 0.2693595290184021,
"eval_runtime": 22.9118,
"eval_samples_per_second": 83.625,
"eval_steps_per_second": 1.309,
"step": 1077
},
{
"epoch": 6.02,
"learning_rate": 2e-05,
"loss": 0.0373,
"step": 1080
},
{
"epoch": 6.13,
"learning_rate": 2e-05,
"loss": 0.0409,
"step": 1100
},
{
"epoch": 6.24,
"learning_rate": 2e-05,
"loss": 0.0714,
"step": 1120
},
{
"epoch": 6.35,
"learning_rate": 2e-05,
"loss": 0.0915,
"step": 1140
},
{
"epoch": 6.46,
"learning_rate": 2e-05,
"loss": 0.1359,
"step": 1160
},
{
"epoch": 6.57,
"learning_rate": 2e-05,
"loss": 0.1016,
"step": 1180
},
{
"epoch": 6.69,
"learning_rate": 2e-05,
"loss": 0.0346,
"step": 1200
},
{
"epoch": 6.8,
"learning_rate": 2e-05,
"loss": 0.0437,
"step": 1220
},
{
"epoch": 6.91,
"learning_rate": 2e-05,
"loss": 0.1191,
"step": 1240
},
{
"epoch": 7.0,
"eval_f1": 0.20265617314313825,
"eval_loss": 2.941455602645874,
"eval_runtime": 22.6391,
"eval_samples_per_second": 84.632,
"eval_steps_per_second": 1.325,
"step": 1256
},
{
"epoch": 7.02,
"learning_rate": 2e-05,
"loss": 0.069,
"step": 1260
},
{
"epoch": 7.13,
"learning_rate": 2e-05,
"loss": 0.009,
"step": 1280
},
{
"epoch": 7.24,
"learning_rate": 2e-05,
"loss": 0.0485,
"step": 1300
},
{
"epoch": 7.35,
"learning_rate": 2e-05,
"loss": 0.0105,
"step": 1320
},
{
"epoch": 7.47,
"learning_rate": 2e-05,
"loss": 0.0835,
"step": 1340
},
{
"epoch": 7.58,
"learning_rate": 2e-05,
"loss": 0.1458,
"step": 1360
},
{
"epoch": 7.69,
"learning_rate": 2e-05,
"loss": 0.0553,
"step": 1380
},
{
"epoch": 7.8,
"learning_rate": 2e-05,
"loss": 0.005,
"step": 1400
},
{
"epoch": 7.91,
"learning_rate": 2e-05,
"loss": 0.1008,
"step": 1420
},
{
"epoch": 8.0,
"eval_f1": 0.9023255813953488,
"eval_loss": 0.17851048707962036,
"eval_runtime": 22.8798,
"eval_samples_per_second": 83.742,
"eval_steps_per_second": 1.311,
"step": 1436
},
{
"epoch": 8.02,
"learning_rate": 2e-05,
"loss": 0.0414,
"step": 1440
},
{
"epoch": 8.13,
"learning_rate": 2e-05,
"loss": 0.0351,
"step": 1460
},
{
"epoch": 8.25,
"learning_rate": 2e-05,
"loss": 0.115,
"step": 1480
},
{
"epoch": 8.36,
"learning_rate": 2e-05,
"loss": 0.0453,
"step": 1500
},
{
"epoch": 4.23,
"learning_rate": 0.001,
"loss": 18.7223,
"step": 1520
},
{
"epoch": 4.29,
"learning_rate": 0.001,
"loss": 5.5701,
"step": 1540
},
{
"epoch": 4.35,
"learning_rate": 0.001,
"loss": 1.2935,
"step": 1560
},
{
"epoch": 4.4,
"learning_rate": 0.001,
"loss": 0.6161,
"step": 1580
},
{
"epoch": 4.46,
"learning_rate": 0.001,
"loss": 0.757,
"step": 1600
},
{
"epoch": 4.51,
"learning_rate": 0.001,
"loss": 0.6241,
"step": 1620
},
{
"epoch": 4.57,
"learning_rate": 0.001,
"loss": 0.5211,
"step": 1640
},
{
"epoch": 4.62,
"learning_rate": 0.001,
"loss": 0.4467,
"step": 1660
},
{
"epoch": 4.68,
"learning_rate": 0.001,
"loss": 0.424,
"step": 1680
},
{
"epoch": 4.74,
"learning_rate": 0.001,
"loss": 0.3741,
"step": 1700
},
{
"epoch": 4.79,
"learning_rate": 0.001,
"loss": 0.3276,
"step": 1720
},
{
"epoch": 4.85,
"learning_rate": 0.001,
"loss": 0.3692,
"step": 1740
},
{
"epoch": 4.9,
"learning_rate": 0.001,
"loss": 0.3626,
"step": 1760
},
{
"epoch": 4.96,
"learning_rate": 0.001,
"loss": 0.3698,
"step": 1780
},
{
"epoch": 5.0,
"eval_f1": 0.0,
"eval_loss": 0.3513816297054291,
"eval_runtime": 20.8512,
"eval_samples_per_second": 91.889,
"eval_steps_per_second": 1.439,
"step": 1795
},
{
"epoch": 5.01,
"learning_rate": 0.001,
"loss": 0.3672,
"step": 1800
},
{
"epoch": 5.07,
"learning_rate": 0.001,
"loss": 0.3879,
"step": 1820
},
{
"epoch": 5.13,
"learning_rate": 0.001,
"loss": 0.458,
"step": 1840
},
{
"epoch": 5.18,
"learning_rate": 0.001,
"loss": 0.3949,
"step": 1860
},
{
"epoch": 5.24,
"learning_rate": 0.001,
"loss": 0.372,
"step": 1880
},
{
"epoch": 5.29,
"learning_rate": 0.001,
"loss": 0.3578,
"step": 1900
},
{
"epoch": 5.35,
"learning_rate": 0.001,
"loss": 0.3906,
"step": 1920
},
{
"epoch": 5.4,
"learning_rate": 0.001,
"loss": 0.3888,
"step": 1940
},
{
"epoch": 5.46,
"learning_rate": 0.001,
"loss": 0.4049,
"step": 1960
},
{
"epoch": 5.52,
"learning_rate": 0.001,
"loss": 0.3692,
"step": 1980
},
{
"epoch": 5.57,
"learning_rate": 0.001,
"loss": 0.3299,
"step": 2000
},
{
"epoch": 5.63,
"learning_rate": 0.001,
"loss": 0.3714,
"step": 2020
},
{
"epoch": 5.68,
"learning_rate": 0.001,
"loss": 0.3423,
"step": 2040
},
{
"epoch": 5.74,
"learning_rate": 0.001,
"loss": 0.3534,
"step": 2060
},
{
"epoch": 5.79,
"learning_rate": 0.001,
"loss": 0.3426,
"step": 2080
},
{
"epoch": 5.85,
"learning_rate": 0.001,
"loss": 0.3684,
"step": 2100
},
{
"epoch": 5.91,
"learning_rate": 0.001,
"loss": 0.3472,
"step": 2120
},
{
"epoch": 5.96,
"learning_rate": 0.001,
"loss": 0.299,
"step": 2140
},
{
"epoch": 6.0,
"eval_f1": 0.0,
"eval_loss": 0.3469391465187073,
"eval_runtime": 20.5335,
"eval_samples_per_second": 93.311,
"eval_steps_per_second": 1.461,
"step": 2154
},
{
"epoch": 6.02,
"learning_rate": 0.001,
"loss": 0.3336,
"step": 2160
},
{
"epoch": 6.07,
"learning_rate": 0.001,
"loss": 0.4366,
"step": 2180
},
{
"epoch": 6.13,
"learning_rate": 0.001,
"loss": 0.3709,
"step": 2200
},
{
"epoch": 6.18,
"learning_rate": 0.001,
"loss": 0.3357,
"step": 2220
},
{
"epoch": 6.24,
"learning_rate": 0.001,
"loss": 0.4034,
"step": 2240
},
{
"epoch": 6.3,
"learning_rate": 0.001,
"loss": 0.3868,
"step": 2260
},
{
"epoch": 6.35,
"learning_rate": 0.001,
"loss": 0.3328,
"step": 2280
},
{
"epoch": 6.41,
"learning_rate": 0.001,
"loss": 0.3974,
"step": 2300
},
{
"epoch": 6.46,
"learning_rate": 0.001,
"loss": 0.3707,
"step": 2320
},
{
"epoch": 6.52,
"learning_rate": 0.001,
"loss": 0.3753,
"step": 2340
},
{
"epoch": 6.57,
"learning_rate": 0.001,
"loss": 0.3255,
"step": 2360
},
{
"epoch": 6.63,
"learning_rate": 0.001,
"loss": 0.4284,
"step": 2380
},
{
"epoch": 6.69,
"learning_rate": 0.001,
"loss": 0.3699,
"step": 2400
},
{
"epoch": 6.74,
"learning_rate": 0.001,
"loss": 0.3705,
"step": 2420
},
{
"epoch": 6.8,
"learning_rate": 0.001,
"loss": 0.2841,
"step": 2440
},
{
"epoch": 6.85,
"learning_rate": 0.001,
"loss": 0.2687,
"step": 2460
},
{
"epoch": 6.91,
"learning_rate": 0.001,
"loss": 0.3294,
"step": 2480
},
{
"epoch": 6.96,
"learning_rate": 0.001,
"loss": 0.3531,
"step": 2500
},
{
"epoch": 7.0,
"eval_f1": 0.0,
"eval_loss": 0.3420043885707855,
"eval_runtime": 20.5195,
"eval_samples_per_second": 93.374,
"eval_steps_per_second": 1.462,
"step": 2513
},
{
"epoch": 7.02,
"learning_rate": 0.001,
"loss": 0.3396,
"step": 2520
},
{
"epoch": 7.08,
"learning_rate": 0.001,
"loss": 0.3824,
"step": 2540
},
{
"epoch": 7.13,
"learning_rate": 0.001,
"loss": 0.2518,
"step": 2560
},
{
"epoch": 7.19,
"learning_rate": 0.001,
"loss": 0.3822,
"step": 2580
},
{
"epoch": 7.24,
"learning_rate": 0.001,
"loss": 0.3969,
"step": 2600
},
{
"epoch": 7.3,
"learning_rate": 0.001,
"loss": 0.2551,
"step": 2620
},
{
"epoch": 7.35,
"learning_rate": 0.001,
"loss": 0.3387,
"step": 2640
},
{
"epoch": 7.41,
"learning_rate": 0.001,
"loss": 0.3761,
"step": 2660
},
{
"epoch": 7.47,
"learning_rate": 0.001,
"loss": 0.3899,
"step": 2680
},
{
"epoch": 7.52,
"learning_rate": 0.001,
"loss": 0.3691,
"step": 2700
},
{
"epoch": 7.58,
"learning_rate": 0.001,
"loss": 0.3172,
"step": 2720
},
{
"epoch": 7.63,
"learning_rate": 0.001,
"loss": 0.3358,
"step": 2740
},
{
"epoch": 7.69,
"learning_rate": 0.001,
"loss": 0.3459,
"step": 2760
},
{
"epoch": 7.74,
"learning_rate": 0.001,
"loss": 0.3347,
"step": 2780
},
{
"epoch": 7.8,
"learning_rate": 0.001,
"loss": 0.3459,
"step": 2800
},
{
"epoch": 7.86,
"learning_rate": 0.001,
"loss": 0.3797,
"step": 2820
},
{
"epoch": 7.91,
"learning_rate": 0.001,
"loss": 0.3721,
"step": 2840
},
{
"epoch": 7.97,
"learning_rate": 0.001,
"loss": 0.3892,
"step": 2860
},
{
"epoch": 8.0,
"eval_f1": 0.0,
"eval_loss": 0.34283891320228577,
"eval_runtime": 20.5477,
"eval_samples_per_second": 93.247,
"eval_steps_per_second": 1.46,
"step": 2872
},
{
"epoch": 8.02,
"learning_rate": 0.001,
"loss": 0.3234,
"step": 2880
},
{
"epoch": 8.08,
"learning_rate": 0.001,
"loss": 0.3979,
"step": 2900
},
{
"epoch": 8.13,
"learning_rate": 0.001,
"loss": 0.4032,
"step": 2920
},
{
"epoch": 8.19,
"learning_rate": 0.001,
"loss": 0.3787,
"step": 2940
},
{
"epoch": 8.25,
"learning_rate": 0.001,
"loss": 0.3144,
"step": 2960
},
{
"epoch": 8.3,
"learning_rate": 0.001,
"loss": 0.4071,
"step": 2980
},
{
"epoch": 8.36,
"learning_rate": 0.001,
"loss": 0.3192,
"step": 3000
},
{
"epoch": 8.41,
"learning_rate": 0.001,
"loss": 0.3194,
"step": 3020
},
{
"epoch": 8.47,
"learning_rate": 0.001,
"loss": 0.3468,
"step": 3040
},
{
"epoch": 8.52,
"learning_rate": 0.001,
"loss": 0.325,
"step": 3060
},
{
"epoch": 8.58,
"learning_rate": 0.001,
"loss": 0.3631,
"step": 3080
},
{
"epoch": 8.64,
"learning_rate": 0.001,
"loss": 0.3464,
"step": 3100
},
{
"epoch": 8.69,
"learning_rate": 0.001,
"loss": 0.3378,
"step": 3120
},
{
"epoch": 8.75,
"learning_rate": 0.001,
"loss": 0.3808,
"step": 3140
},
{
"epoch": 8.8,
"learning_rate": 0.001,
"loss": 0.3668,
"step": 3160
},
{
"epoch": 8.86,
"learning_rate": 0.001,
"loss": 0.3045,
"step": 3180
},
{
"epoch": 8.91,
"learning_rate": 0.001,
"loss": 0.2805,
"step": 3200
},
{
"epoch": 8.97,
"learning_rate": 0.001,
"loss": 0.3706,
"step": 3220
},
{
"epoch": 9.0,
"eval_f1": 0.0,
"eval_loss": 0.3420598804950714,
"eval_runtime": 20.5266,
"eval_samples_per_second": 93.342,
"eval_steps_per_second": 1.462,
"step": 3231
},
{
"epoch": 9.03,
"learning_rate": 0.001,
"loss": 0.3502,
"step": 3240
},
{
"epoch": 9.08,
"learning_rate": 0.001,
"loss": 0.3414,
"step": 3260
},
{
"epoch": 9.14,
"learning_rate": 0.001,
"loss": 0.4037,
"step": 3280
},
{
"epoch": 9.19,
"learning_rate": 0.001,
"loss": 0.3548,
"step": 3300
},
{
"epoch": 9.25,
"learning_rate": 0.001,
"loss": 0.3426,
"step": 3320
},
{
"epoch": 9.3,
"learning_rate": 0.001,
"loss": 0.3614,
"step": 3340
},
{
"epoch": 9.36,
"learning_rate": 0.001,
"loss": 0.2505,
"step": 3360
},
{
"epoch": 9.42,
"learning_rate": 0.001,
"loss": 0.402,
"step": 3380
},
{
"epoch": 9.47,
"learning_rate": 0.001,
"loss": 0.3029,
"step": 3400
},
{
"epoch": 9.53,
"learning_rate": 0.001,
"loss": 0.2799,
"step": 3420
},
{
"epoch": 9.58,
"learning_rate": 0.001,
"loss": 0.3046,
"step": 3440
},
{
"epoch": 9.64,
"learning_rate": 0.001,
"loss": 0.3707,
"step": 3460
},
{
"epoch": 9.69,
"learning_rate": 0.001,
"loss": 0.3417,
"step": 3480
},
{
"epoch": 9.75,
"learning_rate": 0.001,
"loss": 0.3826,
"step": 3500
},
{
"epoch": 9.81,
"learning_rate": 0.001,
"loss": 0.3658,
"step": 3520
},
{
"epoch": 9.86,
"learning_rate": 0.001,
"loss": 0.3185,
"step": 3540
},
{
"epoch": 9.92,
"learning_rate": 0.001,
"loss": 0.3596,
"step": 3560
},
{
"epoch": 9.97,
"learning_rate": 0.001,
"loss": 0.3863,
"step": 3580
},
{
"epoch": 10.0,
"eval_f1": 0.0,
"eval_loss": 0.34448280930519104,
"eval_runtime": 20.4875,
"eval_samples_per_second": 93.52,
"eval_steps_per_second": 1.464,
"step": 3590
},
{
"epoch": 10.0,
"step": 3590,
"total_flos": 3792562624069632.0,
"train_loss": 0.3510875488058106,
"train_runtime": 3949.5247,
"train_samples_per_second": 14.541,
"train_steps_per_second": 0.909
}
],
"logging_steps": 20,
"max_steps": 3590,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 3792562624069632.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}