{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05117543579082041,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012793858947705101,
"eval_loss": 2.7024941444396973,
"eval_runtime": 240.6903,
"eval_samples_per_second": 13.677,
"eval_steps_per_second": 6.839,
"step": 1
},
{
"epoch": 0.0006396929473852551,
"grad_norm": 1.2781825065612793,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.7362,
"step": 5
},
{
"epoch": 0.0012793858947705101,
"grad_norm": 1.6901952028274536,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.0195,
"step": 10
},
{
"epoch": 0.0019190788421557653,
"grad_norm": 2.2342793941497803,
"learning_rate": 5e-05,
"loss": 2.0461,
"step": 15
},
{
"epoch": 0.0025587717895410203,
"grad_norm": 1.6336160898208618,
"learning_rate": 6.666666666666667e-05,
"loss": 1.6357,
"step": 20
},
{
"epoch": 0.0031984647369262755,
"grad_norm": 2.752976894378662,
"learning_rate": 8.333333333333334e-05,
"loss": 1.1627,
"step": 25
},
{
"epoch": 0.0038381576843115306,
"grad_norm": 1.8429460525512695,
"learning_rate": 0.0001,
"loss": 0.6154,
"step": 30
},
{
"epoch": 0.004477850631696785,
"grad_norm": 1.7854329347610474,
"learning_rate": 9.995494831023409e-05,
"loss": 0.321,
"step": 35
},
{
"epoch": 0.005117543579082041,
"grad_norm": 1.596530556678772,
"learning_rate": 9.981987442712633e-05,
"loss": 0.2499,
"step": 40
},
{
"epoch": 0.005757236526467296,
"grad_norm": 1.2020143270492554,
"learning_rate": 9.959502176294383e-05,
"loss": 0.2349,
"step": 45
},
{
"epoch": 0.006396929473852551,
"grad_norm": 1.5480122566223145,
"learning_rate": 9.928079551738543e-05,
"loss": 0.2652,
"step": 50
},
{
"epoch": 0.007036622421237806,
"grad_norm": 1.2814526557922363,
"learning_rate": 9.887776194738432e-05,
"loss": 0.1417,
"step": 55
},
{
"epoch": 0.007676315368623061,
"grad_norm": 0.6458909511566162,
"learning_rate": 9.838664734667495e-05,
"loss": 0.1702,
"step": 60
},
{
"epoch": 0.008316008316008316,
"grad_norm": 0.6655418276786804,
"learning_rate": 9.780833673696254e-05,
"loss": 0.1657,
"step": 65
},
{
"epoch": 0.00895570126339357,
"grad_norm": 0.7218236327171326,
"learning_rate": 9.714387227305422e-05,
"loss": 0.1586,
"step": 70
},
{
"epoch": 0.009595394210778827,
"grad_norm": 0.7546175718307495,
"learning_rate": 9.639445136482548e-05,
"loss": 0.1706,
"step": 75
},
{
"epoch": 0.010235087158164081,
"grad_norm": 0.8237160444259644,
"learning_rate": 9.55614245194068e-05,
"loss": 0.1836,
"step": 80
},
{
"epoch": 0.010874780105549335,
"grad_norm": 0.7809853553771973,
"learning_rate": 9.464629290747842e-05,
"loss": 0.1837,
"step": 85
},
{
"epoch": 0.011514473052934592,
"grad_norm": 0.6915144920349121,
"learning_rate": 9.365070565805941e-05,
"loss": 0.1868,
"step": 90
},
{
"epoch": 0.012154166000319846,
"grad_norm": 0.8375842571258545,
"learning_rate": 9.257645688666556e-05,
"loss": 0.2455,
"step": 95
},
{
"epoch": 0.012793858947705102,
"grad_norm": 0.9935116767883301,
"learning_rate": 9.142548246219212e-05,
"loss": 0.2471,
"step": 100
},
{
"epoch": 0.012793858947705102,
"eval_loss": 0.18092785775661469,
"eval_runtime": 244.1307,
"eval_samples_per_second": 13.485,
"eval_steps_per_second": 6.742,
"step": 100
},
{
"epoch": 0.013433551895090356,
"grad_norm": 0.42624446749687195,
"learning_rate": 9.019985651834703e-05,
"loss": 0.1207,
"step": 105
},
{
"epoch": 0.014073244842475612,
"grad_norm": 0.3908662796020508,
"learning_rate": 8.890178771592199e-05,
"loss": 0.1382,
"step": 110
},
{
"epoch": 0.014712937789860867,
"grad_norm": 0.524199366569519,
"learning_rate": 8.753361526263621e-05,
"loss": 0.1309,
"step": 115
},
{
"epoch": 0.015352630737246123,
"grad_norm": 0.5209535360336304,
"learning_rate": 8.609780469772623e-05,
"loss": 0.1512,
"step": 120
},
{
"epoch": 0.01599232368463138,
"grad_norm": 0.4131312370300293,
"learning_rate": 8.459694344887732e-05,
"loss": 0.1443,
"step": 125
},
{
"epoch": 0.016632016632016633,
"grad_norm": 0.5860401391983032,
"learning_rate": 8.303373616950408e-05,
"loss": 0.1718,
"step": 130
},
{
"epoch": 0.017271709579401887,
"grad_norm": 0.5590812563896179,
"learning_rate": 8.141099986478212e-05,
"loss": 0.1659,
"step": 135
},
{
"epoch": 0.01791140252678714,
"grad_norm": 0.4526077210903168,
"learning_rate": 7.973165881521434e-05,
"loss": 0.1862,
"step": 140
},
{
"epoch": 0.018551095474172396,
"grad_norm": 0.5904171466827393,
"learning_rate": 7.799873930687978e-05,
"loss": 0.2245,
"step": 145
},
{
"epoch": 0.019190788421557654,
"grad_norm": 0.8725635409355164,
"learning_rate": 7.621536417786159e-05,
"loss": 0.2688,
"step": 150
},
{
"epoch": 0.019830481368942908,
"grad_norm": 0.32253244519233704,
"learning_rate": 7.438474719068173e-05,
"loss": 0.1232,
"step": 155
},
{
"epoch": 0.020470174316328162,
"grad_norm": 0.3636815845966339,
"learning_rate": 7.251018724088367e-05,
"loss": 0.1412,
"step": 160
},
{
"epoch": 0.021109867263713417,
"grad_norm": 0.42371389269828796,
"learning_rate": 7.059506241219965e-05,
"loss": 0.1479,
"step": 165
},
{
"epoch": 0.02174956021109867,
"grad_norm": 0.28215959668159485,
"learning_rate": 6.864282388901544e-05,
"loss": 0.1513,
"step": 170
},
{
"epoch": 0.02238925315848393,
"grad_norm": 0.6038751006126404,
"learning_rate": 6.665698973710288e-05,
"loss": 0.1708,
"step": 175
},
{
"epoch": 0.023028946105869183,
"grad_norm": 0.3659113943576813,
"learning_rate": 6.464113856382752e-05,
"loss": 0.1717,
"step": 180
},
{
"epoch": 0.023668639053254437,
"grad_norm": 0.4708649218082428,
"learning_rate": 6.259890306925627e-05,
"loss": 0.1729,
"step": 185
},
{
"epoch": 0.02430833200063969,
"grad_norm": 0.9622364640235901,
"learning_rate": 6.0533963499786314e-05,
"loss": 0.2008,
"step": 190
},
{
"epoch": 0.02494802494802495,
"grad_norm": 0.5879449844360352,
"learning_rate": 5.8450041016092464e-05,
"loss": 0.2236,
"step": 195
},
{
"epoch": 0.025587717895410204,
"grad_norm": 0.7524951696395874,
"learning_rate": 5.6350890987343944e-05,
"loss": 0.2424,
"step": 200
},
{
"epoch": 0.025587717895410204,
"eval_loss": 0.1732141375541687,
"eval_runtime": 244.2514,
"eval_samples_per_second": 13.478,
"eval_steps_per_second": 6.739,
"step": 200
},
{
"epoch": 0.026227410842795458,
"grad_norm": 0.2855418026447296,
"learning_rate": 5.4240296223775465e-05,
"loss": 0.0997,
"step": 205
},
{
"epoch": 0.026867103790180712,
"grad_norm": 0.3188900947570801,
"learning_rate": 5.212206015980742e-05,
"loss": 0.1139,
"step": 210
},
{
"epoch": 0.027506796737565967,
"grad_norm": 0.38831430673599243,
"learning_rate": 5e-05,
"loss": 0.136,
"step": 215
},
{
"epoch": 0.028146489684951224,
"grad_norm": 0.39263615012168884,
"learning_rate": 4.78779398401926e-05,
"loss": 0.1322,
"step": 220
},
{
"epoch": 0.02878618263233648,
"grad_norm": 0.6268131136894226,
"learning_rate": 4.575970377622456e-05,
"loss": 0.1702,
"step": 225
},
{
"epoch": 0.029425875579721733,
"grad_norm": 0.3940875828266144,
"learning_rate": 4.364910901265606e-05,
"loss": 0.175,
"step": 230
},
{
"epoch": 0.030065568527106987,
"grad_norm": 0.43840473890304565,
"learning_rate": 4.1549958983907555e-05,
"loss": 0.1645,
"step": 235
},
{
"epoch": 0.030705261474492245,
"grad_norm": 0.5527442693710327,
"learning_rate": 3.94660365002137e-05,
"loss": 0.1824,
"step": 240
},
{
"epoch": 0.0313449544218775,
"grad_norm": 0.5612766146659851,
"learning_rate": 3.740109693074375e-05,
"loss": 0.2346,
"step": 245
},
{
"epoch": 0.03198464736926276,
"grad_norm": 0.9959267973899841,
"learning_rate": 3.5358861436172485e-05,
"loss": 0.2655,
"step": 250
},
{
"epoch": 0.03262434031664801,
"grad_norm": 0.36881548166275024,
"learning_rate": 3.334301026289712e-05,
"loss": 0.1131,
"step": 255
},
{
"epoch": 0.033264033264033266,
"grad_norm": 0.31683406233787537,
"learning_rate": 3.135717611098458e-05,
"loss": 0.1326,
"step": 260
},
{
"epoch": 0.03390372621141852,
"grad_norm": 0.3013302683830261,
"learning_rate": 2.9404937587800375e-05,
"loss": 0.1368,
"step": 265
},
{
"epoch": 0.034543419158803775,
"grad_norm": 0.3135119676589966,
"learning_rate": 2.748981275911633e-05,
"loss": 0.1464,
"step": 270
},
{
"epoch": 0.03518311210618903,
"grad_norm": 0.40254464745521545,
"learning_rate": 2.5615252809318284e-05,
"loss": 0.1499,
"step": 275
},
{
"epoch": 0.03582280505357428,
"grad_norm": 0.28640156984329224,
"learning_rate": 2.3784635822138424e-05,
"loss": 0.1573,
"step": 280
},
{
"epoch": 0.03646249800095954,
"grad_norm": 0.322263240814209,
"learning_rate": 2.2001260693120233e-05,
"loss": 0.1658,
"step": 285
},
{
"epoch": 0.03710219094834479,
"grad_norm": 0.31009647250175476,
"learning_rate": 2.026834118478567e-05,
"loss": 0.1563,
"step": 290
},
{
"epoch": 0.03774188389573005,
"grad_norm": 0.5385209918022156,
"learning_rate": 1.858900013521788e-05,
"loss": 0.2028,
"step": 295
},
{
"epoch": 0.03838157684311531,
"grad_norm": 0.5858637094497681,
"learning_rate": 1.6966263830495936e-05,
"loss": 0.245,
"step": 300
},
{
"epoch": 0.03838157684311531,
"eval_loss": 0.16857710480690002,
"eval_runtime": 244.1346,
"eval_samples_per_second": 13.484,
"eval_steps_per_second": 6.742,
"step": 300
},
{
"epoch": 0.03902126979050056,
"grad_norm": 0.20234771072864532,
"learning_rate": 1.5403056551122697e-05,
"loss": 0.1184,
"step": 305
},
{
"epoch": 0.039660962737885816,
"grad_norm": 0.265678346157074,
"learning_rate": 1.3902195302273779e-05,
"loss": 0.123,
"step": 310
},
{
"epoch": 0.04030065568527107,
"grad_norm": 0.4162317216396332,
"learning_rate": 1.246638473736378e-05,
"loss": 0.1302,
"step": 315
},
{
"epoch": 0.040940348632656325,
"grad_norm": 0.3373229503631592,
"learning_rate": 1.1098212284078036e-05,
"loss": 0.1346,
"step": 320
},
{
"epoch": 0.04158004158004158,
"grad_norm": 0.3058888018131256,
"learning_rate": 9.800143481652979e-06,
"loss": 0.1455,
"step": 325
},
{
"epoch": 0.04221973452742683,
"grad_norm": 0.31154316663742065,
"learning_rate": 8.574517537807897e-06,
"loss": 0.1691,
"step": 330
},
{
"epoch": 0.04285942747481209,
"grad_norm": 0.47663167119026184,
"learning_rate": 7.423543113334436e-06,
"loss": 0.1658,
"step": 335
},
{
"epoch": 0.04349912042219734,
"grad_norm": 0.4618123769760132,
"learning_rate": 6.349294341940593e-06,
"loss": 0.1846,
"step": 340
},
{
"epoch": 0.0441388133695826,
"grad_norm": 0.37711745500564575,
"learning_rate": 5.353707092521582e-06,
"loss": 0.1828,
"step": 345
},
{
"epoch": 0.04477850631696786,
"grad_norm": 0.541806161403656,
"learning_rate": 4.43857548059321e-06,
"loss": 0.2395,
"step": 350
},
{
"epoch": 0.04541819926435311,
"grad_norm": 0.2688872218132019,
"learning_rate": 3.605548635174533e-06,
"loss": 0.1082,
"step": 355
},
{
"epoch": 0.046057892211738366,
"grad_norm": 0.26909637451171875,
"learning_rate": 2.85612772694579e-06,
"loss": 0.1264,
"step": 360
},
{
"epoch": 0.046697585159123624,
"grad_norm": 0.3528990149497986,
"learning_rate": 2.191663263037458e-06,
"loss": 0.1372,
"step": 365
},
{
"epoch": 0.047337278106508875,
"grad_norm": 0.3015151619911194,
"learning_rate": 1.6133526533250565e-06,
"loss": 0.1726,
"step": 370
},
{
"epoch": 0.04797697105389413,
"grad_norm": 0.288101464509964,
"learning_rate": 1.1222380526156928e-06,
"loss": 0.1504,
"step": 375
},
{
"epoch": 0.04861666400127938,
"grad_norm": 0.45127633213996887,
"learning_rate": 7.192044826145771e-07,
"loss": 0.1526,
"step": 380
},
{
"epoch": 0.04925635694866464,
"grad_norm": 0.3559752404689789,
"learning_rate": 4.049782370561583e-07,
"loss": 0.1684,
"step": 385
},
{
"epoch": 0.0498960498960499,
"grad_norm": 0.41711530089378357,
"learning_rate": 1.8012557287367392e-07,
"loss": 0.1758,
"step": 390
},
{
"epoch": 0.05053574284343515,
"grad_norm": 0.4933544397354126,
"learning_rate": 4.5051689765929214e-08,
"loss": 0.196,
"step": 395
},
{
"epoch": 0.05117543579082041,
"grad_norm": 1.311772346496582,
"learning_rate": 0.0,
"loss": 0.2441,
"step": 400
},
{
"epoch": 0.05117543579082041,
"eval_loss": 0.16797442734241486,
"eval_runtime": 244.0328,
"eval_samples_per_second": 13.49,
"eval_steps_per_second": 6.745,
"step": 400
}
],
"logging_steps": 5,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.394108836872192e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
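
For context, the JSON above has the layout of a Hugging Face Transformers trainer_state.json: "log_history" interleaves training entries (keyed by "loss", logged every "logging_steps" steps) with evaluation entries (keyed by "eval_loss", logged every "eval_steps" steps). A minimal sketch, assuming the file is saved locally under the hypothetical name trainer_state.json, of pulling both curves out with only the standard library:

import json

# Assumption: this is a Transformers trainer_state.json saved with a checkpoint.
with open("trainer_state.json") as f:  # hypothetical local path
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"final train loss @ step {train_curve[-1][0]}: {train_curve[-1][1]:.4f}")
print(f"final eval loss  @ step {eval_curve[-1][0]}: {eval_curve[-1][1]:.4f}")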