{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0119047619047619,
"eval_steps": 17,
"global_step": 85,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011904761904761904,
"grad_norm": 0.9544092626224188,
"learning_rate": 2e-05,
"loss": 2.5697,
"step": 1
},
{
"epoch": 0.011904761904761904,
"eval_loss": 2.4926321506500244,
"eval_runtime": 84.5031,
"eval_samples_per_second": 0.237,
"eval_steps_per_second": 0.118,
"step": 1
},
{
"epoch": 0.023809523809523808,
"grad_norm": 0.8699237107804378,
"learning_rate": 4e-05,
"loss": 2.5045,
"step": 2
},
{
"epoch": 0.03571428571428571,
"grad_norm": 0.8471542581095772,
"learning_rate": 6e-05,
"loss": 2.444,
"step": 3
},
{
"epoch": 0.047619047619047616,
"grad_norm": 0.6760804223199605,
"learning_rate": 8e-05,
"loss": 2.4135,
"step": 4
},
{
"epoch": 0.05952380952380952,
"grad_norm": 0.3645239609928067,
"learning_rate": 0.0001,
"loss": 2.5502,
"step": 5
},
{
"epoch": 0.07142857142857142,
"grad_norm": 1.0191533159070523,
"learning_rate": 9.996530663083255e-05,
"loss": 2.4001,
"step": 6
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.7536562391615925,
"learning_rate": 9.986128001799077e-05,
"loss": 2.4241,
"step": 7
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.47058800200722606,
"learning_rate": 9.96880805629717e-05,
"loss": 2.3783,
"step": 8
},
{
"epoch": 0.10714285714285714,
"grad_norm": 0.42488376754431106,
"learning_rate": 9.94459753267812e-05,
"loss": 2.4496,
"step": 9
},
{
"epoch": 0.11904761904761904,
"grad_norm": 0.2964058549391389,
"learning_rate": 9.913533761814537e-05,
"loss": 2.4123,
"step": 10
},
{
"epoch": 0.13095238095238096,
"grad_norm": 0.274543387752409,
"learning_rate": 9.875664641789545e-05,
"loss": 2.3733,
"step": 11
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.4783689100368163,
"learning_rate": 9.831048564041413e-05,
"loss": 2.3535,
"step": 12
},
{
"epoch": 0.15476190476190477,
"grad_norm": 1.3750248629360309,
"learning_rate": 9.779754323328192e-05,
"loss": 2.2893,
"step": 13
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.3128166343225632,
"learning_rate": 9.72186101165118e-05,
"loss": 2.3495,
"step": 14
},
{
"epoch": 0.17857142857142858,
"grad_norm": 0.2978734846133746,
"learning_rate": 9.657457896300791e-05,
"loss": 2.3489,
"step": 15
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.3296693493448657,
"learning_rate": 9.586644282212866e-05,
"loss": 2.2308,
"step": 16
},
{
"epoch": 0.20238095238095238,
"grad_norm": 0.35894978658993554,
"learning_rate": 9.509529358847655e-05,
"loss": 2.2991,
"step": 17
},
{
"epoch": 0.20238095238095238,
"eval_loss": 2.3356270790100098,
"eval_runtime": 84.8559,
"eval_samples_per_second": 0.236,
"eval_steps_per_second": 0.118,
"step": 17
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.556925508907772,
"learning_rate": 9.426232031827588e-05,
"loss": 2.3245,
"step": 18
},
{
"epoch": 0.2261904761904762,
"grad_norm": 0.2596512913089154,
"learning_rate": 9.336880739593416e-05,
"loss": 2.2259,
"step": 19
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.23810127330848288,
"learning_rate": 9.241613255361455e-05,
"loss": 2.3589,
"step": 20
},
{
"epoch": 0.25,
"grad_norm": 0.2486358353351647,
"learning_rate": 9.140576474687264e-05,
"loss": 2.3491,
"step": 21
},
{
"epoch": 0.2619047619047619,
"grad_norm": 0.2569987072732923,
"learning_rate": 9.033926188963352e-05,
"loss": 2.3966,
"step": 22
},
{
"epoch": 0.27380952380952384,
"grad_norm": 0.2818819247779108,
"learning_rate": 8.921826845200139e-05,
"loss": 2.319,
"step": 23
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.3051353561467919,
"learning_rate": 8.804451292460585e-05,
"loss": 2.3764,
"step": 24
},
{
"epoch": 0.2976190476190476,
"grad_norm": 0.28378754273403445,
"learning_rate": 8.681980515339464e-05,
"loss": 2.2516,
"step": 25
},
{
"epoch": 0.30952380952380953,
"grad_norm": 0.23671187368883853,
"learning_rate": 8.554603354898238e-05,
"loss": 2.213,
"step": 26
},
{
"epoch": 0.32142857142857145,
"grad_norm": 0.2366234206615884,
"learning_rate": 8.422516217485826e-05,
"loss": 2.2728,
"step": 27
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.29302002288238443,
"learning_rate": 8.285922771894254e-05,
"loss": 2.3657,
"step": 28
},
{
"epoch": 0.34523809523809523,
"grad_norm": 0.27328082985574287,
"learning_rate": 8.14503363531613e-05,
"loss": 2.293,
"step": 29
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.2629290179604872,
"learning_rate": 8.000066048588211e-05,
"loss": 2.3183,
"step": 30
},
{
"epoch": 0.36904761904761907,
"grad_norm": 0.2769635403979301,
"learning_rate": 7.85124354122177e-05,
"loss": 2.2946,
"step": 31
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.2322670372127043,
"learning_rate": 7.698795586736298e-05,
"loss": 2.1595,
"step": 32
},
{
"epoch": 0.39285714285714285,
"grad_norm": 0.2733558129218707,
"learning_rate": 7.542957248827961e-05,
"loss": 2.3469,
"step": 33
},
{
"epoch": 0.40476190476190477,
"grad_norm": 0.24834436859405049,
"learning_rate": 7.383968818918426e-05,
"loss": 2.199,
"step": 34
},
{
"epoch": 0.40476190476190477,
"eval_loss": 2.299875259399414,
"eval_runtime": 85.3272,
"eval_samples_per_second": 0.234,
"eval_steps_per_second": 0.117,
"step": 34
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.27121410447461325,
"learning_rate": 7.222075445642904e-05,
"loss": 2.1842,
"step": 35
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.31502528289581183,
"learning_rate": 7.057526756848719e-05,
"loss": 2.2298,
"step": 36
},
{
"epoch": 0.44047619047619047,
"grad_norm": 0.24007323900338268,
"learning_rate": 6.890576474687263e-05,
"loss": 2.4688,
"step": 37
},
{
"epoch": 0.4523809523809524,
"grad_norm": 0.2608935989516304,
"learning_rate": 6.721482024392835e-05,
"loss": 2.2838,
"step": 38
},
{
"epoch": 0.4642857142857143,
"grad_norm": 0.29766343141422075,
"learning_rate": 6.550504137351576e-05,
"loss": 2.1777,
"step": 39
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.29532812127434044,
"learning_rate": 6.377906449072578e-05,
"loss": 2.2005,
"step": 40
},
{
"epoch": 0.4880952380952381,
"grad_norm": 0.25346420479015375,
"learning_rate": 6.203955092681039e-05,
"loss": 2.2493,
"step": 41
},
{
"epoch": 0.5,
"grad_norm": 0.2733242470847782,
"learning_rate": 6.0289182885602704e-05,
"loss": 2.264,
"step": 42
},
{
"epoch": 0.5119047619047619,
"grad_norm": 0.2832250530342488,
"learning_rate": 5.8530659307753036e-05,
"loss": 2.3321,
"step": 43
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.23261141435639465,
"learning_rate": 5.6766691709158096e-05,
"loss": 2.2584,
"step": 44
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.24734669584168956,
"learning_rate": 5.500000000000001e-05,
"loss": 2.3196,
"step": 45
},
{
"epoch": 0.5476190476190477,
"grad_norm": 0.3610266226821954,
"learning_rate": 5.3233308290841935e-05,
"loss": 2.2857,
"step": 46
},
{
"epoch": 0.5595238095238095,
"grad_norm": 0.33827980816046865,
"learning_rate": 5.1469340692246995e-05,
"loss": 2.099,
"step": 47
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.25979841286218003,
"learning_rate": 4.9710817114397314e-05,
"loss": 2.2922,
"step": 48
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.2966248351989265,
"learning_rate": 4.7960449073189606e-05,
"loss": 2.3001,
"step": 49
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.2649319822020958,
"learning_rate": 4.6220935509274235e-05,
"loss": 2.0389,
"step": 50
},
{
"epoch": 0.6071428571428571,
"grad_norm": 0.3040829342691192,
"learning_rate": 4.4494958626484276e-05,
"loss": 2.3336,
"step": 51
},
{
"epoch": 0.6071428571428571,
"eval_loss": 2.2863776683807373,
"eval_runtime": 85.1891,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.117,
"step": 51
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.25961905620657355,
"learning_rate": 4.278517975607167e-05,
"loss": 2.257,
"step": 52
},
{
"epoch": 0.6309523809523809,
"grad_norm": 0.2520680871081291,
"learning_rate": 4.109423525312738e-05,
"loss": 2.3646,
"step": 53
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.2563922206690732,
"learning_rate": 3.942473243151281e-05,
"loss": 2.1605,
"step": 54
},
{
"epoch": 0.6547619047619048,
"grad_norm": 0.23710847982590777,
"learning_rate": 3.777924554357096e-05,
"loss": 2.2253,
"step": 55
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2358700389619207,
"learning_rate": 3.616031181081575e-05,
"loss": 2.2685,
"step": 56
},
{
"epoch": 0.6785714285714286,
"grad_norm": 0.25688706639252046,
"learning_rate": 3.45704275117204e-05,
"loss": 2.306,
"step": 57
},
{
"epoch": 0.6904761904761905,
"grad_norm": 0.28553196847033946,
"learning_rate": 3.301204413263704e-05,
"loss": 2.394,
"step": 58
},
{
"epoch": 0.7023809523809523,
"grad_norm": 0.24843175796925013,
"learning_rate": 3.1487564587782306e-05,
"loss": 2.2661,
"step": 59
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.24662937375083732,
"learning_rate": 2.9999339514117912e-05,
"loss": 2.3021,
"step": 60
},
{
"epoch": 0.7261904761904762,
"grad_norm": 0.3580921716501969,
"learning_rate": 2.854966364683872e-05,
"loss": 2.2545,
"step": 61
},
{
"epoch": 0.7380952380952381,
"grad_norm": 0.2601716169771101,
"learning_rate": 2.7140772281057468e-05,
"loss": 2.4256,
"step": 62
},
{
"epoch": 0.75,
"grad_norm": 0.2729633905977987,
"learning_rate": 2.577483782514174e-05,
"loss": 2.4064,
"step": 63
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.35811707072589155,
"learning_rate": 2.445396645101762e-05,
"loss": 2.3352,
"step": 64
},
{
"epoch": 0.7738095238095238,
"grad_norm": 0.27727284509496897,
"learning_rate": 2.3180194846605367e-05,
"loss": 2.5192,
"step": 65
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.2803742538067055,
"learning_rate": 2.195548707539416e-05,
"loss": 2.3278,
"step": 66
},
{
"epoch": 0.7976190476190477,
"grad_norm": 0.25468361529041794,
"learning_rate": 2.0781731547998614e-05,
"loss": 2.2444,
"step": 67
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.4651703351900801,
"learning_rate": 1.966073811036649e-05,
"loss": 2.1637,
"step": 68
},
{
"epoch": 0.8095238095238095,
"eval_loss": 2.2794995307922363,
"eval_runtime": 84.9665,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.118,
"step": 68
},
{
"epoch": 0.8214285714285714,
"grad_norm": 0.2419165423914357,
"learning_rate": 1.8594235253127375e-05,
"loss": 2.1522,
"step": 69
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.5579840019563456,
"learning_rate": 1.758386744638546e-05,
"loss": 2.3066,
"step": 70
},
{
"epoch": 0.8452380952380952,
"grad_norm": 0.33911931890037594,
"learning_rate": 1.6631192604065855e-05,
"loss": 2.1133,
"step": 71
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.23476812251683926,
"learning_rate": 1.573767968172413e-05,
"loss": 2.368,
"step": 72
},
{
"epoch": 0.8690476190476191,
"grad_norm": 0.2783320668382118,
"learning_rate": 1.490470641152345e-05,
"loss": 2.2878,
"step": 73
},
{
"epoch": 0.8809523809523809,
"grad_norm": 0.28487408965300465,
"learning_rate": 1.413355717787134e-05,
"loss": 2.1781,
"step": 74
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.2342499729484135,
"learning_rate": 1.3425421036992098e-05,
"loss": 2.2248,
"step": 75
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.3276211876914205,
"learning_rate": 1.2781389883488218e-05,
"loss": 2.0437,
"step": 76
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.23498872906974066,
"learning_rate": 1.2202456766718093e-05,
"loss": 2.1563,
"step": 77
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.24216692635411755,
"learning_rate": 1.168951435958588e-05,
"loss": 2.3686,
"step": 78
},
{
"epoch": 0.9404761904761905,
"grad_norm": 0.24267162761351987,
"learning_rate": 1.1243353582104556e-05,
"loss": 2.1837,
"step": 79
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.25968272691520106,
"learning_rate": 1.0864662381854632e-05,
"loss": 2.2079,
"step": 80
},
{
"epoch": 0.9642857142857143,
"grad_norm": 0.2437768972745406,
"learning_rate": 1.0554024673218807e-05,
"loss": 2.1451,
"step": 81
},
{
"epoch": 0.9761904761904762,
"grad_norm": 0.315983035460693,
"learning_rate": 1.0311919437028318e-05,
"loss": 2.3015,
"step": 82
},
{
"epoch": 0.9880952380952381,
"grad_norm": 0.2704189055909359,
"learning_rate": 1.0138719982009242e-05,
"loss": 2.2505,
"step": 83
},
{
"epoch": 1.0,
"grad_norm": 0.2530162105544282,
"learning_rate": 1.003469336916747e-05,
"loss": 2.3508,
"step": 84
},
{
"epoch": 1.0119047619047619,
"grad_norm": 0.2513361659364113,
"learning_rate": 1e-05,
"loss": 2.2057,
"step": 85
},
{
"epoch": 1.0119047619047619,
"eval_loss": 2.2760229110717773,
"eval_runtime": 85.0943,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.118,
"step": 85
}
],
"logging_steps": 1,
"max_steps": 85,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 50739334348800.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}