{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0119047619047619,
  "eval_steps": 17,
  "global_step": 85,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011904761904761904,
      "grad_norm": 0.9544092626224188,
      "learning_rate": 2e-05,
      "loss": 2.5697,
      "step": 1
    },
    {
      "epoch": 0.011904761904761904,
      "eval_loss": 2.4926321506500244,
      "eval_runtime": 84.5031,
      "eval_samples_per_second": 0.237,
      "eval_steps_per_second": 0.118,
      "step": 1
    },
    {
      "epoch": 0.023809523809523808,
      "grad_norm": 0.8699237107804378,
      "learning_rate": 4e-05,
      "loss": 2.5045,
      "step": 2
    },
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 0.8471542581095772,
      "learning_rate": 6e-05,
      "loss": 2.444,
      "step": 3
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 0.6760804223199605,
      "learning_rate": 8e-05,
      "loss": 2.4135,
      "step": 4
    },
    {
      "epoch": 0.05952380952380952,
      "grad_norm": 0.3645239609928067,
      "learning_rate": 0.0001,
      "loss": 2.5502,
      "step": 5
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 1.0191533159070523,
      "learning_rate": 9.996530663083255e-05,
      "loss": 2.4001,
      "step": 6
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 0.7536562391615925,
      "learning_rate": 9.986128001799077e-05,
      "loss": 2.4241,
      "step": 7
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 0.47058800200722606,
      "learning_rate": 9.96880805629717e-05,
      "loss": 2.3783,
      "step": 8
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 0.42488376754431106,
      "learning_rate": 9.94459753267812e-05,
      "loss": 2.4496,
      "step": 9
    },
    {
      "epoch": 0.11904761904761904,
      "grad_norm": 0.2964058549391389,
      "learning_rate": 9.913533761814537e-05,
      "loss": 2.4123,
      "step": 10
    },
    {
      "epoch": 0.13095238095238096,
      "grad_norm": 0.274543387752409,
      "learning_rate": 9.875664641789545e-05,
      "loss": 2.3733,
      "step": 11
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.4783689100368163,
      "learning_rate": 9.831048564041413e-05,
      "loss": 2.3535,
      "step": 12
    },
    {
      "epoch": 0.15476190476190477,
      "grad_norm": 1.3750248629360309,
      "learning_rate": 9.779754323328192e-05,
      "loss": 2.2893,
      "step": 13
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.3128166343225632,
      "learning_rate": 9.72186101165118e-05,
      "loss": 2.3495,
      "step": 14
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.2978734846133746,
      "learning_rate": 9.657457896300791e-05,
      "loss": 2.3489,
      "step": 15
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.3296693493448657,
      "learning_rate": 9.586644282212866e-05,
      "loss": 2.2308,
      "step": 16
    },
    {
      "epoch": 0.20238095238095238,
      "grad_norm": 0.35894978658993554,
      "learning_rate": 9.509529358847655e-05,
      "loss": 2.2991,
      "step": 17
    },
    {
      "epoch": 0.20238095238095238,
      "eval_loss": 2.3356270790100098,
      "eval_runtime": 84.8559,
      "eval_samples_per_second": 0.236,
      "eval_steps_per_second": 0.118,
      "step": 17
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 0.556925508907772,
      "learning_rate": 9.426232031827588e-05,
      "loss": 2.3245,
      "step": 18
    },
    {
      "epoch": 0.2261904761904762,
      "grad_norm": 0.2596512913089154,
      "learning_rate": 9.336880739593416e-05,
      "loss": 2.2259,
      "step": 19
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 0.23810127330848288,
      "learning_rate": 9.241613255361455e-05,
      "loss": 2.3589,
      "step": 20
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.2486358353351647,
      "learning_rate": 9.140576474687264e-05,
      "loss": 2.3491,
      "step": 21
    },
    {
      "epoch": 0.2619047619047619,
      "grad_norm": 0.2569987072732923,
      "learning_rate": 9.033926188963352e-05,
      "loss": 2.3966,
      "step": 22
    },
    {
      "epoch": 0.27380952380952384,
      "grad_norm": 0.2818819247779108,
      "learning_rate": 8.921826845200139e-05,
      "loss": 2.319,
      "step": 23
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.3051353561467919,
      "learning_rate": 8.804451292460585e-05,
      "loss": 2.3764,
      "step": 24
    },
    {
      "epoch": 0.2976190476190476,
      "grad_norm": 0.28378754273403445,
      "learning_rate": 8.681980515339464e-05,
      "loss": 2.2516,
      "step": 25
    },
    {
      "epoch": 0.30952380952380953,
      "grad_norm": 0.23671187368883853,
      "learning_rate": 8.554603354898238e-05,
      "loss": 2.213,
      "step": 26
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 0.2366234206615884,
      "learning_rate": 8.422516217485826e-05,
      "loss": 2.2728,
      "step": 27
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.29302002288238443,
      "learning_rate": 8.285922771894254e-05,
      "loss": 2.3657,
      "step": 28
    },
    {
      "epoch": 0.34523809523809523,
      "grad_norm": 0.27328082985574287,
      "learning_rate": 8.14503363531613e-05,
      "loss": 2.293,
      "step": 29
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.2629290179604872,
      "learning_rate": 8.000066048588211e-05,
      "loss": 2.3183,
      "step": 30
    },
    {
      "epoch": 0.36904761904761907,
      "grad_norm": 0.2769635403979301,
      "learning_rate": 7.85124354122177e-05,
      "loss": 2.2946,
      "step": 31
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.2322670372127043,
      "learning_rate": 7.698795586736298e-05,
      "loss": 2.1595,
      "step": 32
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 0.2733558129218707,
      "learning_rate": 7.542957248827961e-05,
      "loss": 2.3469,
      "step": 33
    },
    {
      "epoch": 0.40476190476190477,
      "grad_norm": 0.24834436859405049,
      "learning_rate": 7.383968818918426e-05,
      "loss": 2.199,
      "step": 34
    },
    {
      "epoch": 0.40476190476190477,
      "eval_loss": 2.299875259399414,
      "eval_runtime": 85.3272,
      "eval_samples_per_second": 0.234,
      "eval_steps_per_second": 0.117,
      "step": 34
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.27121410447461325,
      "learning_rate": 7.222075445642904e-05,
      "loss": 2.1842,
      "step": 35
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.31502528289581183,
      "learning_rate": 7.057526756848719e-05,
      "loss": 2.2298,
      "step": 36
    },
    {
      "epoch": 0.44047619047619047,
      "grad_norm": 0.24007323900338268,
      "learning_rate": 6.890576474687263e-05,
      "loss": 2.4688,
      "step": 37
    },
    {
      "epoch": 0.4523809523809524,
      "grad_norm": 0.2608935989516304,
      "learning_rate": 6.721482024392835e-05,
      "loss": 2.2838,
      "step": 38
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 0.29766343141422075,
      "learning_rate": 6.550504137351576e-05,
      "loss": 2.1777,
      "step": 39
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.29532812127434044,
      "learning_rate": 6.377906449072578e-05,
      "loss": 2.2005,
      "step": 40
    },
    {
      "epoch": 0.4880952380952381,
      "grad_norm": 0.25346420479015375,
      "learning_rate": 6.203955092681039e-05,
      "loss": 2.2493,
      "step": 41
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.2733242470847782,
      "learning_rate": 6.0289182885602704e-05,
      "loss": 2.264,
      "step": 42
    },
    {
      "epoch": 0.5119047619047619,
      "grad_norm": 0.2832250530342488,
      "learning_rate": 5.8530659307753036e-05,
      "loss": 2.3321,
      "step": 43
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.23261141435639465,
      "learning_rate": 5.6766691709158096e-05,
      "loss": 2.2584,
      "step": 44
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.24734669584168956,
      "learning_rate": 5.500000000000001e-05,
      "loss": 2.3196,
      "step": 45
    },
    {
      "epoch": 0.5476190476190477,
      "grad_norm": 0.3610266226821954,
      "learning_rate": 5.3233308290841935e-05,
      "loss": 2.2857,
      "step": 46
    },
    {
      "epoch": 0.5595238095238095,
      "grad_norm": 0.33827980816046865,
      "learning_rate": 5.1469340692246995e-05,
      "loss": 2.099,
      "step": 47
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.25979841286218003,
      "learning_rate": 4.9710817114397314e-05,
      "loss": 2.2922,
      "step": 48
    },
    {
      "epoch": 0.5833333333333334,
      "grad_norm": 0.2966248351989265,
      "learning_rate": 4.7960449073189606e-05,
      "loss": 2.3001,
      "step": 49
    },
    {
      "epoch": 0.5952380952380952,
      "grad_norm": 0.2649319822020958,
      "learning_rate": 4.6220935509274235e-05,
      "loss": 2.0389,
      "step": 50
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 0.3040829342691192,
      "learning_rate": 4.4494958626484276e-05,
      "loss": 2.3336,
      "step": 51
    },
    {
      "epoch": 0.6071428571428571,
      "eval_loss": 2.2863776683807373,
      "eval_runtime": 85.1891,
      "eval_samples_per_second": 0.235,
      "eval_steps_per_second": 0.117,
      "step": 51
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.25961905620657355,
      "learning_rate": 4.278517975607167e-05,
      "loss": 2.257,
      "step": 52
    },
    {
      "epoch": 0.6309523809523809,
      "grad_norm": 0.2520680871081291,
      "learning_rate": 4.109423525312738e-05,
      "loss": 2.3646,
      "step": 53
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.2563922206690732,
      "learning_rate": 3.942473243151281e-05,
      "loss": 2.1605,
      "step": 54
    },
    {
      "epoch": 0.6547619047619048,
      "grad_norm": 0.23710847982590777,
      "learning_rate": 3.777924554357096e-05,
      "loss": 2.2253,
      "step": 55
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.2358700389619207,
      "learning_rate": 3.616031181081575e-05,
      "loss": 2.2685,
      "step": 56
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 0.25688706639252046,
      "learning_rate": 3.45704275117204e-05,
      "loss": 2.306,
      "step": 57
    },
    {
      "epoch": 0.6904761904761905,
      "grad_norm": 0.28553196847033946,
      "learning_rate": 3.301204413263704e-05,
      "loss": 2.394,
      "step": 58
    },
    {
      "epoch": 0.7023809523809523,
      "grad_norm": 0.24843175796925013,
      "learning_rate": 3.1487564587782306e-05,
      "loss": 2.2661,
      "step": 59
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.24662937375083732,
      "learning_rate": 2.9999339514117912e-05,
      "loss": 2.3021,
      "step": 60
    },
    {
      "epoch": 0.7261904761904762,
      "grad_norm": 0.3580921716501969,
      "learning_rate": 2.854966364683872e-05,
      "loss": 2.2545,
      "step": 61
    },
    {
      "epoch": 0.7380952380952381,
      "grad_norm": 0.2601716169771101,
      "learning_rate": 2.7140772281057468e-05,
      "loss": 2.4256,
      "step": 62
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.2729633905977987,
      "learning_rate": 2.577483782514174e-05,
      "loss": 2.4064,
      "step": 63
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.35811707072589155,
      "learning_rate": 2.445396645101762e-05,
      "loss": 2.3352,
      "step": 64
    },
    {
      "epoch": 0.7738095238095238,
      "grad_norm": 0.27727284509496897,
      "learning_rate": 2.3180194846605367e-05,
      "loss": 2.5192,
      "step": 65
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.2803742538067055,
      "learning_rate": 2.195548707539416e-05,
      "loss": 2.3278,
      "step": 66
    },
    {
      "epoch": 0.7976190476190477,
      "grad_norm": 0.25468361529041794,
      "learning_rate": 2.0781731547998614e-05,
      "loss": 2.2444,
      "step": 67
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.4651703351900801,
      "learning_rate": 1.966073811036649e-05,
      "loss": 2.1637,
      "step": 68
    },
    {
      "epoch": 0.8095238095238095,
      "eval_loss": 2.2794995307922363,
      "eval_runtime": 84.9665,
      "eval_samples_per_second": 0.235,
      "eval_steps_per_second": 0.118,
      "step": 68
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 0.2419165423914357,
      "learning_rate": 1.8594235253127375e-05,
      "loss": 2.1522,
      "step": 69
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.5579840019563456,
      "learning_rate": 1.758386744638546e-05,
      "loss": 2.3066,
      "step": 70
    },
    {
      "epoch": 0.8452380952380952,
      "grad_norm": 0.33911931890037594,
      "learning_rate": 1.6631192604065855e-05,
      "loss": 2.1133,
      "step": 71
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.23476812251683926,
      "learning_rate": 1.573767968172413e-05,
      "loss": 2.368,
      "step": 72
    },
    {
      "epoch": 0.8690476190476191,
      "grad_norm": 0.2783320668382118,
      "learning_rate": 1.490470641152345e-05,
      "loss": 2.2878,
      "step": 73
    },
    {
      "epoch": 0.8809523809523809,
      "grad_norm": 0.28487408965300465,
      "learning_rate": 1.413355717787134e-05,
      "loss": 2.1781,
      "step": 74
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.2342499729484135,
      "learning_rate": 1.3425421036992098e-05,
      "loss": 2.2248,
      "step": 75
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.3276211876914205,
      "learning_rate": 1.2781389883488218e-05,
      "loss": 2.0437,
      "step": 76
    },
    {
      "epoch": 0.9166666666666666,
      "grad_norm": 0.23498872906974066,
      "learning_rate": 1.2202456766718093e-05,
      "loss": 2.1563,
      "step": 77
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.24216692635411755,
      "learning_rate": 1.168951435958588e-05,
      "loss": 2.3686,
      "step": 78
    },
    {
      "epoch": 0.9404761904761905,
      "grad_norm": 0.24267162761351987,
      "learning_rate": 1.1243353582104556e-05,
      "loss": 2.1837,
      "step": 79
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.25968272691520106,
      "learning_rate": 1.0864662381854632e-05,
      "loss": 2.2079,
      "step": 80
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 0.2437768972745406,
      "learning_rate": 1.0554024673218807e-05,
      "loss": 2.1451,
      "step": 81
    },
    {
      "epoch": 0.9761904761904762,
      "grad_norm": 0.315983035460693,
      "learning_rate": 1.0311919437028318e-05,
      "loss": 2.3015,
      "step": 82
    },
    {
      "epoch": 0.9880952380952381,
      "grad_norm": 0.2704189055909359,
      "learning_rate": 1.0138719982009242e-05,
      "loss": 2.2505,
      "step": 83
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.2530162105544282,
      "learning_rate": 1.003469336916747e-05,
      "loss": 2.3508,
      "step": 84
    },
    {
      "epoch": 1.0119047619047619,
      "grad_norm": 0.2513361659364113,
      "learning_rate": 1e-05,
      "loss": 2.2057,
      "step": 85
    },
    {
      "epoch": 1.0119047619047619,
      "eval_loss": 2.2760229110717773,
      "eval_runtime": 85.0943,
      "eval_samples_per_second": 0.235,
      "eval_steps_per_second": 0.118,
      "step": 85
    }
  ],
  "logging_steps": 1,
  "max_steps": 85,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 50739334348800.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}