{
  "best_metric": 6.436838150024414,
  "best_model_checkpoint": "./results/models/checkpoint-434265",
  "epoch": 17.0,
  "eval_steps": 500,
  "global_step": 434265,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019573302016050106,
      "grad_norm": 0.59765625,
      "learning_rate": 0.000999608533959679,
      "loss": 7.0053,
      "step": 500
    },
    {
      "epoch": 0.03914660403210021,
      "grad_norm": 0.625,
      "learning_rate": 0.000999217067919358,
      "loss": 6.8806,
      "step": 1000
    },
    {
      "epoch": 0.058719906048150326,
      "grad_norm": 0.79296875,
      "learning_rate": 0.0009988256018790371,
      "loss": 6.8512,
      "step": 1500
    },
    {
      "epoch": 0.07829320806420043,
      "grad_norm": 4.4375,
      "learning_rate": 0.000998434135838716,
      "loss": 6.8494,
      "step": 2000
    },
    {
      "epoch": 0.09786651008025053,
      "grad_norm": 1.3203125,
      "learning_rate": 0.000998042669798395,
      "loss": 6.8296,
      "step": 2500
    },
    {
      "epoch": 0.11743981209630065,
      "grad_norm": 1.7265625,
      "learning_rate": 0.000997651203758074,
      "loss": 6.8209,
      "step": 3000
    },
    {
      "epoch": 0.13701311411235076,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0009972597377177531,
      "loss": 6.8119,
      "step": 3500
    },
    {
      "epoch": 0.15658641612840085,
      "grad_norm": 0.95703125,
      "learning_rate": 0.000996868271677432,
      "loss": 6.8096,
      "step": 4000
    },
    {
      "epoch": 0.17615971814445097,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0009964768056371109,
      "loss": 6.7987,
      "step": 4500
    },
    {
      "epoch": 0.19573302016050106,
      "grad_norm": 0.94921875,
      "learning_rate": 0.00099608533959679,
      "loss": 6.7946,
      "step": 5000
    },
    {
      "epoch": 0.21530632217655118,
      "grad_norm": 2.34375,
      "learning_rate": 0.000995693873556469,
      "loss": 6.7825,
      "step": 5500
    },
    {
      "epoch": 0.2348796241926013,
      "grad_norm": 0.94140625,
      "learning_rate": 0.000995302407516148,
      "loss": 6.7724,
      "step": 6000
    },
    {
      "epoch": 0.2544529262086514,
      "grad_norm": 0.8671875,
      "learning_rate": 0.0009949109414758269,
      "loss": 6.7732,
      "step": 6500
    },
    {
      "epoch": 0.2740262282247015,
      "grad_norm": 1.3671875,
      "learning_rate": 0.000994519475435506,
      "loss": 6.7652,
      "step": 7000
    },
    {
      "epoch": 0.29359953024075164,
      "grad_norm": 1.046875,
      "learning_rate": 0.000994128009395185,
      "loss": 6.7589,
      "step": 7500
    },
    {
      "epoch": 0.3131728322568017,
      "grad_norm": 0.96484375,
      "learning_rate": 0.000993736543354864,
      "loss": 6.7583,
      "step": 8000
    },
    {
      "epoch": 0.3327461342728518,
      "grad_norm": 1.5234375,
      "learning_rate": 0.0009933450773145429,
      "loss": 6.7515,
      "step": 8500
    },
    {
      "epoch": 0.35231943628890194,
      "grad_norm": 2.234375,
      "learning_rate": 0.000992953611274222,
      "loss": 6.7467,
      "step": 9000
    },
    {
      "epoch": 0.37189273830495206,
      "grad_norm": 1.46875,
      "learning_rate": 0.000992562145233901,
      "loss": 6.7366,
      "step": 9500
    },
    {
      "epoch": 0.39146604032100213,
      "grad_norm": 1.1484375,
      "learning_rate": 0.00099217067919358,
      "loss": 6.7365,
      "step": 10000
    },
    {
      "epoch": 0.41103934233705225,
      "grad_norm": 1.4375,
      "learning_rate": 0.000991779213153259,
      "loss": 6.7253,
      "step": 10500
    },
    {
      "epoch": 0.43061264435310237,
      "grad_norm": 0.87109375,
      "learning_rate": 0.000991387747112938,
      "loss": 6.7356,
      "step": 11000
    },
    {
      "epoch": 0.4501859463691525,
      "grad_norm": 1.484375,
      "learning_rate": 0.000990996281072617,
      "loss": 6.7177,
      "step": 11500
    },
    {
      "epoch": 0.4697592483852026,
      "grad_norm": 3.15625,
      "learning_rate": 0.000990604815032296,
      "loss": 6.7195,
      "step": 12000
    },
    {
      "epoch": 0.4893325504012527,
      "grad_norm": 0.9140625,
      "learning_rate": 0.000990213348991975,
      "loss": 6.7202,
      "step": 12500
    },
    {
      "epoch": 0.5089058524173028,
      "grad_norm": 1.046875,
      "learning_rate": 0.000989821882951654,
      "loss": 6.7183,
      "step": 13000
    },
    {
      "epoch": 0.5284791544333529,
      "grad_norm": 0.91015625,
      "learning_rate": 0.000989430416911333,
      "loss": 6.7186,
      "step": 13500
    },
    {
      "epoch": 0.548052456449403,
      "grad_norm": 1.4765625,
      "learning_rate": 0.000989038950871012,
      "loss": 6.7221,
      "step": 14000
    },
    {
      "epoch": 0.5676257584654532,
      "grad_norm": 1.3515625,
      "learning_rate": 0.000988647484830691,
      "loss": 6.7046,
      "step": 14500
    },
    {
      "epoch": 0.5871990604815033,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00098825601879037,
      "loss": 6.7078,
      "step": 15000
    },
    {
      "epoch": 0.6067723624975533,
      "grad_norm": 1.109375,
      "learning_rate": 0.000987864552750049,
      "loss": 6.7063,
      "step": 15500
    },
    {
      "epoch": 0.6263456645136034,
      "grad_norm": 4.8125,
      "learning_rate": 0.000987473086709728,
      "loss": 6.6989,
      "step": 16000
    },
    {
      "epoch": 0.6459189665296535,
      "grad_norm": 1.3125,
      "learning_rate": 0.000987081620669407,
      "loss": 6.6951,
      "step": 16500
    },
    {
      "epoch": 0.6654922685457036,
      "grad_norm": 0.890625,
      "learning_rate": 0.000986690154629086,
      "loss": 6.6921,
      "step": 17000
    },
    {
      "epoch": 0.6850655705617538,
      "grad_norm": 1.7890625,
      "learning_rate": 0.0009862986885887648,
      "loss": 6.6921,
      "step": 17500
    },
    {
      "epoch": 0.7046388725778039,
      "grad_norm": 7.75,
      "learning_rate": 0.000985907222548444,
      "loss": 6.6884,
      "step": 18000
    },
    {
      "epoch": 0.724212174593854,
      "grad_norm": 1.203125,
      "learning_rate": 0.000985515756508123,
      "loss": 6.6841,
      "step": 18500
    },
    {
      "epoch": 0.7437854766099041,
      "grad_norm": 7.46875,
      "learning_rate": 0.000985124290467802,
      "loss": 6.6904,
      "step": 19000
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 1.6171875,
      "learning_rate": 0.0009847328244274808,
      "loss": 6.6847,
      "step": 19500
    },
    {
      "epoch": 0.7829320806420043,
      "grad_norm": 1.125,
      "learning_rate": 0.00098434135838716,
      "loss": 6.6799,
      "step": 20000
    },
    {
      "epoch": 0.8025053826580544,
      "grad_norm": 1.734375,
      "learning_rate": 0.000983949892346839,
      "loss": 6.6814,
      "step": 20500
    },
    {
      "epoch": 0.8220786846741045,
      "grad_norm": 1.4921875,
      "learning_rate": 0.000983558426306518,
      "loss": 6.6767,
      "step": 21000
    },
    {
      "epoch": 0.8416519866901546,
      "grad_norm": 4.5,
      "learning_rate": 0.0009831669602661968,
      "loss": 6.6714,
      "step": 21500
    },
    {
      "epoch": 0.8612252887062047,
      "grad_norm": 1.75,
      "learning_rate": 0.000982775494225876,
      "loss": 6.6725,
      "step": 22000
    },
    {
      "epoch": 0.8807985907222549,
      "grad_norm": 1.7265625,
      "learning_rate": 0.000982384028185555,
      "loss": 6.6646,
      "step": 22500
    },
    {
      "epoch": 0.900371892738305,
      "grad_norm": 2.25,
      "learning_rate": 0.000981992562145234,
      "loss": 6.6636,
      "step": 23000
    },
    {
      "epoch": 0.9199451947543551,
      "grad_norm": 2.203125,
      "learning_rate": 0.0009816010961049128,
      "loss": 6.6506,
      "step": 23500
    },
    {
      "epoch": 0.9395184967704052,
      "grad_norm": 2.96875,
      "learning_rate": 0.000981209630064592,
      "loss": 6.6546,
      "step": 24000
    },
    {
      "epoch": 0.9590917987864552,
      "grad_norm": 1.0546875,
      "learning_rate": 0.000980818164024271,
      "loss": 6.6504,
      "step": 24500
    },
    {
      "epoch": 0.9786651008025053,
      "grad_norm": 3.421875,
      "learning_rate": 0.0009804266979839499,
      "loss": 6.6499,
      "step": 25000
    },
    {
      "epoch": 0.9982384028185555,
      "grad_norm": 3.0625,
      "learning_rate": 0.0009800352319436288,
      "loss": 6.6422,
      "step": 25500
    },
    {
      "epoch": 1.0,
      "eval_loss": 6.643181800842285,
      "eval_runtime": 23.6647,
      "eval_samples_per_second": 84.514,
      "eval_steps_per_second": 5.282,
      "step": 25545
    },
    {
      "epoch": 1.0178117048346056,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0009796437659033079,
      "loss": 6.6453,
      "step": 26000
    },
    {
      "epoch": 1.0373850068506556,
      "grad_norm": 0.84375,
      "learning_rate": 0.000979252299862987,
      "loss": 6.6465,
      "step": 26500
    },
    {
      "epoch": 1.0569583088667058,
      "grad_norm": 2.421875,
      "learning_rate": 0.0009788608338226659,
      "loss": 6.6497,
      "step": 27000
    },
    {
      "epoch": 1.0765316108827558,
      "grad_norm": 1.3515625,
      "learning_rate": 0.0009784693677823448,
      "loss": 6.642,
      "step": 27500
    },
    {
      "epoch": 1.096104912898806,
      "grad_norm": 0.89453125,
      "learning_rate": 0.0009780779017420239,
      "loss": 6.6384,
      "step": 28000
    },
    {
      "epoch": 1.115678214914856,
      "grad_norm": 0.88671875,
      "learning_rate": 0.000977686435701703,
      "loss": 6.6389,
      "step": 28500
    },
    {
      "epoch": 1.1352515169309063,
      "grad_norm": 1.203125,
      "learning_rate": 0.0009772949696613819,
      "loss": 6.6361,
      "step": 29000
    },
    {
      "epoch": 1.1548248189469563,
      "grad_norm": 1.140625,
      "learning_rate": 0.0009769035036210608,
      "loss": 6.6393,
      "step": 29500
    },
    {
      "epoch": 1.1743981209630066,
      "grad_norm": 1.421875,
      "learning_rate": 0.00097651203758074,
      "loss": 6.6413,
      "step": 30000
    },
    {
      "epoch": 1.1939714229790566,
      "grad_norm": 1.421875,
      "learning_rate": 0.0009761205715404189,
      "loss": 6.6334,
      "step": 30500
    },
    {
      "epoch": 1.2135447249951068,
      "grad_norm": 1.390625,
      "learning_rate": 0.0009757291055000979,
      "loss": 6.6316,
      "step": 31000
    },
    {
      "epoch": 1.2331180270111568,
      "grad_norm": 2.015625,
      "learning_rate": 0.000975337639459777,
      "loss": 6.6257,
      "step": 31500
    },
    {
      "epoch": 1.2526913290272068,
      "grad_norm": 2.0,
      "learning_rate": 0.000974946173419456,
      "loss": 6.6277,
      "step": 32000
    },
    {
      "epoch": 1.272264631043257,
      "grad_norm": 1.4765625,
      "learning_rate": 0.0009745547073791348,
      "loss": 6.6333,
      "step": 32500
    },
    {
      "epoch": 1.291837933059307,
      "grad_norm": 1.28125,
      "learning_rate": 0.0009741632413388138,
      "loss": 6.6241,
      "step": 33000
    },
    {
      "epoch": 1.3114112350753573,
      "grad_norm": 1.078125,
      "learning_rate": 0.000973771775298493,
      "loss": 6.6336,
      "step": 33500
    },
    {
      "epoch": 1.3309845370914073,
      "grad_norm": 3.359375,
      "learning_rate": 0.0009733803092581718,
      "loss": 6.6298,
      "step": 34000
    },
    {
      "epoch": 1.3505578391074575,
      "grad_norm": 1.5,
      "learning_rate": 0.0009729888432178508,
      "loss": 6.6248,
      "step": 34500
    },
    {
      "epoch": 1.3701311411235075,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0009725973771775298,
      "loss": 6.6291,
      "step": 35000
    },
    {
      "epoch": 1.3897044431395575,
      "grad_norm": 2.125,
      "learning_rate": 0.0009722059111372089,
      "loss": 6.6224,
      "step": 35500
    },
    {
      "epoch": 1.4092777451556078,
      "grad_norm": 1.375,
      "learning_rate": 0.0009718144450968878,
      "loss": 6.6194,
      "step": 36000
    },
    {
      "epoch": 1.4288510471716578,
      "grad_norm": 1.859375,
      "learning_rate": 0.0009714229790565668,
      "loss": 6.6127,
      "step": 36500
    },
    {
      "epoch": 1.448424349187708,
      "grad_norm": 1.234375,
      "learning_rate": 0.0009710315130162458,
      "loss": 6.6062,
      "step": 37000
    },
    {
      "epoch": 1.467997651203758,
      "grad_norm": 1.078125,
      "learning_rate": 0.0009706400469759249,
      "loss": 6.6188,
      "step": 37500
    },
    {
      "epoch": 1.4875709532198083,
      "grad_norm": 2.125,
      "learning_rate": 0.0009702485809356038,
      "loss": 6.6132,
      "step": 38000
    },
    {
      "epoch": 1.5071442552358583,
      "grad_norm": 3.34375,
      "learning_rate": 0.0009698571148952828,
      "loss": 6.6034,
      "step": 38500
    },
    {
      "epoch": 1.5267175572519083,
      "grad_norm": 2.359375,
      "learning_rate": 0.0009694656488549618,
      "loss": 6.6049,
      "step": 39000
    },
    {
      "epoch": 1.5462908592679585,
      "grad_norm": 3.65625,
      "learning_rate": 0.0009690741828146409,
      "loss": 6.6077,
      "step": 39500
    },
    {
      "epoch": 1.5658641612840087,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0009686827167743198,
      "loss": 6.6054,
      "step": 40000
    },
    {
      "epoch": 1.5854374633000587,
      "grad_norm": 3.140625,
      "learning_rate": 0.0009682912507339988,
      "loss": 6.6109,
      "step": 40500
    },
    {
      "epoch": 1.6050107653161088,
      "grad_norm": 1.21875,
      "learning_rate": 0.0009678997846936779,
      "loss": 6.6122,
      "step": 41000
    },
    {
      "epoch": 1.624584067332159,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0009675083186533569,
      "loss": 6.6049,
      "step": 41500
    },
    {
      "epoch": 1.644157369348209,
      "grad_norm": 2.5625,
      "learning_rate": 0.0009671168526130358,
      "loss": 6.6004,
      "step": 42000
    },
    {
      "epoch": 1.663730671364259,
      "grad_norm": 6.8125,
      "learning_rate": 0.0009667253865727148,
      "loss": 6.608,
      "step": 42500
    },
    {
      "epoch": 1.6833039733803092,
      "grad_norm": 1.34375,
      "learning_rate": 0.0009663339205323939,
      "loss": 6.5973,
      "step": 43000
    },
    {
      "epoch": 1.7028772753963595,
      "grad_norm": 2.328125,
      "learning_rate": 0.0009659424544920729,
      "loss": 6.5984,
      "step": 43500
    },
    {
      "epoch": 1.7224505774124095,
      "grad_norm": 2.90625,
      "learning_rate": 0.0009655509884517518,
      "loss": 6.6001,
      "step": 44000
    },
    {
      "epoch": 1.7420238794284595,
      "grad_norm": 1.1171875,
      "learning_rate": 0.0009651595224114308,
      "loss": 6.6016,
      "step": 44500
    },
    {
      "epoch": 1.7615971814445097,
      "grad_norm": 2.84375,
      "learning_rate": 0.0009647680563711099,
      "loss": 6.598,
      "step": 45000
    },
    {
      "epoch": 1.78117048346056,
      "grad_norm": 2.03125,
      "learning_rate": 0.0009643765903307889,
      "loss": 6.6052,
      "step": 45500
    },
    {
      "epoch": 1.80074378547661,
      "grad_norm": 1.421875,
      "learning_rate": 0.0009639851242904678,
      "loss": 6.5902,
      "step": 46000
    },
    {
      "epoch": 1.82031708749266,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0009635936582501468,
      "loss": 6.5847,
      "step": 46500
    },
    {
      "epoch": 1.8398903895087102,
      "grad_norm": 6.15625,
      "learning_rate": 0.0009632021922098259,
      "loss": 6.5948,
      "step": 47000
    },
    {
      "epoch": 1.8594636915247602,
      "grad_norm": 9.8125,
      "learning_rate": 0.0009628107261695049,
      "loss": 6.6071,
      "step": 47500
    },
    {
      "epoch": 1.8790369935408102,
      "grad_norm": 3.515625,
      "learning_rate": 0.0009624192601291838,
      "loss": 6.5973,
      "step": 48000
    },
    {
      "epoch": 1.8986102955568605,
      "grad_norm": 4.5625,
      "learning_rate": 0.0009620277940888628,
      "loss": 6.5917,
      "step": 48500
    },
    {
      "epoch": 1.9181835975729107,
      "grad_norm": 0.83984375,
      "learning_rate": 0.0009616363280485419,
      "loss": 6.5937,
      "step": 49000
    },
    {
      "epoch": 1.9377568995889607,
      "grad_norm": 2.984375,
      "learning_rate": 0.0009612448620082208,
      "loss": 6.5919,
      "step": 49500
    },
    {
      "epoch": 1.9573302016050107,
      "grad_norm": 1.90625,
      "learning_rate": 0.0009608533959678998,
      "loss": 6.5885,
      "step": 50000
    },
    {
      "epoch": 1.976903503621061,
      "grad_norm": 3.203125,
      "learning_rate": 0.0009604619299275788,
      "loss": 6.6004,
      "step": 50500
    },
    {
      "epoch": 1.996476805637111,
      "grad_norm": 7.4375,
      "learning_rate": 0.0009600704638872579,
      "loss": 6.5952,
      "step": 51000
    },
    {
      "epoch": 2.0,
      "eval_loss": 6.583548545837402,
      "eval_runtime": 20.5988,
      "eval_samples_per_second": 97.093,
      "eval_steps_per_second": 6.068,
      "step": 51090
    },
    {
      "epoch": 2.016050107653161,
      "grad_norm": 1.4296875,
      "learning_rate": 0.0009596789978469367,
      "loss": 6.5932,
      "step": 51500
    },
    {
      "epoch": 2.035623409669211,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0009592875318066157,
      "loss": 6.5895,
      "step": 52000
    },
    {
      "epoch": 2.0551967116852614,
      "grad_norm": 9.0625,
      "learning_rate": 0.0009588960657662949,
      "loss": 6.5814,
      "step": 52500
    },
    {
      "epoch": 2.074770013701311,
      "grad_norm": 5.78125,
      "learning_rate": 0.0009585045997259738,
      "loss": 6.5826,
      "step": 53000
    },
    {
      "epoch": 2.0943433157173614,
      "grad_norm": 1.5078125,
      "learning_rate": 0.0009581131336856527,
      "loss": 6.591,
      "step": 53500
    },
    {
      "epoch": 2.1139166177334117,
      "grad_norm": 3.921875,
      "learning_rate": 0.0009577216676453317,
      "loss": 6.5807,
      "step": 54000
    },
    {
      "epoch": 2.133489919749462,
      "grad_norm": 3.09375,
      "learning_rate": 0.0009573302016050108,
      "loss": 6.5793,
      "step": 54500
    },
    {
      "epoch": 2.1530632217655117,
      "grad_norm": 2.953125,
      "learning_rate": 0.0009569387355646898,
      "loss": 6.5854,
      "step": 55000
    },
    {
      "epoch": 2.172636523781562,
      "grad_norm": 5.53125,
      "learning_rate": 0.0009565472695243687,
      "loss": 6.5796,
      "step": 55500
    },
    {
      "epoch": 2.192209825797612,
      "grad_norm": 1.3359375,
      "learning_rate": 0.0009561558034840477,
      "loss": 6.5693,
      "step": 56000
    },
    {
      "epoch": 2.2117831278136624,
      "grad_norm": 2.21875,
      "learning_rate": 0.0009557643374437268,
      "loss": 6.5837,
      "step": 56500
    },
    {
      "epoch": 2.231356429829712,
      "grad_norm": 2.609375,
      "learning_rate": 0.0009553728714034058,
      "loss": 6.5791,
      "step": 57000
    },
    {
      "epoch": 2.2509297318457624,
      "grad_norm": 1.9765625,
      "learning_rate": 0.0009549814053630847,
      "loss": 6.5746,
      "step": 57500
    },
    {
      "epoch": 2.2705030338618126,
      "grad_norm": 1.6171875,
      "learning_rate": 0.0009545899393227637,
      "loss": 6.5766,
      "step": 58000
    },
    {
      "epoch": 2.2900763358778624,
      "grad_norm": 2.9375,
      "learning_rate": 0.0009541984732824428,
      "loss": 6.5832,
      "step": 58500
    },
    {
      "epoch": 2.3096496378939126,
      "grad_norm": 3.28125,
      "learning_rate": 0.0009538070072421218,
      "loss": 6.5732,
      "step": 59000
    },
    {
      "epoch": 2.329222939909963,
      "grad_norm": 1.0625,
      "learning_rate": 0.0009534155412018007,
      "loss": 6.5774,
      "step": 59500
    },
    {
      "epoch": 2.348796241926013,
      "grad_norm": 2.921875,
      "learning_rate": 0.0009530240751614797,
      "loss": 6.5692,
      "step": 60000
    },
    {
      "epoch": 2.368369543942063,
      "grad_norm": 1.8046875,
      "learning_rate": 0.0009526326091211588,
      "loss": 6.5789,
      "step": 60500
    },
    {
      "epoch": 2.387942845958113,
      "grad_norm": 1.4921875,
      "learning_rate": 0.0009522411430808378,
      "loss": 6.576,
      "step": 61000
    },
    {
      "epoch": 2.4075161479741634,
      "grad_norm": 8.6875,
      "learning_rate": 0.0009518496770405167,
      "loss": 6.5807,
      "step": 61500
    },
    {
      "epoch": 2.4270894499902136,
      "grad_norm": 2.71875,
      "learning_rate": 0.0009514582110001958,
      "loss": 6.5753,
      "step": 62000
    },
    {
      "epoch": 2.4466627520062634,
      "grad_norm": 1.9765625,
      "learning_rate": 0.0009510667449598748,
      "loss": 6.5799,
      "step": 62500
    },
    {
      "epoch": 2.4662360540223136,
      "grad_norm": 1.890625,
      "learning_rate": 0.0009506752789195538,
      "loss": 6.5678,
      "step": 63000
    },
    {
      "epoch": 2.485809356038364,
      "grad_norm": 1.421875,
      "learning_rate": 0.0009502838128792327,
      "loss": 6.5683,
      "step": 63500
    },
    {
      "epoch": 2.5053826580544136,
      "grad_norm": 3.03125,
      "learning_rate": 0.0009498923468389118,
      "loss": 6.5674,
      "step": 64000
    },
    {
      "epoch": 2.524955960070464,
      "grad_norm": 0.828125,
      "learning_rate": 0.0009495008807985908,
      "loss": 6.5606,
      "step": 64500
    },
    {
      "epoch": 2.544529262086514,
      "grad_norm": 0.8359375,
      "learning_rate": 0.0009491094147582697,
      "loss": 6.5669,
      "step": 65000
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0009487179487179487,
      "loss": 6.5519,
      "step": 65500
    },
    {
      "epoch": 2.583675866118614,
      "grad_norm": 1.3515625,
      "learning_rate": 0.0009483264826776278,
      "loss": 6.5573,
      "step": 66000
    },
    {
      "epoch": 2.6032491681346643,
      "grad_norm": 3.125,
      "learning_rate": 0.0009479350166373068,
      "loss": 6.5511,
      "step": 66500
    },
    {
      "epoch": 2.6228224701507146,
      "grad_norm": 1.4453125,
      "learning_rate": 0.0009475435505969857,
      "loss": 6.5593,
      "step": 67000
    },
    {
      "epoch": 2.642395772166765,
      "grad_norm": 7.375,
      "learning_rate": 0.0009471520845566647,
      "loss": 6.5606,
      "step": 67500
    },
    {
      "epoch": 2.6619690741828146,
      "grad_norm": 1.4765625,
      "learning_rate": 0.0009467606185163438,
      "loss": 6.5601,
      "step": 68000
    },
    {
      "epoch": 2.681542376198865,
      "grad_norm": 3.546875,
      "learning_rate": 0.0009463691524760228,
      "loss": 6.5607,
      "step": 68500
    },
    {
      "epoch": 2.701115678214915,
      "grad_norm": 3.046875,
      "learning_rate": 0.0009459776864357017,
      "loss": 6.5462,
      "step": 69000
    },
    {
      "epoch": 2.720688980230965,
      "grad_norm": 1.59375,
      "learning_rate": 0.0009455862203953807,
      "loss": 6.567,
      "step": 69500
    },
    {
      "epoch": 2.740262282247015,
      "grad_norm": 1.5625,
      "learning_rate": 0.0009451947543550598,
      "loss": 6.5566,
      "step": 70000
    },
    {
      "epoch": 2.7598355842630653,
      "grad_norm": 4.40625,
      "learning_rate": 0.0009448032883147388,
      "loss": 6.5543,
      "step": 70500
    },
    {
      "epoch": 2.779408886279115,
      "grad_norm": 2.640625,
      "learning_rate": 0.0009444118222744176,
      "loss": 6.5527,
      "step": 71000
    },
    {
      "epoch": 2.7989821882951653,
      "grad_norm": 1.09375,
      "learning_rate": 0.0009440203562340968,
      "loss": 6.557,
      "step": 71500
    },
    {
      "epoch": 2.8185554903112156,
      "grad_norm": 1.0234375,
      "learning_rate": 0.0009436288901937757,
      "loss": 6.5603,
      "step": 72000
    },
    {
      "epoch": 2.8381287923272653,
      "grad_norm": 4.46875,
      "learning_rate": 0.0009432374241534547,
      "loss": 6.5515,
      "step": 72500
    },
    {
      "epoch": 2.8577020943433156,
      "grad_norm": 3.796875,
      "learning_rate": 0.0009428459581131336,
      "loss": 6.5506,
      "step": 73000
    },
    {
      "epoch": 2.877275396359366,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0009424544920728127,
      "loss": 6.5583,
      "step": 73500
    },
    {
      "epoch": 2.896848698375416,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0009420630260324917,
      "loss": 6.5545,
      "step": 74000
    },
    {
      "epoch": 2.9164220003914663,
      "grad_norm": 8.6875,
      "learning_rate": 0.0009416715599921707,
      "loss": 6.5564,
      "step": 74500
    },
    {
      "epoch": 2.935995302407516,
      "grad_norm": 1.28125,
      "learning_rate": 0.0009412800939518496,
      "loss": 6.5502,
      "step": 75000
    },
    {
      "epoch": 2.9555686044235663,
      "grad_norm": 6.375,
      "learning_rate": 0.0009408886279115287,
      "loss": 6.5434,
      "step": 75500
    },
    {
      "epoch": 2.9751419064396165,
      "grad_norm": 1.8125,
      "learning_rate": 0.0009404971618712077,
      "loss": 6.5457,
      "step": 76000
    },
    {
      "epoch": 2.9947152084556663,
      "grad_norm": 2.34375,
      "learning_rate": 0.0009401056958308867,
      "loss": 6.5616,
      "step": 76500
    },
    {
      "epoch": 3.0,
      "eval_loss": 6.546030044555664,
      "eval_runtime": 20.7765,
      "eval_samples_per_second": 96.262,
      "eval_steps_per_second": 6.016,
      "step": 76635
    },
    {
      "epoch": 3.0142885104717165,
      "grad_norm": 2.34375,
      "learning_rate": 0.0009397142297905656,
      "loss": 6.545,
      "step": 77000
    },
    {
      "epoch": 3.0338618124877668,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0009393227637502447,
      "loss": 6.5504,
      "step": 77500
    },
    {
      "epoch": 3.053435114503817,
      "grad_norm": 1.453125,
      "learning_rate": 0.0009389312977099237,
      "loss": 6.5508,
      "step": 78000
    },
    {
      "epoch": 3.073008416519867,
      "grad_norm": 3.328125,
      "learning_rate": 0.0009385398316696027,
      "loss": 6.5495,
      "step": 78500
    },
    {
      "epoch": 3.092581718535917,
      "grad_norm": 1.9609375,
      "learning_rate": 0.0009381483656292816,
      "loss": 6.5518,
      "step": 79000
    },
    {
      "epoch": 3.1121550205519672,
      "grad_norm": 2.578125,
      "learning_rate": 0.0009377568995889607,
      "loss": 6.5535,
      "step": 79500
    },
    {
      "epoch": 3.131728322568017,
      "grad_norm": 1.65625,
      "learning_rate": 0.0009373654335486397,
      "loss": 6.5451,
      "step": 80000
    },
    {
      "epoch": 3.1513016245840673,
      "grad_norm": 7.125,
      "learning_rate": 0.0009369739675083186,
      "loss": 6.545,
      "step": 80500
    },
    {
      "epoch": 3.1708749266001175,
      "grad_norm": 2.671875,
      "learning_rate": 0.0009365825014679976,
      "loss": 6.5379,
      "step": 81000
    },
    {
      "epoch": 3.1904482286161677,
      "grad_norm": 2.125,
      "learning_rate": 0.0009361910354276767,
      "loss": 6.5455,
      "step": 81500
    },
    {
      "epoch": 3.2100215306322175,
      "grad_norm": 1.171875,
      "learning_rate": 0.0009357995693873557,
      "loss": 6.5449,
      "step": 82000
    },
    {
      "epoch": 3.2295948326482677,
      "grad_norm": 2.375,
      "learning_rate": 0.0009354081033470346,
      "loss": 6.5413,
      "step": 82500
    },
    {
      "epoch": 3.249168134664318,
      "grad_norm": 3.578125,
      "learning_rate": 0.0009350166373067137,
      "loss": 6.5442,
      "step": 83000
    },
    {
      "epoch": 3.2687414366803678,
      "grad_norm": 1.3125,
      "learning_rate": 0.0009346251712663927,
      "loss": 6.5454,
      "step": 83500
    },
    {
      "epoch": 3.288314738696418,
      "grad_norm": 1.265625,
      "learning_rate": 0.0009342337052260717,
      "loss": 6.5383,
      "step": 84000
    },
    {
      "epoch": 3.3078880407124682,
      "grad_norm": 2.28125,
      "learning_rate": 0.0009338422391857506,
      "loss": 6.5521,
      "step": 84500
    },
    {
      "epoch": 3.3274613427285185,
      "grad_norm": 3.40625,
      "learning_rate": 0.0009334507731454297,
      "loss": 6.5454,
      "step": 85000
    },
    {
      "epoch": 3.3470346447445682,
      "grad_norm": 1.546875,
      "learning_rate": 0.0009330593071051087,
      "loss": 6.5394,
      "step": 85500
    },
    {
      "epoch": 3.3666079467606185,
      "grad_norm": 1.453125,
      "learning_rate": 0.0009326678410647877,
      "loss": 6.5398,
      "step": 86000
    },
    {
      "epoch": 3.3861812487766687,
      "grad_norm": 1.703125,
      "learning_rate": 0.0009322763750244666,
      "loss": 6.5434,
      "step": 86500
    },
    {
      "epoch": 3.405754550792719,
      "grad_norm": 1.75,
      "learning_rate": 0.0009318849089841457,
      "loss": 6.5355,
      "step": 87000
    },
    {
      "epoch": 3.4253278528087687,
      "grad_norm": 1.0390625,
      "learning_rate": 0.0009314934429438247,
      "loss": 6.5417,
      "step": 87500
    },
    {
      "epoch": 3.444901154824819,
      "grad_norm": 1.234375,
      "learning_rate": 0.0009311019769035037,
      "loss": 6.5454,
      "step": 88000
    },
    {
      "epoch": 3.464474456840869,
      "grad_norm": 4.15625,
      "learning_rate": 0.0009307105108631826,
      "loss": 6.5316,
      "step": 88500
    },
    {
      "epoch": 3.484047758856919,
      "grad_norm": 3.296875,
      "learning_rate": 0.0009303190448228617,
      "loss": 6.5429,
      "step": 89000
    },
    {
      "epoch": 3.503621060872969,
      "grad_norm": 2.65625,
      "learning_rate": 0.0009299275787825407,
      "loss": 6.5424,
      "step": 89500
    },
    {
      "epoch": 3.5231943628890194,
      "grad_norm": 2.8125,
      "learning_rate": 0.0009295361127422197,
      "loss": 6.5447,
      "step": 90000
    },
    {
      "epoch": 3.5427676649050692,
      "grad_norm": 1.2421875,
      "learning_rate": 0.0009291446467018985,
      "loss": 6.5418,
      "step": 90500
    },
    {
      "epoch": 3.5623409669211195,
      "grad_norm": 1.1484375,
      "learning_rate": 0.0009287531806615776,
      "loss": 6.5405,
      "step": 91000
    },
    {
      "epoch": 3.5819142689371697,
      "grad_norm": 1.328125,
      "learning_rate": 0.0009283617146212566,
      "loss": 6.5403,
      "step": 91500
    },
    {
      "epoch": 3.60148757095322,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0009279702485809356,
      "loss": 6.5384,
      "step": 92000
    },
    {
      "epoch": 3.62106087296927,
      "grad_norm": 1.203125,
      "learning_rate": 0.0009275787825406146,
      "loss": 6.542,
      "step": 92500
    },
    {
      "epoch": 3.64063417498532,
      "grad_norm": 3.75,
      "learning_rate": 0.0009271873165002936,
      "loss": 6.538,
      "step": 93000
    },
    {
      "epoch": 3.66020747700137,
      "grad_norm": 2.09375,
      "learning_rate": 0.0009267958504599726,
      "loss": 6.5282,
      "step": 93500
    },
    {
      "epoch": 3.6797807790174204,
      "grad_norm": 1.4609375,
      "learning_rate": 0.0009264043844196516,
      "loss": 6.5442,
      "step": 94000
    },
    {
      "epoch": 3.69935408103347,
      "grad_norm": 3.015625,
      "learning_rate": 0.0009260129183793306,
      "loss": 6.5425,
      "step": 94500
    },
    {
      "epoch": 3.7189273830495204,
      "grad_norm": 0.953125,
      "learning_rate": 0.0009256214523390096,
      "loss": 6.5375,
      "step": 95000
    },
    {
      "epoch": 3.7385006850655707,
      "grad_norm": 1.734375,
      "learning_rate": 0.0009252299862986886,
      "loss": 6.5488,
      "step": 95500
    },
    {
      "epoch": 3.7580739870816204,
      "grad_norm": 1.4453125,
      "learning_rate": 0.0009248385202583675,
      "loss": 6.5497,
      "step": 96000
    },
    {
      "epoch": 3.7776472890976707,
      "grad_norm": 1.3359375,
      "learning_rate": 0.0009244470542180466,
      "loss": 6.5467,
      "step": 96500
    },
    {
      "epoch": 3.797220591113721,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0009240555881777256,
      "loss": 6.5442,
      "step": 97000
    },
    {
      "epoch": 3.816793893129771,
      "grad_norm": 1.40625,
      "learning_rate": 0.0009236641221374046,
      "loss": 6.5393,
      "step": 97500
    },
    {
      "epoch": 3.8363671951458214,
      "grad_norm": 1.34375,
      "learning_rate": 0.0009232726560970835,
      "loss": 6.5378,
      "step": 98000
    },
    {
      "epoch": 3.855940497161871,
      "grad_norm": 1.6484375,
      "learning_rate": 0.0009228811900567626,
      "loss": 6.5469,
      "step": 98500
    },
    {
      "epoch": 3.8755137991779214,
      "grad_norm": 2.203125,
      "learning_rate": 0.0009224897240164416,
      "loss": 6.5322,
      "step": 99000
    },
    {
      "epoch": 3.8950871011939716,
      "grad_norm": 62.75,
      "learning_rate": 0.0009220982579761206,
      "loss": 6.5472,
      "step": 99500
    },
    {
      "epoch": 3.9146604032100214,
      "grad_norm": 8.0,
      "learning_rate": 0.0009217067919357995,
      "loss": 6.5364,
      "step": 100000
    },
    {
      "epoch": 3.9342337052260716,
      "grad_norm": 2.796875,
      "learning_rate": 0.0009213153258954786,
      "loss": 6.5356,
      "step": 100500
    },
    {
      "epoch": 3.953807007242122,
      "grad_norm": 1.1484375,
      "learning_rate": 0.0009209238598551576,
      "loss": 6.5267,
      "step": 101000
    },
    {
      "epoch": 3.9733803092581716,
      "grad_norm": 4.53125,
      "learning_rate": 0.0009205323938148366,
      "loss": 6.5332,
      "step": 101500
    },
    {
      "epoch": 3.992953611274222,
      "grad_norm": 2.21875,
      "learning_rate": 0.0009201409277745156,
      "loss": 6.5363,
      "step": 102000
    },
    {
      "epoch": 4.0,
      "eval_loss": 6.529191493988037,
      "eval_runtime": 20.9814,
      "eval_samples_per_second": 95.322,
      "eval_steps_per_second": 5.958,
      "step": 102180
    },
    {
      "epoch": 4.012526913290272,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0009197494617341946,
      "loss": 6.541,
      "step": 102500
    },
    {
      "epoch": 4.032100215306322,
      "grad_norm": 1.453125,
      "learning_rate": 0.0009193579956938736,
      "loss": 6.5328,
      "step": 103000
    },
    {
      "epoch": 4.051673517322373,
      "grad_norm": 1.109375,
      "learning_rate": 0.0009189665296535526,
      "loss": 6.5279,
      "step": 103500
    },
    {
      "epoch": 4.071246819338422,
      "grad_norm": 1.7421875,
      "learning_rate": 0.0009185750636132316,
      "loss": 6.5302,
      "step": 104000
    },
    {
      "epoch": 4.090820121354472,
      "grad_norm": 1.03125,
      "learning_rate": 0.0009181835975729106,
      "loss": 6.5393,
      "step": 104500
    },
    {
      "epoch": 4.110393423370523,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0009177921315325896,
      "loss": 6.5323,
      "step": 105000
    },
    {
      "epoch": 4.129966725386573,
      "grad_norm": 0.91015625,
      "learning_rate": 0.0009174006654922686,
      "loss": 6.5308,
      "step": 105500
    },
    {
      "epoch": 4.149540027402622,
      "grad_norm": 8.6875,
      "learning_rate": 0.0009170091994519476,
      "loss": 6.5302,
      "step": 106000
    },
    {
      "epoch": 4.169113329418673,
      "grad_norm": 1.609375,
      "learning_rate": 0.0009166177334116266,
      "loss": 6.5328,
      "step": 106500
    },
    {
      "epoch": 4.188686631434723,
      "grad_norm": 5.90625,
      "learning_rate": 0.0009162262673713056,
      "loss": 6.5346,
      "step": 107000
    },
    {
      "epoch": 4.2082599334507735,
      "grad_norm": 5.1875,
      "learning_rate": 0.0009158348013309846,
      "loss": 6.5306,
      "step": 107500
    },
    {
      "epoch": 4.227833235466823,
      "grad_norm": 1.5703125,
      "learning_rate": 0.0009154433352906636,
      "loss": 6.5278,
      "step": 108000
    },
    {
      "epoch": 4.247406537482873,
      "grad_norm": 3.09375,
      "learning_rate": 0.0009150518692503426,
      "loss": 6.5298,
      "step": 108500
    },
    {
      "epoch": 4.266979839498924,
      "grad_norm": 7.59375,
      "learning_rate": 0.0009146604032100216,
      "loss": 6.5243,
      "step": 109000
    },
    {
      "epoch": 4.286553141514974,
      "grad_norm": 1.6484375,
      "learning_rate": 0.0009142689371697005,
      "loss": 6.5258,
      "step": 109500
    },
    {
      "epoch": 4.306126443531023,
      "grad_norm": 3.390625,
      "learning_rate": 0.0009138774711293795,
      "loss": 6.5237,
      "step": 110000
    },
    {
      "epoch": 4.325699745547074,
      "grad_norm": 3.171875,
      "learning_rate": 0.0009134860050890585,
      "loss": 6.5307,
      "step": 110500
    },
    {
      "epoch": 4.345273047563124,
      "grad_norm": 2.3125,
      "learning_rate": 0.0009130945390487375,
      "loss": 6.5236,
      "step": 111000
    },
    {
      "epoch": 4.364846349579174,
      "grad_norm": 5.625,
      "learning_rate": 0.0009127030730084165,
      "loss": 6.5264,
      "step": 111500
    },
    {
      "epoch": 4.384419651595224,
      "grad_norm": 1.9375,
      "learning_rate": 0.0009123116069680955,
      "loss": 6.5254,
      "step": 112000
    },
    {
      "epoch": 4.403992953611274,
      "grad_norm": 2.65625,
      "learning_rate": 0.0009119201409277745,
      "loss": 6.5323,
      "step": 112500
    },
    {
      "epoch": 4.423566255627325,
      "grad_norm": 1.703125,
      "learning_rate": 0.0009115286748874535,
      "loss": 6.5362,
      "step": 113000
    },
    {
      "epoch": 4.4431395576433745,
      "grad_norm": 3.21875,
      "learning_rate": 0.0009111372088471325,
      "loss": 6.5292,
      "step": 113500
    },
    {
      "epoch": 4.462712859659424,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0009107457428068115,
      "loss": 6.5248,
      "step": 114000
    },
    {
      "epoch": 4.482286161675475,
      "grad_norm": 1.4453125,
      "learning_rate": 0.0009103542767664905,
      "loss": 6.5216,
      "step": 114500
    },
    {
      "epoch": 4.501859463691525,
      "grad_norm": 1.21875,
      "learning_rate": 0.0009099628107261695,
      "loss": 6.5157,
      "step": 115000
    },
    {
      "epoch": 4.521432765707575,
      "grad_norm": 3.609375,
      "learning_rate": 0.0009095713446858485,
      "loss": 6.5172,
      "step": 115500
    },
    {
      "epoch": 4.541006067723625,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0009091798786455275,
      "loss": 6.524,
      "step": 116000
    },
    {
      "epoch": 4.560579369739675,
      "grad_norm": 1.421875,
      "learning_rate": 0.0009087884126052065,
      "loss": 6.5227,
      "step": 116500
    },
    {
      "epoch": 4.580152671755725,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0009083969465648855,
      "loss": 6.5148,
      "step": 117000
    },
    {
      "epoch": 4.5997259737717755,
      "grad_norm": 3.546875,
      "learning_rate": 0.0009080054805245645,
      "loss": 6.5208,
      "step": 117500
    },
    {
      "epoch": 4.619299275787825,
      "grad_norm": 5.90625,
      "learning_rate": 0.0009076140144842435,
      "loss": 6.5183,
      "step": 118000
    },
    {
      "epoch": 4.638872577803875,
      "grad_norm": 4.0625,
      "learning_rate": 0.0009072225484439225,
      "loss": 6.5221,
      "step": 118500
    },
    {
      "epoch": 4.658445879819926,
      "grad_norm": 3.5625,
      "learning_rate": 0.0009068310824036015,
      "loss": 6.5201,
      "step": 119000
    },
    {
      "epoch": 4.6780191818359755,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0009064396163632805,
      "loss": 6.5176,
      "step": 119500
    },
    {
      "epoch": 4.697592483852026,
      "grad_norm": 1.28125,
      "learning_rate": 0.0009060481503229595,
      "loss": 6.5137,
      "step": 120000
    },
    {
      "epoch": 4.717165785868076,
      "grad_norm": 1.8046875,
      "learning_rate": 0.0009056566842826385,
      "loss": 6.5178,
      "step": 120500
    },
    {
      "epoch": 4.736739087884126,
      "grad_norm": 1.171875,
      "learning_rate": 0.0009052652182423175,
      "loss": 6.5129,
      "step": 121000
    },
    {
      "epoch": 4.7563123899001765,
      "grad_norm": 11.375,
      "learning_rate": 0.0009048737522019965,
      "loss": 6.5149,
      "step": 121500
    },
    {
      "epoch": 4.775885691916226,
      "grad_norm": 2.375,
      "learning_rate": 0.0009044822861616755,
      "loss": 6.5124,
      "step": 122000
    },
    {
      "epoch": 4.795458993932276,
      "grad_norm": 6.375,
      "learning_rate": 0.0009040908201213545,
      "loss": 6.5112,
      "step": 122500
    },
    {
      "epoch": 4.815032295948327,
      "grad_norm": 0.87109375,
      "learning_rate": 0.0009036993540810336,
      "loss": 6.5083,
      "step": 123000
    },
    {
      "epoch": 4.8346055979643765,
      "grad_norm": 8.0,
      "learning_rate": 0.0009033078880407125,
      "loss": 6.5072,
      "step": 123500
    },
    {
      "epoch": 4.854178899980427,
      "grad_norm": 1.203125,
      "learning_rate": 0.0009029164220003915,
      "loss": 6.5118,
      "step": 124000
    },
    {
      "epoch": 4.873752201996477,
      "grad_norm": 1.15625,
      "learning_rate": 0.0009025249559600705,
      "loss": 6.5117,
      "step": 124500
    },
    {
      "epoch": 4.893325504012527,
      "grad_norm": 1.9296875,
      "learning_rate": 0.0009021334899197496,
      "loss": 6.5099,
      "step": 125000
    },
    {
      "epoch": 4.912898806028577,
      "grad_norm": 2.453125,
      "learning_rate": 0.0009017420238794285,
      "loss": 6.5095,
      "step": 125500
    },
    {
      "epoch": 4.932472108044627,
      "grad_norm": 1.3359375,
      "learning_rate": 0.0009013505578391075,
      "loss": 6.5088,
      "step": 126000
    },
    {
      "epoch": 4.952045410060677,
      "grad_norm": 1.28125,
      "learning_rate": 0.0009009590917987865,
      "loss": 6.5085,
      "step": 126500
    },
    {
      "epoch": 4.971618712076728,
      "grad_norm": 3.75,
      "learning_rate": 0.0009005676257584656,
      "loss": 6.5163,
      "step": 127000
    },
    {
      "epoch": 4.9911920140927775,
      "grad_norm": 3.015625,
      "learning_rate": 0.0009001761597181445,
      "loss": 6.513,
      "step": 127500
    },
    {
      "epoch": 5.0,
      "eval_loss": 6.507379055023193,
      "eval_runtime": 22.1406,
      "eval_samples_per_second": 90.332,
      "eval_steps_per_second": 5.646,
      "step": 127725
    },
    {
      "epoch": 5.010765316108827,
      "grad_norm": 1.6171875,
      "learning_rate": 0.0008997846936778235,
      "loss": 6.5096,
      "step": 128000
    },
    {
      "epoch": 5.030338618124878,
      "grad_norm": 2.203125,
      "learning_rate": 0.0008993932276375024,
      "loss": 6.5089,
      "step": 128500
    },
    {
      "epoch": 5.049911920140928,
      "grad_norm": 2.90625,
      "learning_rate": 0.0008990017615971816,
      "loss": 6.5087,
      "step": 129000
    },
    {
      "epoch": 5.0694852221569775,
      "grad_norm": 2.265625,
      "learning_rate": 0.0008986102955568604,
      "loss": 6.5135,
      "step": 129500
    },
    {
      "epoch": 5.089058524173028,
      "grad_norm": 1.8828125,
      "learning_rate": 0.0008982188295165394,
      "loss": 6.5038,
      "step": 130000
    },
    {
      "epoch": 5.108631826189078,
      "grad_norm": 2.359375,
      "learning_rate": 0.0008978273634762184,
      "loss": 6.5035,
      "step": 130500
    },
    {
      "epoch": 5.128205128205128,
      "grad_norm": 2.15625,
      "learning_rate": 0.0008974358974358974,
      "loss": 6.5167,
      "step": 131000
    },
    {
      "epoch": 5.147778430221178,
      "grad_norm": 2.140625,
      "learning_rate": 0.0008970444313955764,
      "loss": 6.5109,
      "step": 131500
    },
    {
      "epoch": 5.167351732237228,
      "grad_norm": 2.015625,
      "learning_rate": 0.0008966529653552554,
      "loss": 6.5009,
      "step": 132000
    },
    {
      "epoch": 5.186925034253279,
      "grad_norm": 0.75,
      "learning_rate": 0.0008962614993149345,
      "loss": 6.5078,
      "step": 132500
    },
    {
      "epoch": 5.206498336269329,
      "grad_norm": 5.78125,
      "learning_rate": 0.0008958700332746134,
      "loss": 6.5062,
      "step": 133000
    },
    {
      "epoch": 5.2260716382853785,
      "grad_norm": 3.390625,
      "learning_rate": 0.0008954785672342924,
      "loss": 6.5045,
      "step": 133500
    },
    {
      "epoch": 5.245644940301429,
      "grad_norm": 4.09375,
      "learning_rate": 0.0008950871011939714,
      "loss": 6.4964,
      "step": 134000
    },
    {
      "epoch": 5.265218242317479,
      "grad_norm": 1.84375,
      "learning_rate": 0.0008946956351536505,
      "loss": 6.5082,
      "step": 134500
    },
    {
      "epoch": 5.284791544333529,
      "grad_norm": 1.953125,
      "learning_rate": 0.0008943041691133294,
      "loss": 6.5047,
      "step": 135000
    },
    {
      "epoch": 5.304364846349579,
      "grad_norm": 1.09375,
      "learning_rate": 0.0008939127030730084,
      "loss": 6.5025,
      "step": 135500
    },
    {
      "epoch": 5.323938148365629,
      "grad_norm": 2.046875,
      "learning_rate": 0.0008935212370326874,
      "loss": 6.4966,
      "step": 136000
    },
    {
      "epoch": 5.34351145038168,
      "grad_norm": 1.078125,
      "learning_rate": 0.0008931297709923665,
      "loss": 6.503,
      "step": 136500
    },
    {
      "epoch": 5.36308475239773,
      "grad_norm": 5.0625,
      "learning_rate": 0.0008927383049520454,
      "loss": 6.5022,
      "step": 137000
    },
    {
      "epoch": 5.382658054413779,
      "grad_norm": 1.140625,
      "learning_rate": 0.0008923468389117244,
      "loss": 6.5025,
      "step": 137500
    },
    {
      "epoch": 5.40223135642983,
      "grad_norm": 2.4375,
      "learning_rate": 0.0008919553728714034,
      "loss": 6.5053,
      "step": 138000
    },
    {
      "epoch": 5.42180465844588,
      "grad_norm": 1.78125,
      "learning_rate": 0.0008915639068310825,
      "loss": 6.5127,
      "step": 138500
    },
    {
      "epoch": 5.44137796046193,
      "grad_norm": 14.375,
      "learning_rate": 0.0008911724407907614,
      "loss": 6.5019,
      "step": 139000
    },
    {
      "epoch": 5.46095126247798,
      "grad_norm": 4.59375,
      "learning_rate": 0.0008907809747504404,
      "loss": 6.5024,
      "step": 139500
    },
    {
      "epoch": 5.48052456449403,
      "grad_norm": 2.28125,
      "learning_rate": 0.0008903895087101194,
      "loss": 6.5059,
      "step": 140000
    },
    {
      "epoch": 5.50009786651008,
      "grad_norm": 3.84375,
      "learning_rate": 0.0008899980426697985,
      "loss": 6.5027,
      "step": 140500
    },
    {
      "epoch": 5.519671168526131,
      "grad_norm": 6.1875,
      "learning_rate": 0.0008896065766294774,
      "loss": 6.4974,
      "step": 141000
    },
    {
      "epoch": 5.53924447054218,
      "grad_norm": 1.921875,
      "learning_rate": 0.0008892151105891564,
      "loss": 6.4957,
      "step": 141500
    },
    {
      "epoch": 5.55881777255823,
      "grad_norm": 1.78125,
      "learning_rate": 0.0008888236445488354,
      "loss": 6.5043,
      "step": 142000
    },
    {
      "epoch": 5.578391074574281,
      "grad_norm": 1.796875,
      "learning_rate": 0.0008884321785085145,
      "loss": 6.4968,
      "step": 142500
    },
    {
      "epoch": 5.597964376590331,
      "grad_norm": 2.4375,
      "learning_rate": 0.0008880407124681934,
      "loss": 6.5016,
      "step": 143000
    },
    {
      "epoch": 5.61753767860638,
      "grad_norm": 1.078125,
      "learning_rate": 0.0008876492464278724,
      "loss": 6.5012,
      "step": 143500
    },
    {
      "epoch": 5.637110980622431,
      "grad_norm": 3.921875,
      "learning_rate": 0.0008872577803875515,
      "loss": 6.5061,
      "step": 144000
    },
    {
      "epoch": 5.656684282638481,
      "grad_norm": 1.015625,
      "learning_rate": 0.0008868663143472305,
      "loss": 6.5026,
      "step": 144500
    },
    {
      "epoch": 5.676257584654532,
      "grad_norm": 1.484375,
      "learning_rate": 0.0008864748483069094,
      "loss": 6.4981,
      "step": 145000
    },
    {
      "epoch": 5.695830886670581,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0008860833822665884,
      "loss": 6.5063,
      "step": 145500
    },
    {
      "epoch": 5.715404188686631,
      "grad_norm": 1.59375,
      "learning_rate": 0.0008856919162262675,
      "loss": 6.5071,
      "step": 146000
    },
    {
      "epoch": 5.734977490702682,
      "grad_norm": 3.703125,
      "learning_rate": 0.0008853004501859464,
      "loss": 6.5037,
      "step": 146500
    },
    {
      "epoch": 5.754550792718732,
      "grad_norm": 4.1875,
      "learning_rate": 0.0008849089841456254,
      "loss": 6.504,
      "step": 147000
    },
    {
      "epoch": 5.774124094734781,
      "grad_norm": 0.98828125,
      "learning_rate": 0.0008845175181053043,
      "loss": 6.4992,
      "step": 147500
    },
    {
      "epoch": 5.793697396750832,
      "grad_norm": 25.25,
      "learning_rate": 0.0008841260520649835,
      "loss": 6.5052,
      "step": 148000
    },
    {
      "epoch": 5.813270698766882,
      "grad_norm": 1.9765625,
      "learning_rate": 0.0008837345860246623,
      "loss": 6.4979,
      "step": 148500
    },
    {
      "epoch": 5.8328440007829325,
      "grad_norm": 2.03125,
      "learning_rate": 0.0008833431199843413,
      "loss": 6.5023,
      "step": 149000
    },
    {
      "epoch": 5.852417302798982,
      "grad_norm": 1.2890625,
      "learning_rate": 0.0008829516539440203,
      "loss": 6.4971,
      "step": 149500
    },
    {
      "epoch": 5.871990604815032,
      "grad_norm": 1.3125,
      "learning_rate": 0.0008825601879036994,
      "loss": 6.4968,
      "step": 150000
    },
    {
      "epoch": 5.891563906831083,
      "grad_norm": 1.7734375,
      "learning_rate": 0.0008821687218633783,
      "loss": 6.5053,
      "step": 150500
    },
    {
      "epoch": 5.911137208847133,
      "grad_norm": 2.40625,
      "learning_rate": 0.0008817772558230573,
      "loss": 6.4988,
      "step": 151000
    },
    {
      "epoch": 5.930710510863182,
      "grad_norm": 4.25,
      "learning_rate": 0.0008813857897827363,
      "loss": 6.5071,
      "step": 151500
    },
    {
      "epoch": 5.950283812879233,
      "grad_norm": 3.125,
      "learning_rate": 0.0008809943237424154,
      "loss": 6.5014,
      "step": 152000
    },
    {
      "epoch": 5.969857114895283,
      "grad_norm": 1.515625,
      "learning_rate": 0.0008806028577020943,
      "loss": 6.5002,
      "step": 152500
    },
    {
      "epoch": 5.989430416911333,
      "grad_norm": 2.671875,
      "learning_rate": 0.0008802113916617733,
      "loss": 6.4993,
      "step": 153000
    },
    {
      "epoch": 6.0,
      "eval_loss": 6.495845794677734,
      "eval_runtime": 21.9172,
      "eval_samples_per_second": 91.253,
      "eval_steps_per_second": 5.703,
      "step": 153270
    },
    {
      "epoch": 6.009003718927383,
      "grad_norm": 8.25,
      "learning_rate": 0.0008798199256214524,
      "loss": 6.4952,
      "step": 153500
    },
    {
      "epoch": 6.028577020943433,
      "grad_norm": 2.40625,
      "learning_rate": 0.0008794284595811314,
      "loss": 6.4967,
      "step": 154000
    },
    {
      "epoch": 6.048150322959483,
      "grad_norm": 2.25,
      "learning_rate": 0.0008790369935408103,
      "loss": 6.5073,
      "step": 154500
    },
    {
      "epoch": 6.0677236249755335,
      "grad_norm": 1.3515625,
      "learning_rate": 0.0008786455275004893,
      "loss": 6.4969,
      "step": 155000
    },
    {
      "epoch": 6.087296926991583,
      "grad_norm": 0.9921875,
      "learning_rate": 0.0008782540614601684,
      "loss": 6.487,
      "step": 155500
    },
    {
      "epoch": 6.106870229007634,
      "grad_norm": 1.875,
      "learning_rate": 0.0008778625954198474,
      "loss": 6.49,
      "step": 156000
    },
    {
      "epoch": 6.126443531023684,
      "grad_norm": 5.25,
      "learning_rate": 0.0008774711293795263,
      "loss": 6.4948,
      "step": 156500
    },
    {
      "epoch": 6.146016833039734,
      "grad_norm": 14.25,
      "learning_rate": 0.0008770796633392053,
      "loss": 6.4921,
      "step": 157000
    },
    {
      "epoch": 6.165590135055784,
      "grad_norm": 1.1171875,
      "learning_rate": 0.0008766881972988844,
      "loss": 6.4909,
      "step": 157500
    },
    {
      "epoch": 6.185163437071834,
      "grad_norm": 3.375,
      "learning_rate": 0.0008762967312585634,
      "loss": 6.4917,
      "step": 158000
    },
    {
      "epoch": 6.204736739087884,
      "grad_norm": 10.6875,
      "learning_rate": 0.0008759052652182423,
      "loss": 6.494,
      "step": 158500
    },
    {
      "epoch": 6.2243100411039345,
      "grad_norm": 5.0,
      "learning_rate": 0.0008755137991779213,
      "loss": 6.4909,
      "step": 159000
    },
    {
      "epoch": 6.243883343119984,
      "grad_norm": 1.7421875,
      "learning_rate": 0.0008751223331376004,
      "loss": 6.498,
      "step": 159500
    },
    {
      "epoch": 6.263456645136034,
      "grad_norm": 1.7734375,
      "learning_rate": 0.0008747308670972794,
      "loss": 6.4899,
      "step": 160000
    },
    {
      "epoch": 6.283029947152085,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0008743394010569583,
      "loss": 6.4991,
      "step": 160500
    },
    {
      "epoch": 6.3026032491681345,
      "grad_norm": 5.71875,
      "learning_rate": 0.0008739479350166373,
      "loss": 6.4939,
      "step": 161000
    },
    {
      "epoch": 6.322176551184185,
      "grad_norm": 3.6875,
      "learning_rate": 0.0008735564689763164,
      "loss": 6.4888,
      "step": 161500
    },
    {
      "epoch": 6.341749853200235,
      "grad_norm": 3.359375,
      "learning_rate": 0.0008731650029359953,
      "loss": 6.4918,
      "step": 162000
    },
    {
      "epoch": 6.361323155216285,
      "grad_norm": 1.125,
      "learning_rate": 0.0008727735368956743,
      "loss": 6.4992,
      "step": 162500
    },
    {
      "epoch": 6.3808964572323355,
      "grad_norm": 2.625,
      "learning_rate": 0.0008723820708553534,
      "loss": 6.4958,
      "step": 163000
    },
    {
      "epoch": 6.400469759248385,
      "grad_norm": 1.2421875,
      "learning_rate": 0.0008719906048150324,
      "loss": 6.4894,
      "step": 163500
    },
    {
      "epoch": 6.420043061264435,
      "grad_norm": 2.03125,
      "learning_rate": 0.0008715991387747113,
      "loss": 6.4953,
      "step": 164000
    },
    {
      "epoch": 6.439616363280486,
      "grad_norm": 2.03125,
      "learning_rate": 0.0008712076727343903,
      "loss": 6.4929,
      "step": 164500
    },
    {
      "epoch": 6.4591896652965355,
      "grad_norm": 1.1875,
      "learning_rate": 0.0008708162066940694,
      "loss": 6.4848,
      "step": 165000
    },
    {
      "epoch": 6.478762967312585,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0008704247406537484,
      "loss": 6.4871,
      "step": 165500
    },
    {
      "epoch": 6.498336269328636,
      "grad_norm": 3.75,
      "learning_rate": 0.0008700332746134272,
      "loss": 6.4914,
      "step": 166000
    },
    {
      "epoch": 6.517909571344686,
      "grad_norm": 2.1875,
      "learning_rate": 0.0008696418085731062,
      "loss": 6.4888,
      "step": 166500
    },
    {
      "epoch": 6.5374828733607355,
      "grad_norm": 2.84375,
      "learning_rate": 0.0008692503425327854,
      "loss": 6.4932,
      "step": 167000
    },
    {
      "epoch": 6.557056175376786,
      "grad_norm": 2.828125,
      "learning_rate": 0.0008688588764924644,
      "loss": 6.4992,
      "step": 167500
    },
    {
      "epoch": 6.576629477392836,
      "grad_norm": 4.9375,
      "learning_rate": 0.0008684674104521432,
      "loss": 6.4881,
      "step": 168000
    },
    {
      "epoch": 6.596202779408887,
      "grad_norm": 2.796875,
      "learning_rate": 0.0008680759444118222,
      "loss": 6.4944,
      "step": 168500
    },
    {
      "epoch": 6.6157760814249365,
      "grad_norm": 19.0,
      "learning_rate": 0.0008676844783715013,
      "loss": 6.4874,
      "step": 169000
    },
    {
      "epoch": 6.635349383440986,
      "grad_norm": 8.25,
      "learning_rate": 0.0008672930123311803,
      "loss": 6.4871,
      "step": 169500
    },
    {
      "epoch": 6.654922685457037,
      "grad_norm": 1.484375,
      "learning_rate": 0.0008669015462908592,
      "loss": 6.4978,
      "step": 170000
    },
    {
      "epoch": 6.674495987473087,
      "grad_norm": 3.140625,
      "learning_rate": 0.0008665100802505382,
      "loss": 6.4946,
      "step": 170500
    },
    {
      "epoch": 6.6940692894891365,
      "grad_norm": 8.0625,
      "learning_rate": 0.0008661186142102173,
      "loss": 6.4868,
      "step": 171000
    },
    {
      "epoch": 6.713642591505187,
      "grad_norm": 3.265625,
      "learning_rate": 0.0008657271481698963,
      "loss": 6.4876,
      "step": 171500
    },
    {
      "epoch": 6.733215893521237,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0008653356821295752,
      "loss": 6.4887,
      "step": 172000
    },
    {
      "epoch": 6.752789195537288,
      "grad_norm": 3.828125,
      "learning_rate": 0.0008649442160892542,
      "loss": 6.4897,
      "step": 172500
    },
    {
      "epoch": 6.772362497553337,
      "grad_norm": 0.8515625,
      "learning_rate": 0.0008645527500489333,
      "loss": 6.4857,
      "step": 173000
    },
    {
      "epoch": 6.791935799569387,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0008641612840086123,
      "loss": 6.4867,
      "step": 173500
    },
    {
      "epoch": 6.811509101585438,
      "grad_norm": 1.578125,
      "learning_rate": 0.0008637698179682912,
      "loss": 6.4883,
      "step": 174000
    },
    {
      "epoch": 6.831082403601488,
      "grad_norm": 1.875,
      "learning_rate": 0.0008633783519279703,
      "loss": 6.4783,
      "step": 174500
    },
    {
      "epoch": 6.8506557056175374,
      "grad_norm": 0.84765625,
      "learning_rate": 0.0008629868858876493,
      "loss": 6.4862,
      "step": 175000
    },
    {
      "epoch": 6.870229007633588,
      "grad_norm": 20.875,
      "learning_rate": 0.0008625954198473283,
      "loss": 6.486,
      "step": 175500
    },
    {
      "epoch": 6.889802309649638,
      "grad_norm": 1.75,
      "learning_rate": 0.0008622039538070072,
      "loss": 6.4941,
      "step": 176000
    },
    {
      "epoch": 6.909375611665688,
      "grad_norm": 1.46875,
      "learning_rate": 0.0008618124877666863,
      "loss": 6.4904,
      "step": 176500
    },
    {
      "epoch": 6.928948913681738,
      "grad_norm": 2.171875,
      "learning_rate": 0.0008614210217263653,
      "loss": 6.4864,
      "step": 177000
    },
    {
      "epoch": 6.948522215697788,
      "grad_norm": 1.6875,
      "learning_rate": 0.0008610295556860442,
      "loss": 6.4876,
      "step": 177500
    },
    {
      "epoch": 6.968095517713838,
      "grad_norm": 0.94921875,
      "learning_rate": 0.0008606380896457232,
      "loss": 6.4811,
      "step": 178000
    },
    {
      "epoch": 6.987668819729889,
      "grad_norm": 1.46875,
      "learning_rate": 0.0008602466236054023,
      "loss": 6.4881,
      "step": 178500
    },
    {
      "epoch": 7.0,
      "eval_loss": 6.482935905456543,
      "eval_runtime": 22.9737,
      "eval_samples_per_second": 87.056,
      "eval_steps_per_second": 5.441,
      "step": 178815
    },
    {
      "epoch": 7.007242121745938,
      "grad_norm": 1.3515625,
      "learning_rate": 0.0008598551575650813,
      "loss": 6.4776,
      "step": 179000
    },
    {
      "epoch": 7.026815423761989,
      "grad_norm": 1.5078125,
      "learning_rate": 0.0008594636915247602,
      "loss": 6.4777,
      "step": 179500
    },
    {
      "epoch": 7.046388725778039,
      "grad_norm": 2.78125,
      "learning_rate": 0.0008590722254844392,
      "loss": 6.4789,
      "step": 180000
    },
    {
      "epoch": 7.065962027794089,
      "grad_norm": 1.5625,
      "learning_rate": 0.0008586807594441183,
      "loss": 6.4884,
      "step": 180500
    },
    {
      "epoch": 7.085535329810139,
      "grad_norm": 1.671875,
      "learning_rate": 0.0008582892934037973,
      "loss": 6.4847,
      "step": 181000
    },
    {
      "epoch": 7.105108631826189,
      "grad_norm": 1.9140625,
      "learning_rate": 0.0008578978273634762,
      "loss": 6.4811,
      "step": 181500
    },
    {
      "epoch": 7.124681933842239,
      "grad_norm": 1.375,
      "learning_rate": 0.0008575063613231552,
      "loss": 6.4872,
      "step": 182000
    },
    {
      "epoch": 7.14425523585829,
      "grad_norm": 1.375,
      "learning_rate": 0.0008571148952828343,
      "loss": 6.486,
      "step": 182500
    },
    {
      "epoch": 7.163828537874339,
      "grad_norm": 1.921875,
      "learning_rate": 0.0008567234292425133,
      "loss": 6.4848,
      "step": 183000
    },
    {
      "epoch": 7.183401839890389,
      "grad_norm": 2.125,
      "learning_rate": 0.0008563319632021922,
      "loss": 6.4808,
      "step": 183500
    },
    {
      "epoch": 7.20297514190644,
      "grad_norm": 1.0859375,
      "learning_rate": 0.0008559404971618713,
      "loss": 6.4858,
      "step": 184000
    },
    {
      "epoch": 7.22254844392249,
      "grad_norm": 1.7578125,
      "learning_rate": 0.0008555490311215503,
      "loss": 6.4875,
      "step": 184500
    },
    {
      "epoch": 7.242121745938539,
      "grad_norm": 10.375,
      "learning_rate": 0.0008551575650812293,
      "loss": 6.483,
      "step": 185000
    },
    {
      "epoch": 7.26169504795459,
      "grad_norm": 2.0,
      "learning_rate": 0.0008547660990409081,
      "loss": 6.4803,
      "step": 185500
    },
    {
      "epoch": 7.28126834997064,
      "grad_norm": 2.734375,
      "learning_rate": 0.0008543746330005873,
      "loss": 6.4823,
      "step": 186000
    },
    {
      "epoch": 7.3008416519866906,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0008539831669602662,
      "loss": 6.4725,
      "step": 186500
    },
    {
      "epoch": 7.32041495400274,
      "grad_norm": 1.53125,
      "learning_rate": 0.0008535917009199452,
      "loss": 6.4783,
      "step": 187000
    },
    {
      "epoch": 7.33998825601879,
      "grad_norm": 2.25,
      "learning_rate": 0.0008532002348796241,
      "loss": 6.4714,
      "step": 187500
    },
    {
      "epoch": 7.359561558034841,
      "grad_norm": 1.3671875,
      "learning_rate": 0.0008528087688393032,
      "loss": 6.4807,
      "step": 188000
    },
    {
      "epoch": 7.379134860050891,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0008524173027989822,
      "loss": 6.471,
      "step": 188500
    },
    {
      "epoch": 7.39870816206694,
      "grad_norm": 1.234375,
      "learning_rate": 0.0008520258367586612,
      "loss": 6.4799,
      "step": 189000
    },
    {
      "epoch": 7.418281464082991,
      "grad_norm": 1.53125,
      "learning_rate": 0.0008516343707183401,
      "loss": 6.48,
      "step": 189500
    },
    {
      "epoch": 7.437854766099041,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0008512429046780192,
      "loss": 6.4835,
      "step": 190000
    },
    {
      "epoch": 7.457428068115091,
      "grad_norm": 3.90625,
      "learning_rate": 0.0008508514386376982,
      "loss": 6.4747,
      "step": 190500
    },
    {
      "epoch": 7.477001370131141,
      "grad_norm": 1.609375,
      "learning_rate": 0.0008504599725973772,
      "loss": 6.4759,
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 7.496574672147191, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.0008500685065570561, |
|
"loss": 6.4765, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 7.516147974163241, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0008496770405167352, |
|
"loss": 6.4753, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 7.5357212761792916, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0008492855744764142, |
|
"loss": 6.4782, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 7.555294578195341, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 0.0008488941084360931, |
|
"loss": 6.4725, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 7.574867880211392, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0008485026423957722, |
|
"loss": 6.4825, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 7.594441182227442, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.0008481111763554512, |
|
"loss": 6.4758, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 7.614014484243492, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 0.0008477197103151302, |
|
"loss": 6.4732, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 7.633587786259542, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.0008473282442748091, |
|
"loss": 6.4812, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 7.653161088275592, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.0008469367782344882, |
|
"loss": 6.4711, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 7.672734390291642, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 0.0008465453121941672, |
|
"loss": 6.4718, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 9.625, |
|
"learning_rate": 0.0008461538461538462, |
|
"loss": 6.4793, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 7.711880994323742, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 0.0008457623801135251, |
|
"loss": 6.4758, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 7.731454296339793, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.0008453709140732042, |
|
"loss": 6.4706, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 7.751027598355843, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0008449794480328832, |
|
"loss": 6.4838, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 7.7706009003718925, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 0.0008445879819925622, |
|
"loss": 6.475, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 7.790174202387943, |
|
"grad_norm": 2.75, |
|
"learning_rate": 0.0008441965159522411, |
|
"loss": 6.4766, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 7.809747504403993, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.0008438050499119202, |
|
"loss": 6.4833, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 7.829320806420043, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0008434135838715992, |
|
"loss": 6.4786, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 7.8488941084360935, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.0008430221178312782, |
|
"loss": 6.4747, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 7.868467410452143, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 0.0008426306517909571, |
|
"loss": 6.4839, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 7.888040712468193, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0008422391857506362, |
|
"loss": 6.477, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 7.907614014484244, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.0008418477197103152, |
|
"loss": 6.4823, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 7.9271873165002935, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 0.0008414562536699942, |
|
"loss": 6.4666, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 7.946760618516343, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.000841064787629673, |
|
"loss": 6.4716, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 7.966333920532394, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.0008406733215893522, |
|
"loss": 6.4678, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 7.985907222548444, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.0008402818555490312, |
|
"loss": 6.471, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 6.472127914428711, |
|
"eval_runtime": 24.0419, |
|
"eval_samples_per_second": 83.188, |
|
"eval_steps_per_second": 5.199, |
|
"step": 204360 |
|
}, |
|
{ |
|
"epoch": 8.005480524564494, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 0.0008398903895087102, |
|
"loss": 6.4769, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 8.025053826580544, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.0008394989234683892, |
|
"loss": 6.4651, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 8.044627128596595, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 0.0008391074574280681, |
|
"loss": 6.4746, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 8.064200430612644, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0008387159913877471, |
|
"loss": 6.4704, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 8.083773732628694, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.0008383245253474261, |
|
"loss": 6.4648, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 8.103347034644745, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.0008379330593071051, |
|
"loss": 6.4735, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 8.122920336660794, |
|
"grad_norm": 17.0, |
|
"learning_rate": 0.0008375415932667841, |
|
"loss": 6.474, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 8.142493638676845, |
|
"grad_norm": 6.5, |
|
"learning_rate": 0.0008371501272264631, |
|
"loss": 6.4643, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 8.162066940692895, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.000836758661186142, |
|
"loss": 6.4722, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 8.181640242708944, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0008363671951458211, |
|
"loss": 6.4671, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 8.201213544724995, |
|
"grad_norm": 5.0, |
|
"learning_rate": 0.0008359757291055001, |
|
"loss": 6.4788, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 8.220786846741046, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 0.0008355842630651791, |
|
"loss": 6.4751, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 8.240360148757095, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.000835192797024858, |
|
"loss": 6.4759, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 8.259933450773145, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 0.0008348013309845371, |
|
"loss": 6.4707, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 8.279506752789196, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.0008344098649442161, |
|
"loss": 6.4734, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 8.299080054805245, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.0008340183989038951, |
|
"loss": 6.4632, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 8.318653356821295, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.000833626932863574, |
|
"loss": 6.4708, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 8.338226658837346, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.0008332354668232531, |
|
"loss": 6.4614, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 8.357799960853397, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0008328440007829321, |
|
"loss": 6.4625, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 8.377373262869446, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.0008324525347426111, |
|
"loss": 6.4605, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 8.396946564885496, |
|
"grad_norm": 8.0, |
|
"learning_rate": 0.0008320610687022901, |
|
"loss": 6.4681, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 8.416519866901547, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.0008316696026619691, |
|
"loss": 6.46, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 8.436093168917596, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.0008312781366216481, |
|
"loss": 6.4718, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 8.455666470933647, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.0008308866705813271, |
|
"loss": 6.4647, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 8.475239772949697, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0008304952045410061, |
|
"loss": 6.4723, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 8.494813074965746, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0008301037385006851, |
|
"loss": 6.4798, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 8.514386376981797, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.0008297122724603641, |
|
"loss": 6.468, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 8.533959678997848, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0008293208064200431, |
|
"loss": 6.4728, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 8.553532981013896, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.0008289293403797221, |
|
"loss": 6.4703, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 8.573106283029947, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.0008285378743394011, |
|
"loss": 6.4645, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 8.592679585045998, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.0008281464082990801, |
|
"loss": 6.4694, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 8.612252887062047, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0008277549422587591, |
|
"loss": 6.4632, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 8.631826189078097, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 0.0008273634762184381, |
|
"loss": 6.4733, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 8.651399491094148, |
|
"grad_norm": 3.25, |
|
"learning_rate": 0.0008269720101781171, |
|
"loss": 6.4682, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 8.670972793110197, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0008265805441377961, |
|
"loss": 6.4654, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 8.690546095126248, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 0.0008261890780974751, |
|
"loss": 6.4569, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 8.710119397142298, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.0008257976120571541, |
|
"loss": 6.4643, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 8.729692699158347, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0008254061460168331, |
|
"loss": 6.4544, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 8.749266001174398, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.000825014679976512, |
|
"loss": 6.4667, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 8.768839303190449, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0008246232139361912, |
|
"loss": 6.458, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 8.7884126052065, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.00082423174789587, |
|
"loss": 6.4643, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 8.807985907222548, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.000823840281855549, |
|
"loss": 6.4635, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 8.827559209238599, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.000823448815815228, |
|
"loss": 6.458, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 8.84713251125465, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.000823057349774907, |
|
"loss": 6.4666, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 8.866705813270698, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.000822665883734586, |
|
"loss": 6.4582, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 8.886279115286749, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.000822274417694265, |
|
"loss": 6.4728, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 8.9058524173028, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.000821882951653944, |
|
"loss": 6.4735, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 8.925425719318849, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.000821491485613623, |
|
"loss": 6.4586, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 8.9449990213349, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.000821100019573302, |
|
"loss": 6.4698, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 8.96457232335095, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.000820708553532981, |
|
"loss": 6.469, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 8.984145625366999, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.00082031708749266, |
|
"loss": 6.4725, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 6.4647536277771, |
|
"eval_runtime": 21.362, |
|
"eval_samples_per_second": 93.624, |
|
"eval_steps_per_second": 5.852, |
|
"step": 229905 |
|
}, |
|
{ |
|
"epoch": 9.00371892738305, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.000819925621452339, |
|
"loss": 6.4643, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 9.0232922293991, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 0.000819534155412018, |
|
"loss": 6.4632, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 9.04286553141515, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.000819142689371697, |
|
"loss": 6.4633, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 9.0624388334312, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.000818751223331376, |
|
"loss": 6.4673, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 9.08201213544725, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 0.000818359757291055, |
|
"loss": 6.469, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 9.1015854374633, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.000817968291250734, |
|
"loss": 6.4532, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 9.12115873947935, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.000817576825210413, |
|
"loss": 6.4608, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 9.1407320414954, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.000817185359170092, |
|
"loss": 6.4537, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 9.16030534351145, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.000816793893129771, |
|
"loss": 6.444, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 9.1798786455275, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.00081640242708945, |
|
"loss": 6.4606, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 9.199451947543551, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.000816010961049129, |
|
"loss": 6.4578, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 9.2190252495596, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 0.0008156194950088081, |
|
"loss": 6.4648, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 9.23859855157565, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.000815228028968487, |
|
"loss": 6.4649, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 9.258171853591701, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.000814836562928166, |
|
"loss": 6.4617, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 9.27774515560775, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.000814445096887845, |
|
"loss": 6.4598, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 9.2973184576238, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 0.0008140536308475241, |
|
"loss": 6.4565, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 9.316891759639852, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.000813662164807203, |
|
"loss": 6.4612, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 9.336465061655902, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.000813270698766882, |
|
"loss": 6.4583, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 9.356038363671951, |
|
"grad_norm": 18.5, |
|
"learning_rate": 0.000812879232726561, |
|
"loss": 6.4549, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 9.375611665688002, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.0008124877666862401, |
|
"loss": 6.4607, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 9.395184967704052, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.000812096300645919, |
|
"loss": 6.4573, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 9.414758269720101, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.000811704834605598, |
|
"loss": 6.4706, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 9.434331571736152, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 0.000811313368565277, |
|
"loss": 6.4632, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 9.453904873752203, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 0.000810921902524956, |
|
"loss": 6.4549, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 9.473478175768252, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 0.000810530436484635, |
|
"loss": 6.4705, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 9.493051477784302, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.000810138970444314, |
|
"loss": 6.4479, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 9.512624779800353, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.000809747504403993, |
|
"loss": 6.4602, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 9.532198081816402, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.000809356038363672, |
|
"loss": 6.469, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 9.551771383832452, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.0008089645723233509, |
|
"loss": 6.464, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 9.571344685848503, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 0.0008085731062830299, |
|
"loss": 6.4604, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 9.590917987864552, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.000808181640242709, |
|
"loss": 6.4589, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 9.610491289880603, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0008077901742023879, |
|
"loss": 6.4551, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 9.630064591896653, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.0008073987081620669, |
|
"loss": 6.4583, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 9.649637893912702, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.0008070072421217459, |
|
"loss": 6.4529, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 9.669211195928753, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.000806615776081425, |
|
"loss": 6.457, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 9.688784497944804, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.0008062243100411039, |
|
"loss": 6.4537, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 9.708357799960853, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.0008058328440007829, |
|
"loss": 6.4652, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 9.727931101976903, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.0008054413779604619, |
|
"loss": 6.4547, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 9.747504403992954, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.000805049911920141, |
|
"loss": 6.467, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 9.767077706009005, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0008046584458798199, |
|
"loss": 6.4568, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 9.786651008025053, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.0008042669798394989, |
|
"loss": 6.4616, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 9.806224310041104, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0008038755137991779, |
|
"loss": 6.455, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 9.825797612057155, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.000803484047758857, |
|
"loss": 6.4547, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 9.845370914073204, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0008030925817185359, |
|
"loss": 6.4668, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 9.864944216089254, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.0008027011156782149, |
|
"loss": 6.4458, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 9.884517518105305, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.0008023096496378939, |
|
"loss": 6.4636, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 9.904090820121354, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 0.000801918183597573, |
|
"loss": 6.4622, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 9.923664122137405, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.0008015267175572519, |
|
"loss": 6.4557, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 9.943237424153455, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.0008011352515169309, |
|
"loss": 6.4513, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 9.962810726169504, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00080074378547661, |
|
"loss": 6.4529, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 9.982384028185555, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.000800352319436289, |
|
"loss": 6.4559, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 6.455629825592041, |
|
"eval_runtime": 21.377, |
|
"eval_samples_per_second": 93.559, |
|
"eval_steps_per_second": 5.847, |
|
"step": 255450 |
|
}, |
|
{ |
|
"epoch": 10.001957330201606, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0007999608533959679, |
|
"loss": 6.454, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 10.021530632217654, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.0007995693873556469, |
|
"loss": 6.4444, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 10.041103934233705, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 0.000799177921315326, |
|
"loss": 6.4606, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 10.060677236249756, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.000798786455275005, |
|
"loss": 6.4541, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 10.080250538265805, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 0.0007983949892346839, |
|
"loss": 6.4583, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 10.099823840281855, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 0.0007980035231943629, |
|
"loss": 6.4542, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 10.119397142297906, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.000797612057154042, |
|
"loss": 6.4503, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 10.138970444313955, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.0007972205911137209, |
|
"loss": 6.4531, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 10.158543746330006, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.0007968291250733999, |
|
"loss": 6.4552, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 10.178117048346056, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 0.0007964376590330789, |
|
"loss": 6.4557, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 10.197690350362105, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 0.000796046192992758, |
|
"loss": 6.4519, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 10.217263652378156, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0007956547269524369, |
|
"loss": 6.4584, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 10.236836954394207, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0007952632609121159, |
|
"loss": 6.4519, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 0.0007948717948717948, |
|
"loss": 6.4576, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 10.275983558426306, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.000794480328831474, |
|
"loss": 6.4455, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 10.295556860442357, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 0.0007940888627911528, |
|
"loss": 6.4499, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 10.315130162458408, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0007936973967508318, |
|
"loss": 6.4572, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 10.334703464474456, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.0007933059307105108, |
|
"loss": 6.4513, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 10.354276766490507, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.0007929144646701899, |
|
"loss": 6.4472, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 10.373850068506558, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.0007925229986298688, |
|
"loss": 6.4385, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 10.393423370522607, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.0007921315325895478, |
|
"loss": 6.4558, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 10.412996672538657, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 0.0007917400665492269, |
|
"loss": 6.4503, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 10.432569974554708, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.0007913486005089059, |
|
"loss": 6.4402, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 10.452143276570757, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0007909571344685848, |
|
"loss": 6.4552, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 10.471716578586808, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0007905656684282638, |
|
"loss": 6.4488, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 10.491289880602858, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0007901742023879429, |
|
"loss": 6.4577, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 10.510863182618907, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.0007897827363476219, |
|
"loss": 6.4526, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 10.530436484634958, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 0.0007893912703073008, |
|
"loss": 6.4473, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 10.550009786651009, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0007889998042669798, |
|
"loss": 6.4555, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 10.569583088667057, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.0007886083382266589, |
|
"loss": 6.4522, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 10.589156390683108, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.0007882168721863379, |
|
"loss": 6.4481, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 10.608729692699159, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0007878254061460168, |
|
"loss": 6.4498, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 10.628302994715208, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.0007874339401056958, |
|
"loss": 6.4581, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 10.647876296731258, |
|
"grad_norm": 4.0, |
|
"learning_rate": 0.0007870424740653749, |
|
"loss": 6.4554, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 10.667449598747309, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.0007866510080250539, |
|
"loss": 6.4506, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 10.68702290076336, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.0007862595419847328, |
|
"loss": 6.4586, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 10.706596202779409, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.0007858680759444118, |
|
"loss": 6.4467, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 10.72616950479546, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.0007854766099040909, |
|
"loss": 6.4449, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 10.74574280681151, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 0.0007850851438637698, |
|
"loss": 6.4576, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 10.765316108827559, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.0007846936778234488, |
|
"loss": 6.4427, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 10.78488941084361, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0007843022117831279, |
|
"loss": 6.4571, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 10.80446271285966, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.0007839107457428069, |
|
"loss": 6.4507, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 10.824036014875709, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.0007835192797024858, |
|
"loss": 6.4542, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 10.84360931689176, |
|
"grad_norm": 18.125, |
|
"learning_rate": 0.0007831278136621648, |
|
"loss": 6.4592, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 10.86318261890781, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0007827363476218439, |
|
"loss": 6.4503, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 10.88275592092386, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.0007823448815815229, |
|
"loss": 6.461, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 10.90232922293991, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.0007819534155412018, |
|
"loss": 6.4549, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 10.92190252495596, |
|
"grad_norm": 26.375, |
|
"learning_rate": 0.0007815619495008808, |
|
"loss": 6.4469, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 10.94147582697201, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.0007811704834605599, |
|
"loss": 6.453, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 10.96104912898806, |
|
"grad_norm": 25.5, |
|
"learning_rate": 0.0007807790174202389, |
|
"loss": 6.4485, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 10.980622431004111, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.0007803875513799178, |
|
"loss": 6.4544, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 6.449069499969482, |
|
"eval_runtime": 22.9095, |
|
"eval_samples_per_second": 87.3, |
|
"eval_steps_per_second": 5.456, |
|
"step": 280995 |
|
}, |
|
{ |
|
"epoch": 11.00019573302016, |
|
"grad_norm": 10.125, |
|
"learning_rate": 0.0007799960853395967, |
|
"loss": 6.4525, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 11.01976903503621, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.0007796046192992759, |
|
"loss": 6.4456, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 11.039342337052261, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 0.0007792131532589549, |
|
"loss": 6.4479, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 11.05891563906831, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0007788216872186337, |
|
"loss": 6.4415, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 11.07848894108436, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0007784302211783127, |
|
"loss": 6.4438, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 11.098062243100411, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.0007780387551379918, |
|
"loss": 6.4485, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 11.11763554511646, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.0007776472890976708, |
|
"loss": 6.4491, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 11.137208847132511, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.0007772558230573497, |
|
"loss": 6.449, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 11.156782149148562, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 0.0007768643570170288, |
|
"loss": 6.4526, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 11.17635545116461, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 0.0007764728909767078, |
|
"loss": 6.4408, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 11.195928753180661, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.0007760814249363868, |
|
"loss": 6.4384, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 11.215502055196712, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0007756899588960657, |
|
"loss": 6.4557, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 11.235075357212763, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0007752984928557448, |
|
"loss": 6.4432, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 11.254648659228812, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.0007749070268154238, |
|
"loss": 6.4451, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 11.274221961244862, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 0.0007745155607751028, |
|
"loss": 6.4532, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 11.293795263260913, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.0007741240947347817, |
|
"loss": 6.4489, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 11.313368565276962, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.0007737326286944608, |
|
"loss": 6.4448, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 11.332941867293012, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.0007733411626541398, |
|
"loss": 6.4421, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 11.352515169309063, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.0007729496966138187, |
|
"loss": 6.4408, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 11.372088471325112, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.0007725582305734977, |
|
"loss": 6.4487, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 11.391661773341163, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.0007721667645331768, |
|
"loss": 6.4488, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 11.411235075357213, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.0007717752984928558, |
|
"loss": 6.4394, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 11.430808377373262, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0007713838324525347, |
|
"loss": 6.4548, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 11.450381679389313, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0007709923664122137, |
|
"loss": 6.454, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 11.469954981405364, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.0007706009003718928, |
|
"loss": 6.4516, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 11.489528283421413, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.0007702094343315718, |
|
"loss": 6.4482, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 11.509101585437463, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.0007698179682912507, |
|
"loss": 6.4602, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 11.528674887453514, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.0007694265022509297, |
|
"loss": 6.4459, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 11.548248189469563, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0007690350362106088, |
|
"loss": 6.4508, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 11.567821491485613, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.0007686435701702878, |
|
"loss": 6.4459, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 11.587394793501664, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.0007682521041299667, |
|
"loss": 6.4517, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 11.606968095517713, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.0007678606380896458, |
|
"loss": 6.4519, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 11.626541397533764, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0007674691720493248, |
|
"loss": 6.4474, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 11.646114699549814, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.0007670777060090038, |
|
"loss": 6.4506, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 11.665688001565865, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0007666862399686827, |
|
"loss": 6.4396, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 11.685261303581914, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0007662947739283618, |
|
"loss": 6.4453, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 11.704834605597965, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.0007659033078880408, |
|
"loss": 6.4414, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 11.724407907614015, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.0007655118418477198, |
|
"loss": 6.4513, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 11.743981209630064, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 0.0007651203758073986, |
|
"loss": 6.4501, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 11.763554511646115, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.0007647289097670778, |
|
"loss": 6.4471, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 11.783127813662166, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.0007643374437267568, |
|
"loss": 6.4567, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 11.802701115678214, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.0007639459776864357, |
|
"loss": 6.4509, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 11.822274417694265, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.0007635545116461146, |
|
"loss": 6.4451, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 11.841847719710316, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0007631630456057937, |
|
"loss": 6.4524, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 11.861421021726365, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 0.0007627715795654727, |
|
"loss": 6.4502, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 11.880994323742415, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.0007623801135251517, |
|
"loss": 6.4476, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 11.900567625758466, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0007619886474848306, |
|
"loss": 6.4338, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 11.920140927774515, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.0007615971814445097, |
|
"loss": 6.4472, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 11.939714229790566, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0007612057154041887, |
|
"loss": 6.4465, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 11.959287531806616, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.0007608142493638676, |
|
"loss": 6.4529, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 11.978860833822665, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.0007604227833235467, |
|
"loss": 6.4456, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 11.998434135838716, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.0007600313172832257, |
|
"loss": 6.4525, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 6.446938514709473, |
|
"eval_runtime": 23.9647, |
|
"eval_samples_per_second": 83.456, |
|
"eval_steps_per_second": 5.216, |
|
"step": 306540 |
|
}, |
|
{ |
|
"epoch": 12.018007437854767, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0007596398512429047, |
|
"loss": 6.4458, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 12.037580739870815, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.0007592483852025836, |
|
"loss": 6.4423, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 12.057154041886866, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 0.0007588569191622627, |
|
"loss": 6.4446, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 12.076727343902917, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.0007584654531219417, |
|
"loss": 6.4461, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 12.096300645918966, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.0007580739870816207, |
|
"loss": 6.4431, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 12.115873947935016, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.0007576825210412996, |
|
"loss": 6.4439, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 12.135447249951067, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 0.0007572910550009787, |
|
"loss": 6.4428, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 12.155020551967118, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0007568995889606577, |
|
"loss": 6.4456, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 12.174593853983167, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.0007565081229203367, |
|
"loss": 6.4479, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 12.194167155999217, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 0.0007561166568800156, |
|
"loss": 6.4457, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 12.213740458015268, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.0007557251908396947, |
|
"loss": 6.4405, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 12.233313760031317, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 0.0007553337247993737, |
|
"loss": 6.4557, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 12.252887062047368, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.0007549422587590527, |
|
"loss": 6.4407, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 12.272460364063418, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0007545507927187316, |
|
"loss": 6.4457, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 12.292033666079467, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.0007541593266784107, |
|
"loss": 6.4471, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 12.311606968095518, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.0007537678606380897, |
|
"loss": 6.4473, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 12.331180270111568, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 0.0007533763945977687, |
|
"loss": 6.4451, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 12.350753572127617, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.0007529849285574477, |
|
"loss": 6.4336, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 12.370326874143668, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.0007525934625171267, |
|
"loss": 6.4454, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 12.389900176159719, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 0.0007522019964768057, |
|
"loss": 6.4481, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 12.409473478175768, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 0.0007518105304364847, |
|
"loss": 6.4485, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 12.429046780191818, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.0007514190643961637, |
|
"loss": 6.4553, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 12.448620082207869, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0007510275983558427, |
|
"loss": 6.4435, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 12.468193384223918, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 0.0007506361323155217, |
|
"loss": 6.4435, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 12.487766686239969, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.0007502446662752007, |
|
"loss": 6.4451, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 12.50733998825602, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.0007498532002348797, |
|
"loss": 6.4511, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 12.526913290272068, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.0007494617341945587, |
|
"loss": 6.4384, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 12.546486592288119, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.0007490702681542376, |
|
"loss": 6.4448, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 12.56605989430417, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 0.0007486788021139165, |
|
"loss": 6.4514, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 12.58563319632022, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.0007482873360735956, |
|
"loss": 6.4437, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 12.605206498336269, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.0007478958700332746, |
|
"loss": 6.4439, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 12.62477980035232, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0007475044039929536, |
|
"loss": 6.4455, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 12.64435310236837, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0007471129379526325, |
|
"loss": 6.4413, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 12.66392640438442, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.0007467214719123116, |
|
"loss": 6.4366, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 12.68349970640047, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0007463300058719906, |
|
"loss": 6.4511, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 12.70307300841652, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.0007459385398316696, |
|
"loss": 6.4384, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 12.72264631043257, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0007455470737913485, |
|
"loss": 6.4402, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 12.74221961244862, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 0.0007451556077510276, |
|
"loss": 6.4542, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 12.761792914464671, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 0.0007447641417107066, |
|
"loss": 6.4417, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 12.78136621648072, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.0007443726756703856, |
|
"loss": 6.4429, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 12.80093951849677, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.0007439812096300646, |
|
"loss": 6.45, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0007435897435897436, |
|
"loss": 6.4472, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 12.84008612252887, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 0.0007431982775494226, |
|
"loss": 6.438, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 12.85965942454492, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 0.0007428068115091016, |
|
"loss": 6.451, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 12.879232726560971, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.0007424153454687806, |
|
"loss": 6.4495, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 12.89880602857702, |
|
"grad_norm": 8.375, |
|
"learning_rate": 0.0007420238794284596, |
|
"loss": 6.4412, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 12.918379330593071, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.0007416324133881386, |
|
"loss": 6.4584, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 12.937952632609122, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.0007412409473478176, |
|
"loss": 6.4446, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 12.95752593462517, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 0.0007408494813074966, |
|
"loss": 6.4465, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 12.977099236641221, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.0007404580152671756, |
|
"loss": 6.4441, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 12.996672538657272, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.0007400665492268546, |
|
"loss": 6.4464, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 6.443148612976074, |
|
"eval_runtime": 21.9848, |
|
"eval_samples_per_second": 90.972, |
|
"eval_steps_per_second": 5.686, |
|
"step": 332085 |
|
}, |
|
{ |
|
"epoch": 13.01624584067332, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 0.0007396750831865336, |
|
"loss": 6.4378, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 13.035819142689371, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.0007392836171462126, |
|
"loss": 6.4393, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 13.055392444705422, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0007388921511058916, |
|
"loss": 6.441, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 13.074965746721471, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.0007385006850655706, |
|
"loss": 6.4441, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 13.094539048737522, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0007381092190252496, |
|
"loss": 6.4412, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 13.114112350753572, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.0007377177529849286, |
|
"loss": 6.4431, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 13.133685652769623, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 0.0007373262869446076, |
|
"loss": 6.4336, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 13.153258954785672, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.0007369348209042866, |
|
"loss": 6.4395, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 13.172832256801723, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.0007365433548639657, |
|
"loss": 6.4369, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 13.192405558817773, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.0007361518888236446, |
|
"loss": 6.4482, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 13.211978860833822, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.0007357604227833236, |
|
"loss": 6.45, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 13.231552162849873, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0007353689567430026, |
|
"loss": 6.4423, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 13.251125464865924, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.0007349774907026816, |
|
"loss": 6.4455, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 13.270698766881972, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.0007345860246623605, |
|
"loss": 6.4393, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 13.290272068898023, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0007341945586220395, |
|
"loss": 6.4441, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 13.309845370914074, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0007338030925817185, |
|
"loss": 6.4395, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 13.329418672930123, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.0007334116265413975, |
|
"loss": 6.4375, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 13.348991974946173, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.0007330201605010765, |
|
"loss": 6.438, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 13.368565276962224, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.0007326286944607555, |
|
"loss": 6.4392, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 13.388138578978273, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0007322372284204345, |
|
"loss": 6.4433, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 13.407711880994324, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.0007318457623801135, |
|
"loss": 6.4467, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 13.427285183010374, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0007314542963397925, |
|
"loss": 6.4401, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 13.446858485026423, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0007310628302994715, |
|
"loss": 6.4414, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 13.466431787042474, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0007306713642591505, |
|
"loss": 6.4451, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 13.486005089058525, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 0.0007302798982188295, |
|
"loss": 6.4419, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 13.505578391074573, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.0007298884321785085, |
|
"loss": 6.439, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 13.525151693090624, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 0.0007294969661381875, |
|
"loss": 6.4437, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 13.544724995106675, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.0007291055000978665, |
|
"loss": 6.4399, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 13.564298297122726, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 0.0007287140340575455, |
|
"loss": 6.4447, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 13.583871599138774, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0007283225680172245, |
|
"loss": 6.4373, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 13.603444901154825, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.0007279311019769035, |
|
"loss": 6.4484, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 13.623018203170876, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.0007275396359365826, |
|
"loss": 6.4486, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 13.642591505186925, |
      "grad_norm": 1.4765625,
      "learning_rate": 0.0007271481698962615,
      "loss": 6.4417,
      "step": 348500
    },
    {
      "epoch": 13.662164807202975,
      "grad_norm": 0.88671875,
      "learning_rate": 0.0007267567038559405,
      "loss": 6.4397,
      "step": 349000
    },
    {
      "epoch": 13.681738109219026,
      "grad_norm": 1.84375,
      "learning_rate": 0.0007263652378156195,
      "loss": 6.4328,
      "step": 349500
    },
    {
      "epoch": 13.701311411235075,
      "grad_norm": 1.1171875,
      "learning_rate": 0.0007259737717752986,
      "loss": 6.4462,
      "step": 350000
    },
    {
      "epoch": 13.720884713251126,
      "grad_norm": 4.15625,
      "learning_rate": 0.0007255823057349775,
      "loss": 6.4394,
      "step": 350500
    },
    {
      "epoch": 13.740458015267176,
      "grad_norm": 1.40625,
      "learning_rate": 0.0007251908396946565,
      "loss": 6.4434,
      "step": 351000
    },
    {
      "epoch": 13.760031317283225,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0007247993736543355,
      "loss": 6.4437,
      "step": 351500
    },
    {
      "epoch": 13.779604619299276,
      "grad_norm": 1.5859375,
      "learning_rate": 0.0007244079076140146,
      "loss": 6.4386,
      "step": 352000
    },
    {
      "epoch": 13.799177921315327,
      "grad_norm": 1.4609375,
      "learning_rate": 0.0007240164415736935,
      "loss": 6.4476,
      "step": 352500
    },
    {
      "epoch": 13.818751223331375,
      "grad_norm": 2.515625,
      "learning_rate": 0.0007236249755333725,
      "loss": 6.4442,
      "step": 353000
    },
    {
      "epoch": 13.838324525347426,
      "grad_norm": 2.203125,
      "learning_rate": 0.0007232335094930515,
      "loss": 6.44,
      "step": 353500
    },
    {
      "epoch": 13.857897827363477,
      "grad_norm": 0.99609375,
      "learning_rate": 0.0007228420434527305,
      "loss": 6.4395,
      "step": 354000
    },
    {
      "epoch": 13.877471129379526,
      "grad_norm": 2.75,
      "learning_rate": 0.0007224505774124095,
      "loss": 6.4392,
      "step": 354500
    },
    {
      "epoch": 13.897044431395576,
      "grad_norm": 0.9140625,
      "learning_rate": 0.0007220591113720885,
      "loss": 6.4484,
      "step": 355000
    },
    {
      "epoch": 13.916617733411627,
      "grad_norm": 1.3203125,
      "learning_rate": 0.0007216676453317675,
      "loss": 6.4449,
      "step": 355500
    },
    {
      "epoch": 13.936191035427676,
      "grad_norm": 1.390625,
      "learning_rate": 0.0007212761792914465,
      "loss": 6.4444,
      "step": 356000
    },
    {
      "epoch": 13.955764337443727,
      "grad_norm": 3.921875,
      "learning_rate": 0.0007208847132511255,
      "loss": 6.4438,
      "step": 356500
    },
    {
      "epoch": 13.975337639459777,
      "grad_norm": 4.71875,
      "learning_rate": 0.0007204932472108045,
      "loss": 6.4417,
      "step": 357000
    },
    {
      "epoch": 13.994910941475826,
      "grad_norm": 3.65625,
      "learning_rate": 0.0007201017811704836,
      "loss": 6.4479,
      "step": 357500
    },
    {
      "epoch": 14.0,
      "eval_loss": 6.441241264343262,
      "eval_runtime": 23.2496,
      "eval_samples_per_second": 86.023,
      "eval_steps_per_second": 5.376,
      "step": 357630
    },
    {
      "epoch": 14.014484243491877,
      "grad_norm": 1.4296875,
      "learning_rate": 0.0007197103151301624,
      "loss": 6.4372,
      "step": 358000
    },
    {
      "epoch": 14.034057545507928,
      "grad_norm": 0.9375,
      "learning_rate": 0.0007193188490898414,
      "loss": 6.4488,
      "step": 358500
    },
    {
      "epoch": 14.053630847523978,
      "grad_norm": 1.234375,
      "learning_rate": 0.0007189273830495204,
      "loss": 6.441,
      "step": 359000
    },
    {
      "epoch": 14.073204149540027,
      "grad_norm": 0.890625,
      "learning_rate": 0.0007185359170091995,
      "loss": 6.4405,
      "step": 359500
    },
    {
      "epoch": 14.092777451556078,
      "grad_norm": 1.46875,
      "learning_rate": 0.0007181444509688784,
      "loss": 6.4363,
      "step": 360000
    },
    {
      "epoch": 14.112350753572128,
      "grad_norm": 1.421875,
      "learning_rate": 0.0007177529849285574,
      "loss": 6.4307,
      "step": 360500
    },
    {
      "epoch": 14.131924055588177,
      "grad_norm": 1.0078125,
      "learning_rate": 0.0007173615188882364,
      "loss": 6.4421,
      "step": 361000
    },
    {
      "epoch": 14.151497357604228,
      "grad_norm": 2.703125,
      "learning_rate": 0.0007169700528479155,
      "loss": 6.4308,
      "step": 361500
    },
    {
      "epoch": 14.171070659620279,
      "grad_norm": 1.59375,
      "learning_rate": 0.0007165785868075944,
      "loss": 6.445,
      "step": 362000
    },
    {
      "epoch": 14.190643961636328,
      "grad_norm": 1.015625,
      "learning_rate": 0.0007161871207672734,
      "loss": 6.4366,
      "step": 362500
    },
    {
      "epoch": 14.210217263652378,
      "grad_norm": 8.3125,
      "learning_rate": 0.0007157956547269524,
      "loss": 6.4377,
      "step": 363000
    },
    {
      "epoch": 14.229790565668429,
      "grad_norm": 1.578125,
      "learning_rate": 0.0007154041886866315,
      "loss": 6.4332,
      "step": 363500
    },
    {
      "epoch": 14.249363867684478,
      "grad_norm": 20.75,
      "learning_rate": 0.0007150127226463104,
      "loss": 6.4339,
      "step": 364000
    },
    {
      "epoch": 14.268937169700529,
      "grad_norm": 1.5703125,
      "learning_rate": 0.0007146212566059894,
      "loss": 6.4443,
      "step": 364500
    },
    {
      "epoch": 14.28851047171658,
      "grad_norm": 1.265625,
      "learning_rate": 0.0007142297905656684,
      "loss": 6.4381,
      "step": 365000
    },
    {
      "epoch": 14.308083773732628,
      "grad_norm": 0.9765625,
      "learning_rate": 0.0007138383245253475,
      "loss": 6.4398,
      "step": 365500
    },
    {
      "epoch": 14.327657075748679,
      "grad_norm": 1.3125,
      "learning_rate": 0.0007134468584850264,
      "loss": 6.4347,
      "step": 366000
    },
    {
      "epoch": 14.34723037776473,
      "grad_norm": 1.2578125,
      "learning_rate": 0.0007130553924447054,
      "loss": 6.4376,
      "step": 366500
    },
    {
      "epoch": 14.366803679780778,
      "grad_norm": 2.3125,
      "learning_rate": 0.0007126639264043845,
      "loss": 6.4392,
      "step": 367000
    },
    {
      "epoch": 14.386376981796829,
      "grad_norm": 2.5,
      "learning_rate": 0.0007122724603640635,
      "loss": 6.4388,
      "step": 367500
    },
    {
      "epoch": 14.40595028381288,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0007118809943237424,
      "loss": 6.4346,
      "step": 368000
    },
    {
      "epoch": 14.425523585828929,
      "grad_norm": 1.34375,
      "learning_rate": 0.0007114895282834214,
      "loss": 6.4427,
      "step": 368500
    },
    {
      "epoch": 14.44509688784498,
      "grad_norm": 1.015625,
      "learning_rate": 0.0007110980622431005,
      "loss": 6.4425,
      "step": 369000
    },
    {
      "epoch": 14.46467018986103,
      "grad_norm": 1.9921875,
      "learning_rate": 0.0007107065962027795,
      "loss": 6.4497,
      "step": 369500
    },
    {
      "epoch": 14.484243491877079,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0007103151301624584,
      "loss": 6.4378,
      "step": 370000
    },
    {
      "epoch": 14.50381679389313,
      "grad_norm": 1.109375,
      "learning_rate": 0.0007099236641221374,
      "loss": 6.4414,
      "step": 370500
    },
    {
      "epoch": 14.52339009590918,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0007095321980818165,
      "loss": 6.4258,
      "step": 371000
    },
    {
      "epoch": 14.54296339792523,
      "grad_norm": 4.8125,
      "learning_rate": 0.0007091407320414954,
      "loss": 6.4274,
      "step": 371500
    },
    {
      "epoch": 14.56253669994128,
      "grad_norm": 4.5625,
      "learning_rate": 0.0007087492660011744,
      "loss": 6.4458,
      "step": 372000
    },
    {
      "epoch": 14.58211000195733,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0007083577999608534,
      "loss": 6.4381,
      "step": 372500
    },
    {
      "epoch": 14.601683303973381,
      "grad_norm": 4.125,
      "learning_rate": 0.0007079663339205325,
      "loss": 6.441,
      "step": 373000
    },
    {
      "epoch": 14.62125660598943,
      "grad_norm": 1.09375,
      "learning_rate": 0.0007075748678802114,
      "loss": 6.4338,
      "step": 373500
    },
    {
      "epoch": 14.64082990800548,
      "grad_norm": 1.953125,
      "learning_rate": 0.0007071834018398904,
      "loss": 6.4328,
      "step": 374000
    },
    {
      "epoch": 14.660403210021531,
      "grad_norm": 3.375,
      "learning_rate": 0.0007067919357995694,
      "loss": 6.4386,
      "step": 374500
    },
    {
      "epoch": 14.67997651203758,
      "grad_norm": 4.6875,
      "learning_rate": 0.0007064004697592485,
      "loss": 6.4317,
      "step": 375000
    },
    {
      "epoch": 14.699549814053631,
      "grad_norm": 1.0234375,
      "learning_rate": 0.0007060090037189274,
      "loss": 6.4334,
      "step": 375500
    },
    {
      "epoch": 14.719123116069682,
      "grad_norm": 2.296875,
      "learning_rate": 0.0007056175376786064,
      "loss": 6.4464,
      "step": 376000
    },
    {
      "epoch": 14.73869641808573,
      "grad_norm": 0.90625,
      "learning_rate": 0.0007052260716382854,
      "loss": 6.4376,
      "step": 376500
    },
    {
      "epoch": 14.758269720101781,
      "grad_norm": 3.546875,
      "learning_rate": 0.0007048346055979645,
      "loss": 6.4439,
      "step": 377000
    },
    {
      "epoch": 14.777843022117832,
      "grad_norm": 1.6171875,
      "learning_rate": 0.0007044431395576433,
      "loss": 6.4408,
      "step": 377500
    },
    {
      "epoch": 14.79741632413388,
      "grad_norm": 2.015625,
      "learning_rate": 0.0007040516735173223,
      "loss": 6.434,
      "step": 378000
    },
    {
      "epoch": 14.816989626149931,
      "grad_norm": 1.8203125,
      "learning_rate": 0.0007036602074770014,
      "loss": 6.4404,
      "step": 378500
    },
    {
      "epoch": 14.836562928165982,
      "grad_norm": 3.484375,
      "learning_rate": 0.0007032687414366804,
      "loss": 6.4377,
      "step": 379000
    },
    {
      "epoch": 14.856136230182031,
      "grad_norm": 1.890625,
      "learning_rate": 0.0007028772753963593,
      "loss": 6.4332,
      "step": 379500
    },
    {
      "epoch": 14.875709532198082,
      "grad_norm": 3.625,
      "learning_rate": 0.0007024858093560383,
      "loss": 6.4384,
      "step": 380000
    },
    {
      "epoch": 14.895282834214132,
      "grad_norm": 1.328125,
      "learning_rate": 0.0007020943433157174,
      "loss": 6.4439,
      "step": 380500
    },
    {
      "epoch": 14.914856136230181,
      "grad_norm": 6.625,
      "learning_rate": 0.0007017028772753964,
      "loss": 6.439,
      "step": 381000
    },
    {
      "epoch": 14.934429438246232,
      "grad_norm": 3.28125,
      "learning_rate": 0.0007013114112350753,
      "loss": 6.4396,
      "step": 381500
    },
    {
      "epoch": 14.954002740262283,
      "grad_norm": 0.828125,
      "learning_rate": 0.0007009199451947543,
      "loss": 6.4498,
      "step": 382000
    },
    {
      "epoch": 14.973576042278331,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0007005284791544334,
      "loss": 6.4371,
      "step": 382500
    },
    {
      "epoch": 14.993149344294382,
      "grad_norm": 24.375,
      "learning_rate": 0.0007001370131141124,
      "loss": 6.4382,
      "step": 383000
    },
    {
      "epoch": 15.0,
      "eval_loss": 6.4391045570373535,
      "eval_runtime": 20.2509,
      "eval_samples_per_second": 98.761,
      "eval_steps_per_second": 6.173,
      "step": 383175
    },
    {
      "epoch": 15.012722646310433,
      "grad_norm": 4.21875,
      "learning_rate": 0.0006997455470737913,
      "loss": 6.4283,
      "step": 383500
    },
    {
      "epoch": 15.032295948326484,
      "grad_norm": 1.484375,
      "learning_rate": 0.0006993540810334703,
      "loss": 6.4405,
      "step": 384000
    },
    {
      "epoch": 15.051869250342532,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0006989626149931494,
      "loss": 6.4427,
      "step": 384500
    },
    {
      "epoch": 15.071442552358583,
      "grad_norm": 4.625,
      "learning_rate": 0.0006985711489528284,
      "loss": 6.4322,
      "step": 385000
    },
    {
      "epoch": 15.091015854374634,
      "grad_norm": 1.125,
      "learning_rate": 0.0006981796829125073,
      "loss": 6.4296,
      "step": 385500
    },
    {
      "epoch": 15.110589156390683,
      "grad_norm": 2.296875,
      "learning_rate": 0.0006977882168721863,
      "loss": 6.437,
      "step": 386000
    },
    {
      "epoch": 15.130162458406733,
      "grad_norm": 2.078125,
      "learning_rate": 0.0006973967508318654,
      "loss": 6.4392,
      "step": 386500
    },
    {
      "epoch": 15.149735760422784,
      "grad_norm": 4.5625,
      "learning_rate": 0.0006970052847915443,
      "loss": 6.4383,
      "step": 387000
    },
    {
      "epoch": 15.169309062438833,
      "grad_norm": 1.453125,
      "learning_rate": 0.0006966138187512233,
      "loss": 6.4385,
      "step": 387500
    },
    {
      "epoch": 15.188882364454884,
      "grad_norm": 6.0,
      "learning_rate": 0.0006962223527109024,
      "loss": 6.4358,
      "step": 388000
    },
    {
      "epoch": 15.208455666470934,
      "grad_norm": 1.5234375,
      "learning_rate": 0.0006958308866705814,
      "loss": 6.4298,
      "step": 388500
    },
    {
      "epoch": 15.228028968486983,
      "grad_norm": 2.796875,
      "learning_rate": 0.0006954394206302603,
      "loss": 6.4386,
      "step": 389000
    },
    {
      "epoch": 15.247602270503034,
      "grad_norm": 1.546875,
      "learning_rate": 0.0006950479545899393,
      "loss": 6.4419,
      "step": 389500
    },
    {
      "epoch": 15.267175572519085,
      "grad_norm": 0.80859375,
      "learning_rate": 0.0006946564885496184,
      "loss": 6.4326,
      "step": 390000
    },
    {
      "epoch": 15.286748874535133,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0006942650225092974,
      "loss": 6.4367,
      "step": 390500
    },
    {
      "epoch": 15.306322176551184,
      "grad_norm": 1.40625,
      "learning_rate": 0.0006938735564689763,
      "loss": 6.4375,
      "step": 391000
    },
    {
      "epoch": 15.325895478567235,
      "grad_norm": 1.4921875,
      "learning_rate": 0.0006934820904286553,
      "loss": 6.4316,
      "step": 391500
    },
    {
      "epoch": 15.345468780583284,
      "grad_norm": 9.875,
      "learning_rate": 0.0006930906243883344,
      "loss": 6.4386,
      "step": 392000
    },
    {
      "epoch": 15.365042082599334,
      "grad_norm": 1.421875,
      "learning_rate": 0.0006926991583480134,
      "loss": 6.4268,
      "step": 392500
    },
    {
      "epoch": 15.384615384615385,
      "grad_norm": 0.92578125,
      "learning_rate": 0.0006923076923076923,
      "loss": 6.4301,
      "step": 393000
    },
    {
      "epoch": 15.404188686631434,
      "grad_norm": 0.92578125,
      "learning_rate": 0.0006919162262673713,
      "loss": 6.4374,
      "step": 393500
    },
    {
      "epoch": 15.423761988647485,
      "grad_norm": 2.0625,
      "learning_rate": 0.0006915247602270504,
      "loss": 6.4423,
      "step": 394000
    },
    {
      "epoch": 15.443335290663535,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0006911332941867294,
      "loss": 6.4376,
      "step": 394500
    },
    {
      "epoch": 15.462908592679586,
      "grad_norm": 4.34375,
      "learning_rate": 0.0006907418281464083,
      "loss": 6.4384,
      "step": 395000
    },
    {
      "epoch": 15.482481894695635,
      "grad_norm": 5.375,
      "learning_rate": 0.0006903503621060873,
      "loss": 6.4293,
      "step": 395500
    },
    {
      "epoch": 15.502055196711686,
      "grad_norm": 2.203125,
      "learning_rate": 0.0006899588960657664,
      "loss": 6.4331,
      "step": 396000
    },
    {
      "epoch": 15.521628498727736,
      "grad_norm": 1.7734375,
      "learning_rate": 0.0006895674300254454,
      "loss": 6.4326,
      "step": 396500
    },
    {
      "epoch": 15.541201800743785,
      "grad_norm": 1.890625,
      "learning_rate": 0.0006891759639851242,
      "loss": 6.4296,
      "step": 397000
    },
    {
      "epoch": 15.560775102759836,
      "grad_norm": 1.015625,
      "learning_rate": 0.0006887844979448033,
      "loss": 6.4363,
      "step": 397500
    },
    {
      "epoch": 15.580348404775886,
      "grad_norm": 1.4453125,
      "learning_rate": 0.0006883930319044823,
      "loss": 6.4393,
      "step": 398000
    },
    {
      "epoch": 15.599921706791935,
      "grad_norm": 9.6875,
      "learning_rate": 0.0006880015658641613,
      "loss": 6.4373,
      "step": 398500
    },
    {
      "epoch": 15.619495008807986,
      "grad_norm": 2.953125,
      "learning_rate": 0.0006876100998238402,
      "loss": 6.4404,
      "step": 399000
    },
    {
      "epoch": 15.639068310824037,
      "grad_norm": 1.421875,
      "learning_rate": 0.0006872186337835193,
      "loss": 6.4434,
      "step": 399500
    },
    {
      "epoch": 15.658641612840086,
      "grad_norm": 1.40625,
      "learning_rate": 0.0006868271677431983,
      "loss": 6.4329,
      "step": 400000
    },
    {
      "epoch": 15.678214914856136,
      "grad_norm": 1.046875,
      "learning_rate": 0.0006864357017028773,
      "loss": 6.4316,
      "step": 400500
    },
    {
      "epoch": 15.697788216872187,
      "grad_norm": 3.109375,
      "learning_rate": 0.0006860442356625562,
      "loss": 6.4376,
      "step": 401000
    },
    {
      "epoch": 15.717361518888236,
      "grad_norm": 0.875,
      "learning_rate": 0.0006856527696222353,
      "loss": 6.4376,
      "step": 401500
    },
    {
      "epoch": 15.736934820904287,
      "grad_norm": 1.421875,
      "learning_rate": 0.0006852613035819143,
      "loss": 6.434,
      "step": 402000
    },
    {
      "epoch": 15.756508122920337,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0006848698375415932,
      "loss": 6.441,
      "step": 402500
    },
    {
      "epoch": 15.776081424936386,
      "grad_norm": 1.5546875,
      "learning_rate": 0.0006844783715012722,
      "loss": 6.4375,
      "step": 403000
    },
    {
      "epoch": 15.795654726952437,
      "grad_norm": 15.9375,
      "learning_rate": 0.0006840869054609513,
      "loss": 6.431,
      "step": 403500
    },
    {
      "epoch": 15.815228028968487,
      "grad_norm": 1.453125,
      "learning_rate": 0.0006836954394206303,
      "loss": 6.4348,
      "step": 404000
    },
    {
      "epoch": 15.834801330984536,
      "grad_norm": 1.9140625,
      "learning_rate": 0.0006833039733803092,
      "loss": 6.4251,
      "step": 404500
    },
    {
      "epoch": 15.854374633000587,
      "grad_norm": 1.453125,
      "learning_rate": 0.0006829125073399882,
      "loss": 6.4409,
      "step": 405000
    },
    {
      "epoch": 15.873947935016638,
      "grad_norm": 1.3125,
      "learning_rate": 0.0006825210412996673,
      "loss": 6.4516,
      "step": 405500
    },
    {
      "epoch": 15.893521237032687,
      "grad_norm": 2.65625,
      "learning_rate": 0.0006821295752593463,
      "loss": 6.4334,
      "step": 406000
    },
    {
      "epoch": 15.913094539048737,
      "grad_norm": 2.203125,
      "learning_rate": 0.0006817381092190252,
      "loss": 6.4318,
      "step": 406500
    },
    {
      "epoch": 15.932667841064788,
      "grad_norm": 1.3828125,
      "learning_rate": 0.0006813466431787042,
      "loss": 6.4306,
      "step": 407000
    },
    {
      "epoch": 15.952241143080837,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0006809551771383833,
      "loss": 6.437,
      "step": 407500
    },
    {
      "epoch": 15.971814445096888,
      "grad_norm": 0.9375,
      "learning_rate": 0.0006805637110980623,
      "loss": 6.4377,
      "step": 408000
    },
    {
      "epoch": 15.991387747112938,
      "grad_norm": 4.25,
      "learning_rate": 0.0006801722450577412,
      "loss": 6.4369,
      "step": 408500
    },
    {
      "epoch": 16.0,
      "eval_loss": 6.43704080581665,
      "eval_runtime": 20.4936,
      "eval_samples_per_second": 97.592,
      "eval_steps_per_second": 6.099,
      "step": 408720
    },
    {
      "epoch": 16.010961049128987,
      "grad_norm": 2.109375,
      "learning_rate": 0.0006797807790174203,
      "loss": 6.427,
      "step": 409000
    },
    {
      "epoch": 16.03053435114504,
      "grad_norm": 1.1953125,
      "learning_rate": 0.0006793893129770993,
      "loss": 6.4358,
      "step": 409500
    },
    {
      "epoch": 16.05010765316109,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0006789978469367783,
      "loss": 6.4409,
      "step": 410000
    },
    {
      "epoch": 16.069680955177137,
      "grad_norm": 1.3828125,
      "learning_rate": 0.0006786063808964572,
      "loss": 6.439,
      "step": 410500
    },
    {
      "epoch": 16.08925425719319,
      "grad_norm": 2.203125,
      "learning_rate": 0.0006782149148561363,
      "loss": 6.4372,
      "step": 411000
    },
    {
      "epoch": 16.10882755920924,
      "grad_norm": 3.09375,
      "learning_rate": 0.0006778234488158153,
      "loss": 6.4322,
      "step": 411500
    },
    {
      "epoch": 16.128400861225288,
      "grad_norm": 1.046875,
      "learning_rate": 0.0006774319827754943,
      "loss": 6.4346,
      "step": 412000
    },
    {
      "epoch": 16.14797416324134,
      "grad_norm": 1.0390625,
      "learning_rate": 0.0006770405167351732,
      "loss": 6.433,
      "step": 412500
    },
    {
      "epoch": 16.16754746525739,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0006766490506948523,
      "loss": 6.4358,
      "step": 413000
    },
    {
      "epoch": 16.187120767273438,
      "grad_norm": 1.9921875,
      "learning_rate": 0.0006762575846545313,
      "loss": 6.4389,
      "step": 413500
    },
    {
      "epoch": 16.20669406928949,
      "grad_norm": 1.03125,
      "learning_rate": 0.0006758661186142103,
      "loss": 6.4403,
      "step": 414000
    },
    {
      "epoch": 16.22626737130554,
      "grad_norm": 5.75,
      "learning_rate": 0.0006754746525738891,
      "loss": 6.4404,
      "step": 414500
    },
    {
      "epoch": 16.245840673321588,
      "grad_norm": 1.5546875,
      "learning_rate": 0.0006750831865335683,
      "loss": 6.439,
      "step": 415000
    },
    {
      "epoch": 16.26541397533764,
      "grad_norm": 3.125,
      "learning_rate": 0.0006746917204932473,
      "loss": 6.4303,
      "step": 415500
    },
    {
      "epoch": 16.28498727735369,
      "grad_norm": 4.40625,
      "learning_rate": 0.0006743002544529263,
      "loss": 6.4322,
      "step": 416000
    },
    {
      "epoch": 16.30456057936974,
      "grad_norm": 2.0625,
      "learning_rate": 0.0006739087884126051,
      "loss": 6.4349,
      "step": 416500
    },
    {
      "epoch": 16.32413388138579,
      "grad_norm": 1.1796875,
      "learning_rate": 0.0006735173223722842,
      "loss": 6.4351,
      "step": 417000
    },
    {
      "epoch": 16.34370718340184,
      "grad_norm": 2.0625,
      "learning_rate": 0.0006731258563319632,
      "loss": 6.4387,
      "step": 417500
    },
    {
      "epoch": 16.36328048541789,
      "grad_norm": 1.46875,
      "learning_rate": 0.0006727343902916421,
      "loss": 6.4356,
      "step": 418000
    },
    {
      "epoch": 16.38285378743394,
      "grad_norm": 1.59375,
      "learning_rate": 0.0006723429242513212,
      "loss": 6.4408,
      "step": 418500
    },
    {
      "epoch": 16.40242708944999,
      "grad_norm": 1.953125,
      "learning_rate": 0.0006719514582110002,
      "loss": 6.4244,
      "step": 419000
    },
    {
      "epoch": 16.42200039146604,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0006715599921706792,
      "loss": 6.4338,
      "step": 419500
    },
    {
      "epoch": 16.44157369348209,
      "grad_norm": 2.25,
      "learning_rate": 0.0006711685261303581,
      "loss": 6.435,
      "step": 420000
    },
    {
      "epoch": 16.46114699549814,
      "grad_norm": 1.125,
      "learning_rate": 0.0006707770600900372,
      "loss": 6.4414,
      "step": 420500
    },
    {
      "epoch": 16.48072029751419,
      "grad_norm": 2.234375,
      "learning_rate": 0.0006703855940497162,
      "loss": 6.4342,
      "step": 421000
    },
    {
      "epoch": 16.50029359953024,
      "grad_norm": 2.5,
      "learning_rate": 0.0006699941280093952,
      "loss": 6.4382,
      "step": 421500
    },
    {
      "epoch": 16.51986690154629,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0006696026619690741,
      "loss": 6.433,
      "step": 422000
    },
    {
      "epoch": 16.53944020356234,
      "grad_norm": 1.4609375,
      "learning_rate": 0.0006692111959287532,
      "loss": 6.4282,
      "step": 422500
    },
    {
      "epoch": 16.559013505578392,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0006688197298884322,
      "loss": 6.4268,
      "step": 423000
    },
    {
      "epoch": 16.57858680759444,
      "grad_norm": 2.5,
      "learning_rate": 0.0006684282638481112,
      "loss": 6.429,
      "step": 423500
    },
    {
      "epoch": 16.59816010961049,
      "grad_norm": 0.9609375,
      "learning_rate": 0.0006680367978077901,
      "loss": 6.4419,
      "step": 424000
    },
    {
      "epoch": 16.617733411626542,
      "grad_norm": 1.234375,
      "learning_rate": 0.0006676453317674692,
      "loss": 6.4278,
      "step": 424500
    },
    {
      "epoch": 16.63730671364259,
      "grad_norm": 1.2265625,
      "learning_rate": 0.0006672538657271482,
      "loss": 6.4343,
      "step": 425000
    },
    {
      "epoch": 16.656880015658643,
      "grad_norm": 1.015625,
      "learning_rate": 0.0006668623996868272,
      "loss": 6.4376,
      "step": 425500
    },
    {
      "epoch": 16.676453317674692,
      "grad_norm": 10.6875,
      "learning_rate": 0.0006664709336465061,
      "loss": 6.4388,
      "step": 426000
    },
    {
      "epoch": 16.69602661969074,
      "grad_norm": 1.015625,
      "learning_rate": 0.0006660794676061852,
      "loss": 6.434,
      "step": 426500
    },
    {
      "epoch": 16.715599921706794,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0006656880015658642,
      "loss": 6.4432,
      "step": 427000
    },
    {
      "epoch": 16.735173223722843,
      "grad_norm": 1.09375,
      "learning_rate": 0.0006652965355255432,
      "loss": 6.4351,
      "step": 427500
    },
    {
      "epoch": 16.75474652573889,
      "grad_norm": 1.484375,
      "learning_rate": 0.0006649050694852222,
      "loss": 6.4306,
      "step": 428000
    },
    {
      "epoch": 16.774319827754944,
      "grad_norm": 1.2421875,
      "learning_rate": 0.0006645136034449012,
      "loss": 6.4337,
      "step": 428500
    },
    {
      "epoch": 16.793893129770993,
      "grad_norm": 1.6640625,
      "learning_rate": 0.0006641221374045802,
      "loss": 6.4353,
      "step": 429000
    },
    {
      "epoch": 16.81346643178704,
      "grad_norm": 1.421875,
      "learning_rate": 0.0006637306713642592,
      "loss": 6.4256,
      "step": 429500
    },
    {
      "epoch": 16.833039733803094,
      "grad_norm": 1.6796875,
      "learning_rate": 0.0006633392053239382,
      "loss": 6.4321,
      "step": 430000
    },
    {
      "epoch": 16.852613035819143,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0006629477392836172,
      "loss": 6.4416,
      "step": 430500
    },
    {
      "epoch": 16.872186337835192,
      "grad_norm": 17.625,
      "learning_rate": 0.0006625562732432962,
      "loss": 6.4376,
      "step": 431000
    },
    {
      "epoch": 16.891759639851244,
      "grad_norm": 1.2109375,
      "learning_rate": 0.0006621648072029752,
      "loss": 6.4386,
      "step": 431500
    },
    {
      "epoch": 16.911332941867293,
      "grad_norm": 2.578125,
      "learning_rate": 0.0006617733411626542,
      "loss": 6.4389,
      "step": 432000
    },
    {
      "epoch": 16.930906243883342,
      "grad_norm": 2.59375,
      "learning_rate": 0.0006613818751223332,
      "loss": 6.4396,
      "step": 432500
    },
    {
      "epoch": 16.950479545899395,
      "grad_norm": 4.40625,
      "learning_rate": 0.0006609904090820122,
      "loss": 6.4354,
      "step": 433000
    },
    {
      "epoch": 16.970052847915444,
      "grad_norm": 1.359375,
      "learning_rate": 0.000660598943041691,
      "loss": 6.4352,
      "step": 433500
    },
    {
      "epoch": 16.989626149931492,
      "grad_norm": 1.109375,
      "learning_rate": 0.0006602074770013702,
      "loss": 6.4347,
      "step": 434000
    },
    {
      "epoch": 17.0,
      "eval_loss": 6.436838150024414,
      "eval_runtime": 20.9495,
      "eval_samples_per_second": 95.467,
      "eval_steps_per_second": 5.967,
      "step": 434265
    }
  ],
  "logging_steps": 500,
  "max_steps": 1277250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3275457063057981e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}