|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9905341446923597, |
|
"eval_steps": 46, |
|
"global_step": 368, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005409060175794456, |
|
"grad_norm": 44.9358828224533, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 3.0416, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005409060175794456, |
|
"eval_loss": 3.2299506664276123, |
|
"eval_runtime": 80.3263, |
|
"eval_samples_per_second": 15.512, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010818120351588911, |
|
"grad_norm": 43.82162868015869, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 3.0979, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.016227180527383367, |
|
"grad_norm": 49.66354655578431, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 3.229, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.021636240703177823, |
|
"grad_norm": 44.90350470119676, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 3.1823, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.027045300878972278, |
|
"grad_norm": 45.64225015268925, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 3.1503, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.032454361054766734, |
|
"grad_norm": 43.46015056839171, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 2.9786, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.03786342123056119, |
|
"grad_norm": 39.8236071706638, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 2.7861, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.043272481406355645, |
|
"grad_norm": 18.849036433644883, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 2.6546, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0486815415821501, |
|
"grad_norm": 18.842114481944215, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 2.685, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.054090601757944556, |
|
"grad_norm": 19.369280399976688, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.6052, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05949966193373901, |
|
"grad_norm": 16.167411037120388, |
|
"learning_rate": 2.2e-06, |
|
"loss": 2.4654, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.06490872210953347, |
|
"grad_norm": 14.946675935925768, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 2.4427, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07031778228532792, |
|
"grad_norm": 13.586548911059968, |
|
"learning_rate": 2.6e-06, |
|
"loss": 2.5375, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.07572684246112238, |
|
"grad_norm": 9.968943698698908, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 2.316, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08113590263691683, |
|
"grad_norm": 10.416045938582544, |
|
"learning_rate": 3e-06, |
|
"loss": 2.4094, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08654496281271129, |
|
"grad_norm": 9.298587355474794, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 2.3988, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.09195402298850575, |
|
"grad_norm": 8.609391957088402, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 2.3259, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0973630831643002, |
|
"grad_norm": 7.68261305571874, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 2.2034, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.10277214334009466, |
|
"grad_norm": 8.199455018530278, |
|
"learning_rate": 3.8000000000000005e-06, |
|
"loss": 2.1983, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.10818120351588911, |
|
"grad_norm": 7.229515373269984, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.3398, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11359026369168357, |
|
"grad_norm": 8.94152448817592, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 2.2895, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.11899932386747802, |
|
"grad_norm": 7.678296028460811, |
|
"learning_rate": 4.4e-06, |
|
"loss": 2.2891, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.12440838404327248, |
|
"grad_norm": 8.007524747470107, |
|
"learning_rate": 4.600000000000001e-06, |
|
"loss": 2.2389, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.12981744421906694, |
|
"grad_norm": 7.352151792895566, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 2.2593, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1352265043948614, |
|
"grad_norm": 7.103281407665676, |
|
"learning_rate": 5e-06, |
|
"loss": 2.1577, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14063556457065585, |
|
"grad_norm": 6.919318449883582, |
|
"learning_rate": 4.999955579111413e-06, |
|
"loss": 2.2622, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1460446247464503, |
|
"grad_norm": 7.072321363589398, |
|
"learning_rate": 4.999822318024222e-06, |
|
"loss": 2.1805, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.15145368492224476, |
|
"grad_norm": 8.556428262133437, |
|
"learning_rate": 4.999600221474089e-06, |
|
"loss": 2.2209, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1568627450980392, |
|
"grad_norm": 6.318203548443499, |
|
"learning_rate": 4.999289297353593e-06, |
|
"loss": 2.1821, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.16227180527383367, |
|
"grad_norm": 7.959416954439196, |
|
"learning_rate": 4.998889556711958e-06, |
|
"loss": 2.1423, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16768086544962812, |
|
"grad_norm": 7.493305387455565, |
|
"learning_rate": 4.9984010137546475e-06, |
|
"loss": 2.1499, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.17308992562542258, |
|
"grad_norm": 7.399157248009571, |
|
"learning_rate": 4.997823685842875e-06, |
|
"loss": 2.1654, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.17849898580121704, |
|
"grad_norm": 7.095225049901362, |
|
"learning_rate": 4.997157593492974e-06, |
|
"loss": 2.2062, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1839080459770115, |
|
"grad_norm": 6.544205100152826, |
|
"learning_rate": 4.996402760375676e-06, |
|
"loss": 2.1302, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.18931710615280595, |
|
"grad_norm": 8.497617899265414, |
|
"learning_rate": 4.995559213315267e-06, |
|
"loss": 2.2452, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1947261663286004, |
|
"grad_norm": 7.711468079457363, |
|
"learning_rate": 4.9946269822886335e-06, |
|
"loss": 2.1562, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.20013522650439486, |
|
"grad_norm": 6.828572393494306, |
|
"learning_rate": 4.993606100424202e-06, |
|
"loss": 2.0838, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.20554428668018931, |
|
"grad_norm": 7.829101006960679, |
|
"learning_rate": 4.992496604000756e-06, |
|
"loss": 2.1608, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.21095334685598377, |
|
"grad_norm": 7.0790174546853954, |
|
"learning_rate": 4.991298532446149e-06, |
|
"loss": 2.1385, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.21636240703177823, |
|
"grad_norm": 6.4583828883763825, |
|
"learning_rate": 4.9900119283359025e-06, |
|
"loss": 2.1094, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22177146720757268, |
|
"grad_norm": 6.919640648587095, |
|
"learning_rate": 4.988636837391696e-06, |
|
"loss": 2.2184, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.22718052738336714, |
|
"grad_norm": 7.054119878110095, |
|
"learning_rate": 4.987173308479738e-06, |
|
"loss": 2.1584, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2325895875591616, |
|
"grad_norm": 8.011842535646375, |
|
"learning_rate": 4.985621393609032e-06, |
|
"loss": 2.2022, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.23799864773495605, |
|
"grad_norm": 7.872258963246048, |
|
"learning_rate": 4.98398114792953e-06, |
|
"loss": 2.0779, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2434077079107505, |
|
"grad_norm": 7.5559333876351085, |
|
"learning_rate": 4.982252629730167e-06, |
|
"loss": 2.0195, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.24881676808654496, |
|
"grad_norm": 7.7955956497680985, |
|
"learning_rate": 4.980435900436793e-06, |
|
"loss": 2.0475, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.24881676808654496, |
|
"eval_loss": 2.1143910884857178, |
|
"eval_runtime": 80.1839, |
|
"eval_samples_per_second": 15.539, |
|
"eval_steps_per_second": 1.946, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2542258282623394, |
|
"grad_norm": 6.802229063375309, |
|
"learning_rate": 4.978531024609994e-06, |
|
"loss": 2.0305, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.25963488843813387, |
|
"grad_norm": 7.592805462348185, |
|
"learning_rate": 4.9765380699427905e-06, |
|
"loss": 2.0484, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.2650439486139283, |
|
"grad_norm": 7.243742353954717, |
|
"learning_rate": 4.9744571072582365e-06, |
|
"loss": 2.1142, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2704530087897228, |
|
"grad_norm": 7.924276414192629, |
|
"learning_rate": 4.972288210506902e-06, |
|
"loss": 2.0623, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 7.407311330367973, |
|
"learning_rate": 4.970031456764242e-06, |
|
"loss": 2.032, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.2812711291413117, |
|
"grad_norm": 7.756849919749466, |
|
"learning_rate": 4.967686926227862e-06, |
|
"loss": 1.9791, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.28668018931710615, |
|
"grad_norm": 6.01269466571588, |
|
"learning_rate": 4.965254702214668e-06, |
|
"loss": 2.0046, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.2920892494929006, |
|
"grad_norm": 6.9870832241271685, |
|
"learning_rate": 4.9627348711578996e-06, |
|
"loss": 2.1095, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.29749830966869506, |
|
"grad_norm": 9.083794116875618, |
|
"learning_rate": 4.960127522604065e-06, |
|
"loss": 2.0736, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3029073698444895, |
|
"grad_norm": 7.096143715762253, |
|
"learning_rate": 4.957432749209755e-06, |
|
"loss": 1.9896, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.30831643002028397, |
|
"grad_norm": 5.7983259418594715, |
|
"learning_rate": 4.954650646738354e-06, |
|
"loss": 1.944, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3137254901960784, |
|
"grad_norm": 6.984280635078145, |
|
"learning_rate": 4.951781314056633e-06, |
|
"loss": 2.0276, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3191345503718729, |
|
"grad_norm": 6.88721515440217, |
|
"learning_rate": 4.948824853131237e-06, |
|
"loss": 2.0442, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.32454361054766734, |
|
"grad_norm": 7.821091549898941, |
|
"learning_rate": 4.9457813690250635e-06, |
|
"loss": 2.0072, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3299526707234618, |
|
"grad_norm": 5.64425372355818, |
|
"learning_rate": 4.942650969893527e-06, |
|
"loss": 1.994, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.33536173089925625, |
|
"grad_norm": 6.2844987961209675, |
|
"learning_rate": 4.939433766980717e-06, |
|
"loss": 2.0615, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3407707910750507, |
|
"grad_norm": 7.116345411158451, |
|
"learning_rate": 4.936129874615443e-06, |
|
"loss": 2.048, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.34617985125084516, |
|
"grad_norm": 6.7059981437853375, |
|
"learning_rate": 4.932739410207172e-06, |
|
"loss": 1.899, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.3515889114266396, |
|
"grad_norm": 6.681993008670398, |
|
"learning_rate": 4.929262494241859e-06, |
|
"loss": 1.9566, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.35699797160243407, |
|
"grad_norm": 6.496917926567058, |
|
"learning_rate": 4.9256992502776605e-06, |
|
"loss": 1.956, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.36240703177822853, |
|
"grad_norm": 6.708931079572326, |
|
"learning_rate": 4.922049804940546e-06, |
|
"loss": 1.9967, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.367816091954023, |
|
"grad_norm": 5.620040634218876, |
|
"learning_rate": 4.9183142879198e-06, |
|
"loss": 1.916, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.37322515212981744, |
|
"grad_norm": 7.850717902889795, |
|
"learning_rate": 4.914492831963411e-06, |
|
"loss": 1.941, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.3786342123056119, |
|
"grad_norm": 7.497086645607299, |
|
"learning_rate": 4.910585572873355e-06, |
|
"loss": 1.9286, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.38404327248140635, |
|
"grad_norm": 10.987140598022854, |
|
"learning_rate": 4.906592649500767e-06, |
|
"loss": 1.8511, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.3894523326572008, |
|
"grad_norm": 6.780385991870079, |
|
"learning_rate": 4.902514203741013e-06, |
|
"loss": 1.9953, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.39486139283299526, |
|
"grad_norm": 6.69524986070073, |
|
"learning_rate": 4.898350380528638e-06, |
|
"loss": 1.9524, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4002704530087897, |
|
"grad_norm": 6.1699263921182474, |
|
"learning_rate": 4.894101327832225e-06, |
|
"loss": 1.8258, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4056795131845842, |
|
"grad_norm": 6.732182374500395, |
|
"learning_rate": 4.8897671966491315e-06, |
|
"loss": 1.9725, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.41108857336037863, |
|
"grad_norm": 6.598471225386019, |
|
"learning_rate": 4.8853481410001225e-06, |
|
"loss": 1.8993, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4164976335361731, |
|
"grad_norm": 6.93174108718265, |
|
"learning_rate": 4.8808443179239025e-06, |
|
"loss": 1.9447, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.42190669371196754, |
|
"grad_norm": 6.726165747858891, |
|
"learning_rate": 4.87625588747153e-06, |
|
"loss": 1.9723, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.427315753887762, |
|
"grad_norm": 5.798399986576307, |
|
"learning_rate": 4.87158301270073e-06, |
|
"loss": 1.9677, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.43272481406355645, |
|
"grad_norm": 6.420619711799955, |
|
"learning_rate": 4.8668258596701035e-06, |
|
"loss": 1.9731, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4381338742393509, |
|
"grad_norm": 8.374770038931102, |
|
"learning_rate": 4.861984597433223e-06, |
|
"loss": 1.9635, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.44354293441514536, |
|
"grad_norm": 6.2592251917152595, |
|
"learning_rate": 4.857059398032622e-06, |
|
"loss": 1.9228, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.4489519945909398, |
|
"grad_norm": 6.759714436705178, |
|
"learning_rate": 4.85205043649369e-06, |
|
"loss": 1.9456, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.4543610547667343, |
|
"grad_norm": 7.0963936709641615, |
|
"learning_rate": 4.846957890818444e-06, |
|
"loss": 1.9217, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 6.941342449698633, |
|
"learning_rate": 4.841781941979207e-06, |
|
"loss": 1.7469, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.4651791751183232, |
|
"grad_norm": 7.445994149780175, |
|
"learning_rate": 4.836522773912178e-06, |
|
"loss": 1.9254, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 6.588581579766995, |
|
"learning_rate": 4.83118057351089e-06, |
|
"loss": 1.8967, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.4759972954699121, |
|
"grad_norm": 7.117087421099954, |
|
"learning_rate": 4.825755530619576e-06, |
|
"loss": 1.8865, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.48140635564570655, |
|
"grad_norm": 6.724049881659424, |
|
"learning_rate": 4.820247838026414e-06, |
|
"loss": 1.9435, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.486815415821501, |
|
"grad_norm": 5.6973534950025595, |
|
"learning_rate": 4.814657691456685e-06, |
|
"loss": 1.7882, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.49222447599729546, |
|
"grad_norm": 6.388481680244839, |
|
"learning_rate": 4.808985289565813e-06, |
|
"loss": 1.8173, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.4976335361730899, |
|
"grad_norm": 6.3977034813469835, |
|
"learning_rate": 4.803230833932302e-06, |
|
"loss": 1.8489, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.4976335361730899, |
|
"eval_loss": 1.89235520362854, |
|
"eval_runtime": 80.2557, |
|
"eval_samples_per_second": 15.525, |
|
"eval_steps_per_second": 1.944, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5030425963488844, |
|
"grad_norm": 7.046151003888674, |
|
"learning_rate": 4.797394529050577e-06, |
|
"loss": 1.9349, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5084516565246788, |
|
"grad_norm": 6.590771586445206, |
|
"learning_rate": 4.791476582323719e-06, |
|
"loss": 1.9962, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5138607167004733, |
|
"grad_norm": 7.166866163450392, |
|
"learning_rate": 4.785477204056089e-06, |
|
"loss": 1.8174, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5192697768762677, |
|
"grad_norm": 6.716770374519456, |
|
"learning_rate": 4.779396607445858e-06, |
|
"loss": 1.9214, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5246788370520622, |
|
"grad_norm": 7.244507141710972, |
|
"learning_rate": 4.77323500857743e-06, |
|
"loss": 1.8681, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.5300878972278567, |
|
"grad_norm": 6.3217358598893165, |
|
"learning_rate": 4.7669926264137625e-06, |
|
"loss": 1.8825, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.5354969574036511, |
|
"grad_norm": 5.77085923138114, |
|
"learning_rate": 4.760669682788584e-06, |
|
"loss": 1.7838, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.5409060175794456, |
|
"grad_norm": 5.638989379158351, |
|
"learning_rate": 4.754266402398517e-06, |
|
"loss": 1.7818, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.54631507775524, |
|
"grad_norm": 6.283880096623512, |
|
"learning_rate": 4.747783012795083e-06, |
|
"loss": 1.7555, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 7.08060002757244, |
|
"learning_rate": 4.741219744376624e-06, |
|
"loss": 1.8585, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.5571331981068289, |
|
"grad_norm": 7.077388397562155, |
|
"learning_rate": 4.734576830380113e-06, |
|
"loss": 1.7837, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.5625422582826234, |
|
"grad_norm": 6.093542681362889, |
|
"learning_rate": 4.727854506872863e-06, |
|
"loss": 1.7841, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.5679513184584178, |
|
"grad_norm": 6.250665388589361, |
|
"learning_rate": 4.721053012744142e-06, |
|
"loss": 1.8065, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5733603786342123, |
|
"grad_norm": 7.137442110609306, |
|
"learning_rate": 4.71417258969668e-06, |
|
"loss": 1.8022, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.5787694388100068, |
|
"grad_norm": 6.025245815328945, |
|
"learning_rate": 4.70721348223808e-06, |
|
"loss": 1.8129, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.5841784989858012, |
|
"grad_norm": 6.920748676536336, |
|
"learning_rate": 4.700175937672134e-06, |
|
"loss": 1.8498, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.5895875591615957, |
|
"grad_norm": 7.6481497850706415, |
|
"learning_rate": 4.693060206090028e-06, |
|
"loss": 1.7777, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.5949966193373901, |
|
"grad_norm": 7.666186710495507, |
|
"learning_rate": 4.685866540361456e-06, |
|
"loss": 1.834, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6004056795131846, |
|
"grad_norm": 7.021331030038572, |
|
"learning_rate": 4.678595196125638e-06, |
|
"loss": 1.8563, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.605814739688979, |
|
"grad_norm": 6.487310124057107, |
|
"learning_rate": 4.671246431782234e-06, |
|
"loss": 1.7152, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6112237998647735, |
|
"grad_norm": 6.492879712741564, |
|
"learning_rate": 4.6638205084821544e-06, |
|
"loss": 1.7396, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6166328600405679, |
|
"grad_norm": 6.932406364697437, |
|
"learning_rate": 4.656317690118291e-06, |
|
"loss": 1.6753, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6220419202163624, |
|
"grad_norm": 6.635109301432599, |
|
"learning_rate": 4.648738243316128e-06, |
|
"loss": 1.7832, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6274509803921569, |
|
"grad_norm": 6.6861559036147975, |
|
"learning_rate": 4.641082437424277e-06, |
|
"loss": 1.7703, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.6328600405679513, |
|
"grad_norm": 6.9646373816737075, |
|
"learning_rate": 4.633350544504899e-06, |
|
"loss": 1.7764, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.6382691007437458, |
|
"grad_norm": 6.144269305268074, |
|
"learning_rate": 4.625542839324036e-06, |
|
"loss": 1.8115, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.6436781609195402, |
|
"grad_norm": 6.489513509921581, |
|
"learning_rate": 4.617659599341849e-06, |
|
"loss": 1.7991, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.6490872210953347, |
|
"grad_norm": 6.883770839388401, |
|
"learning_rate": 4.609701104702759e-06, |
|
"loss": 1.6693, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6544962812711291, |
|
"grad_norm": 7.5470889656860605, |
|
"learning_rate": 4.6016676382254895e-06, |
|
"loss": 1.7072, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.6599053414469236, |
|
"grad_norm": 7.546505121169261, |
|
"learning_rate": 4.593559485393015e-06, |
|
"loss": 1.7841, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.665314401622718, |
|
"grad_norm": 7.028634956829483, |
|
"learning_rate": 4.585376934342418e-06, |
|
"loss": 1.8216, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.6707234617985125, |
|
"grad_norm": 6.173100762882487, |
|
"learning_rate": 4.577120275854649e-06, |
|
"loss": 1.7126, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.676132521974307, |
|
"grad_norm": 7.3755207955713, |
|
"learning_rate": 4.568789803344196e-06, |
|
"loss": 1.6914, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.6815415821501014, |
|
"grad_norm": 7.110847605538237, |
|
"learning_rate": 4.56038581284865e-06, |
|
"loss": 1.7385, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.6869506423258959, |
|
"grad_norm": 6.361787130642339, |
|
"learning_rate": 4.551908603018191e-06, |
|
"loss": 1.6135, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.6923597025016903, |
|
"grad_norm": 6.445020674626523, |
|
"learning_rate": 4.543358475104975e-06, |
|
"loss": 1.709, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.6977687626774848, |
|
"grad_norm": 9.06412704312379, |
|
"learning_rate": 4.5347357329524254e-06, |
|
"loss": 1.846, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7031778228532792, |
|
"grad_norm": 6.7066499384595675, |
|
"learning_rate": 4.5260406829844364e-06, |
|
"loss": 1.7074, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7085868830290737, |
|
"grad_norm": 5.861118356970615, |
|
"learning_rate": 4.5172736341944845e-06, |
|
"loss": 1.7179, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.7139959432048681, |
|
"grad_norm": 7.102544513000641, |
|
"learning_rate": 4.5084348981346495e-06, |
|
"loss": 1.7577, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.7194050033806626, |
|
"grad_norm": 10.027416022472378, |
|
"learning_rate": 4.499524788904537e-06, |
|
"loss": 1.6513, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.7248140635564571, |
|
"grad_norm": 6.648158309890374, |
|
"learning_rate": 4.490543623140123e-06, |
|
"loss": 1.6589, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.7302231237322515, |
|
"grad_norm": 6.662339022641756, |
|
"learning_rate": 4.481491720002499e-06, |
|
"loss": 1.6705, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.735632183908046, |
|
"grad_norm": 6.0555935222186665, |
|
"learning_rate": 4.472369401166531e-06, |
|
"loss": 1.7424, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.7410412440838404, |
|
"grad_norm": 8.895889108381375, |
|
"learning_rate": 4.463176990809423e-06, |
|
"loss": 1.7386, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.7464503042596349, |
|
"grad_norm": 7.003045853961112, |
|
"learning_rate": 4.453914815599206e-06, |
|
"loss": 1.7036, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.7464503042596349, |
|
"eval_loss": 1.716886281967163, |
|
"eval_runtime": 80.2672, |
|
"eval_samples_per_second": 15.523, |
|
"eval_steps_per_second": 1.944, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.7518593644354293, |
|
"grad_norm": 6.693085585003622, |
|
"learning_rate": 4.444583204683123e-06, |
|
"loss": 1.7129, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.7572684246112238, |
|
"grad_norm": 7.502871529343553, |
|
"learning_rate": 4.435182489675931e-06, |
|
"loss": 1.6455, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7626774847870182, |
|
"grad_norm": 6.062617524551209, |
|
"learning_rate": 4.425713004648123e-06, |
|
"loss": 1.5962, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.7680865449628127, |
|
"grad_norm": 7.073651768871659, |
|
"learning_rate": 4.416175086114049e-06, |
|
"loss": 1.6784, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.7734956051386072, |
|
"grad_norm": 5.885299008267407, |
|
"learning_rate": 4.406569073019965e-06, |
|
"loss": 1.7525, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.7789046653144016, |
|
"grad_norm": 6.313009716756996, |
|
"learning_rate": 4.396895306731978e-06, |
|
"loss": 1.7347, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 7.009990810809474, |
|
"learning_rate": 4.387154131023924e-06, |
|
"loss": 1.691, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.7897227856659905, |
|
"grad_norm": 6.900733353908871, |
|
"learning_rate": 4.377345892065149e-06, |
|
"loss": 1.6789, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.795131845841785, |
|
"grad_norm": 6.705414309211519, |
|
"learning_rate": 4.367470938408204e-06, |
|
"loss": 1.6024, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.8005409060175794, |
|
"grad_norm": 5.746533610111662, |
|
"learning_rate": 4.357529620976463e-06, |
|
"loss": 1.5715, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.8059499661933739, |
|
"grad_norm": 6.344927493402156, |
|
"learning_rate": 4.3475222930516484e-06, |
|
"loss": 1.6393, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.8113590263691683, |
|
"grad_norm": 7.177833196869734, |
|
"learning_rate": 4.337449310261279e-06, |
|
"loss": 1.7165, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8167680865449628, |
|
"grad_norm": 7.330166717470119, |
|
"learning_rate": 4.327311030566033e-06, |
|
"loss": 1.5549, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.8221771467207573, |
|
"grad_norm": 6.615833346853653, |
|
"learning_rate": 4.317107814247022e-06, |
|
"loss": 1.5402, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 6.435680171929204, |
|
"learning_rate": 4.306840023892998e-06, |
|
"loss": 1.6594, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.8329952670723462, |
|
"grad_norm": 8.276922490588758, |
|
"learning_rate": 4.2965080243874555e-06, |
|
"loss": 1.7175, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.8384043272481406, |
|
"grad_norm": 6.944104925297799, |
|
"learning_rate": 4.2861121828956745e-06, |
|
"loss": 1.6139, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.8438133874239351, |
|
"grad_norm": 6.95913091453491, |
|
"learning_rate": 4.275652868851669e-06, |
|
"loss": 1.6509, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.8492224475997295, |
|
"grad_norm": 6.520552325908588, |
|
"learning_rate": 4.265130453945056e-06, |
|
"loss": 1.6153, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.854631507775524, |
|
"grad_norm": 5.793083719201756, |
|
"learning_rate": 4.254545312107854e-06, |
|
"loss": 1.6652, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.8600405679513184, |
|
"grad_norm": 7.087661599168472, |
|
"learning_rate": 4.243897819501187e-06, |
|
"loss": 1.5381, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.8654496281271129, |
|
"grad_norm": 7.094455948395506, |
|
"learning_rate": 4.233188354501921e-06, |
|
"loss": 1.5494, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8708586883029074, |
|
"grad_norm": 8.203152015742518, |
|
"learning_rate": 4.222417297689217e-06, |
|
"loss": 1.6491, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.8762677484787018, |
|
"grad_norm": 6.453682013406382, |
|
"learning_rate": 4.211585031831007e-06, |
|
"loss": 1.6017, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.8816768086544963, |
|
"grad_norm": 5.936146356941861, |
|
"learning_rate": 4.200691941870392e-06, |
|
"loss": 1.578, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.8870858688302907, |
|
"grad_norm": 6.939707441726559, |
|
"learning_rate": 4.189738414911959e-06, |
|
"loss": 1.6267, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.8924949290060852, |
|
"grad_norm": 5.410313740063528, |
|
"learning_rate": 4.178724840208029e-06, |
|
"loss": 1.4958, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8979039891818796, |
|
"grad_norm": 6.696136302418629, |
|
"learning_rate": 4.167651609144822e-06, |
|
"loss": 1.5497, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.9033130493576741, |
|
"grad_norm": 7.906312937190654, |
|
"learning_rate": 4.15651911522855e-06, |
|
"loss": 1.5839, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.9087221095334685, |
|
"grad_norm": 6.535495007076388, |
|
"learning_rate": 4.145327754071427e-06, |
|
"loss": 1.5472, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.914131169709263, |
|
"grad_norm": 5.836290315234971, |
|
"learning_rate": 4.134077923377622e-06, |
|
"loss": 1.552, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 6.8226833628498476, |
|
"learning_rate": 4.122770022929114e-06, |
|
"loss": 1.5789, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9249492900608519, |
|
"grad_norm": 6.636708734597821, |
|
"learning_rate": 4.1114044545714935e-06, |
|
"loss": 1.5297, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.9303583502366464, |
|
"grad_norm": 6.480166160413371, |
|
"learning_rate": 4.0999816221996755e-06, |
|
"loss": 1.64, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.9357674104124408, |
|
"grad_norm": 6.305918232446464, |
|
"learning_rate": 4.088501931743551e-06, |
|
"loss": 1.6244, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 6.244739864099848, |
|
"learning_rate": 4.076965791153562e-06, |
|
"loss": 1.4548, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.9465855307640297, |
|
"grad_norm": 6.652315653413243, |
|
"learning_rate": 4.065373610386201e-06, |
|
"loss": 1.6279, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.9519945909398242, |
|
"grad_norm": 7.0369918458505, |
|
"learning_rate": 4.0537258013894434e-06, |
|
"loss": 1.5423, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.9574036511156186, |
|
"grad_norm": 6.219650042475582, |
|
"learning_rate": 4.042022778088111e-06, |
|
"loss": 1.6608, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.9628127112914131, |
|
"grad_norm": 6.876925788244563, |
|
"learning_rate": 4.030264956369158e-06, |
|
"loss": 1.5072, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.9682217714672076, |
|
"grad_norm": 6.90949169776309, |
|
"learning_rate": 4.018452754066895e-06, |
|
"loss": 1.6312, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.973630831643002, |
|
"grad_norm": 6.8575340845851604, |
|
"learning_rate": 4.006586590948141e-06, |
|
"loss": 1.5603, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9790398918187965, |
|
"grad_norm": 6.399141972511512, |
|
"learning_rate": 3.994666888697304e-06, |
|
"loss": 1.4676, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.9844489519945909, |
|
"grad_norm": 6.8804715530154255, |
|
"learning_rate": 3.982694070901396e-06, |
|
"loss": 1.5238, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.9898580121703854, |
|
"grad_norm": 6.143146424939786, |
|
"learning_rate": 3.970668563034982e-06, |
|
"loss": 1.5458, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.9952670723461798, |
|
"grad_norm": 7.1826166940238005, |
|
"learning_rate": 3.958590792445057e-06, |
|
"loss": 1.5179, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.9952670723461798, |
|
"eval_loss": 1.555787205696106, |
|
"eval_runtime": 80.2289, |
|
"eval_samples_per_second": 15.531, |
|
"eval_steps_per_second": 1.944, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.0006761325219744, |
|
"grad_norm": 6.359754610852221, |
|
"learning_rate": 3.946461188335863e-06, |
|
"loss": 1.4166, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0060851926977687, |
|
"grad_norm": 6.558163414784584, |
|
"learning_rate": 3.934280181753634e-06, |
|
"loss": 1.2751, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.0114942528735633, |
|
"grad_norm": 7.219629642258684, |
|
"learning_rate": 3.922048205571279e-06, |
|
"loss": 1.1886, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.0169033130493577, |
|
"grad_norm": 5.641940967289977, |
|
"learning_rate": 3.909765694473e-06, |
|
"loss": 1.162, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.0223123732251522, |
|
"grad_norm": 5.146358119235958, |
|
"learning_rate": 3.897433084938841e-06, |
|
"loss": 1.1985, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.0277214334009466, |
|
"grad_norm": 6.030205789253815, |
|
"learning_rate": 3.885050815229182e-06, |
|
"loss": 1.2081, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0331304935767411, |
|
"grad_norm": 7.022365747053938, |
|
"learning_rate": 3.872619325369162e-06, |
|
"loss": 1.2893, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.0385395537525355, |
|
"grad_norm": 5.333726155190956, |
|
"learning_rate": 3.860139057133042e-06, |
|
"loss": 1.0908, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.04394861392833, |
|
"grad_norm": 5.45665767931003, |
|
"learning_rate": 3.8476104540285054e-06, |
|
"loss": 1.1623, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.0493576741041244, |
|
"grad_norm": 5.712839244673719, |
|
"learning_rate": 3.835033961280898e-06, |
|
"loss": 1.2006, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.054766734279919, |
|
"grad_norm": 5.034751006183729, |
|
"learning_rate": 3.8224100258174066e-06, |
|
"loss": 1.1717, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.0601757944557133, |
|
"grad_norm": 5.683598056495194, |
|
"learning_rate": 3.809739096251176e-06, |
|
"loss": 1.0888, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.0655848546315079, |
|
"grad_norm": 5.519253752312427, |
|
"learning_rate": 3.7970216228653667e-06, |
|
"loss": 1.1504, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.0709939148073022, |
|
"grad_norm": 5.447708800800649, |
|
"learning_rate": 3.7842580575971533e-06, |
|
"loss": 1.1493, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.0764029749830968, |
|
"grad_norm": 5.433128433416691, |
|
"learning_rate": 3.7714488540216637e-06, |
|
"loss": 1.2068, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.0818120351588911, |
|
"grad_norm": 5.161001564479917, |
|
"learning_rate": 3.7585944673358632e-06, |
|
"loss": 1.061, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0872210953346857, |
|
"grad_norm": 6.0824539385903895, |
|
"learning_rate": 3.745695354342374e-06, |
|
"loss": 1.0569, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.09263015551048, |
|
"grad_norm": 5.834270627588388, |
|
"learning_rate": 3.7327519734332453e-06, |
|
"loss": 1.1536, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.0980392156862746, |
|
"grad_norm": 5.775803225686334, |
|
"learning_rate": 3.7197647845736616e-06, |
|
"loss": 1.0768, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 5.875985815161611, |
|
"learning_rate": 3.7067342492855997e-06, |
|
"loss": 1.0848, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.1088573360378635, |
|
"grad_norm": 5.252887507709333, |
|
"learning_rate": 3.6936608306314227e-06, |
|
"loss": 1.0704, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.1142663962136579, |
|
"grad_norm": 5.6931233725933375, |
|
"learning_rate": 3.6805449931974313e-06, |
|
"loss": 1.0765, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.1196754563894524, |
|
"grad_norm": 5.48726034244755, |
|
"learning_rate": 3.6673872030773473e-06, |
|
"loss": 1.1818, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.1250845165652468, |
|
"grad_norm": 6.429761402145819, |
|
"learning_rate": 3.654187927855754e-06, |
|
"loss": 1.0093, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.1304935767410413, |
|
"grad_norm": 6.370439411738345, |
|
"learning_rate": 3.6409476365914786e-06, |
|
"loss": 1.1248, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.1359026369168357, |
|
"grad_norm": 5.059354487980713, |
|
"learning_rate": 3.6276667998009242e-06, |
|
"loss": 1.1993, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1413116970926303, |
|
"grad_norm": 5.240968702755266, |
|
"learning_rate": 3.6143458894413463e-06, |
|
"loss": 1.1065, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.1467207572684246, |
|
"grad_norm": 6.6242597409867425, |
|
"learning_rate": 3.600985378894086e-06, |
|
"loss": 1.1479, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.1521298174442192, |
|
"grad_norm": 5.568795571449795, |
|
"learning_rate": 3.5875857429477447e-06, |
|
"loss": 1.1048, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.1575388776200135, |
|
"grad_norm": 5.421352959165325, |
|
"learning_rate": 3.5741474577813086e-06, |
|
"loss": 1.1126, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.162947937795808, |
|
"grad_norm": 4.977758792843085, |
|
"learning_rate": 3.5606710009472335e-06, |
|
"loss": 1.0064, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.1683569979716024, |
|
"grad_norm": 5.911407929293024, |
|
"learning_rate": 3.54715685135447e-06, |
|
"loss": 1.1859, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.173766058147397, |
|
"grad_norm": 5.557031565026226, |
|
"learning_rate": 3.5336054892514437e-06, |
|
"loss": 1.1526, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.1791751183231913, |
|
"grad_norm": 5.900696316987968, |
|
"learning_rate": 3.520017396208993e-06, |
|
"loss": 1.0158, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.184584178498986, |
|
"grad_norm": 5.287116454496747, |
|
"learning_rate": 3.5063930551032494e-06, |
|
"loss": 1.0677, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.1899932386747802, |
|
"grad_norm": 5.106236012898038, |
|
"learning_rate": 3.4927329500984857e-06, |
|
"loss": 1.0827, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1954022988505748, |
|
"grad_norm": 4.8246900511041595, |
|
"learning_rate": 3.4790375666299026e-06, |
|
"loss": 1.1788, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.2008113590263692, |
|
"grad_norm": 5.545181164784277, |
|
"learning_rate": 3.465307391386383e-06, |
|
"loss": 1.1896, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.2062204192021637, |
|
"grad_norm": 5.427546334067872, |
|
"learning_rate": 3.4515429122931955e-06, |
|
"loss": 1.0658, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.211629479377958, |
|
"grad_norm": 5.245977350973333, |
|
"learning_rate": 3.437744618494653e-06, |
|
"loss": 1.1658, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.2170385395537526, |
|
"grad_norm": 5.043336556534448, |
|
"learning_rate": 3.423913000336732e-06, |
|
"loss": 1.098, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.222447599729547, |
|
"grad_norm": 5.579610380422948, |
|
"learning_rate": 3.41004854934965e-06, |
|
"loss": 1.0795, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.2278566599053415, |
|
"grad_norm": 6.096777208300464, |
|
"learning_rate": 3.3961517582303916e-06, |
|
"loss": 1.0956, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.2332657200811359, |
|
"grad_norm": 5.2061771582860645, |
|
"learning_rate": 3.3822231208252053e-06, |
|
"loss": 1.1306, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.2386747802569305, |
|
"grad_norm": 5.182819955042982, |
|
"learning_rate": 3.3682631321120507e-06, |
|
"loss": 1.1009, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.2440838404327248, |
|
"grad_norm": 5.625028453640487, |
|
"learning_rate": 3.354272288183012e-06, |
|
"loss": 1.1819, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2440838404327248, |
|
"eval_loss": 1.463145136833191, |
|
"eval_runtime": 80.1754, |
|
"eval_samples_per_second": 15.541, |
|
"eval_steps_per_second": 1.946, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2494929006085194, |
|
"grad_norm": 5.590526458310239, |
|
"learning_rate": 3.340251086226663e-06, |
|
"loss": 0.9821, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.2549019607843137, |
|
"grad_norm": 11.81361368808831, |
|
"learning_rate": 3.326200024510405e-06, |
|
"loss": 1.1092, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.260311020960108, |
|
"grad_norm": 5.093824480458287, |
|
"learning_rate": 3.3121196023627543e-06, |
|
"loss": 1.1585, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.2657200811359026, |
|
"grad_norm": 5.166324447334581, |
|
"learning_rate": 3.2980103201556023e-06, |
|
"loss": 1.157, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.2711291413116972, |
|
"grad_norm": 5.637337054230702, |
|
"learning_rate": 3.2838726792864315e-06, |
|
"loss": 1.166, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.2765382014874915, |
|
"grad_norm": 5.6363361298308545, |
|
"learning_rate": 3.2697071821604986e-06, |
|
"loss": 1.0634, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.2819472616632859, |
|
"grad_norm": 5.96879436061098, |
|
"learning_rate": 3.255514332172979e-06, |
|
"loss": 1.0877, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.2873563218390804, |
|
"grad_norm": 5.298279579130303, |
|
"learning_rate": 3.2412946336910778e-06, |
|
"loss": 1.1096, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.292765382014875, |
|
"grad_norm": 5.24661665042464, |
|
"learning_rate": 3.2270485920361093e-06, |
|
"loss": 1.0553, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.2981744421906694, |
|
"grad_norm": 5.2866147649117305, |
|
"learning_rate": 3.2127767134655374e-06, |
|
"loss": 1.0457, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3035835023664637, |
|
"grad_norm": 4.959907157136451, |
|
"learning_rate": 3.198479505154984e-06, |
|
"loss": 1.1399, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.3089925625422583, |
|
"grad_norm": 5.238554311773635, |
|
"learning_rate": 3.184157475180208e-06, |
|
"loss": 1.1271, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.3144016227180528, |
|
"grad_norm": 6.098173197957714, |
|
"learning_rate": 3.1698111324990454e-06, |
|
"loss": 1.0782, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.3198106828938472, |
|
"grad_norm": 11.039229160599406, |
|
"learning_rate": 3.15544098693333e-06, |
|
"loss": 1.05, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.3252197430696415, |
|
"grad_norm": 5.499488969453295, |
|
"learning_rate": 3.14104754915077e-06, |
|
"loss": 1.0368, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.330628803245436, |
|
"grad_norm": 5.138414489753493, |
|
"learning_rate": 3.1266313306468018e-06, |
|
"loss": 1.0835, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.3360378634212307, |
|
"grad_norm": 5.106872795478693, |
|
"learning_rate": 3.1121928437264138e-06, |
|
"loss": 0.9708, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.341446923597025, |
|
"grad_norm": 5.002564538923287, |
|
"learning_rate": 3.0977326014859415e-06, |
|
"loss": 1.0687, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.3468559837728193, |
|
"grad_norm": 5.387403166077266, |
|
"learning_rate": 3.0832511177948326e-06, |
|
"loss": 1.1496, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.352265043948614, |
|
"grad_norm": 8.271273394288187, |
|
"learning_rate": 3.0687489072773864e-06, |
|
"loss": 1.0601, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3576741041244085, |
|
"grad_norm": 5.909473723884275, |
|
"learning_rate": 3.0542264852944635e-06, |
|
"loss": 1.1829, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.3630831643002028, |
|
"grad_norm": 5.390843669805894, |
|
"learning_rate": 3.0396843679251777e-06, |
|
"loss": 1.1672, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.3684922244759972, |
|
"grad_norm": 4.9990577854234095, |
|
"learning_rate": 3.0251230719485465e-06, |
|
"loss": 1.0904, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.3739012846517917, |
|
"grad_norm": 5.2939324887679495, |
|
"learning_rate": 3.0105431148251364e-06, |
|
"loss": 1.0716, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 5.423330025150549, |
|
"learning_rate": 2.9959450146786674e-06, |
|
"loss": 1.1325, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.3847194050033806, |
|
"grad_norm": 5.023381297619562, |
|
"learning_rate": 2.981329290277605e-06, |
|
"loss": 0.987, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.390128465179175, |
|
"grad_norm": 5.282148760283777, |
|
"learning_rate": 2.966696461016721e-06, |
|
"loss": 1.0894, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.3955375253549696, |
|
"grad_norm": 5.15061299365383, |
|
"learning_rate": 2.952047046898637e-06, |
|
"loss": 1.1275, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.4009465855307641, |
|
"grad_norm": 5.6780949048605285, |
|
"learning_rate": 2.9373815685153485e-06, |
|
"loss": 0.9549, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.4063556457065585, |
|
"grad_norm": 5.251474491048633, |
|
"learning_rate": 2.9227005470297194e-06, |
|
"loss": 1.0549, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 5.418474582825882, |
|
"learning_rate": 2.9080045041569647e-06, |
|
"loss": 1.1761, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.4171737660581474, |
|
"grad_norm": 5.363207500979156, |
|
"learning_rate": 2.893293962146114e-06, |
|
"loss": 1.132, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.422582826233942, |
|
"grad_norm": 6.178387649415905, |
|
"learning_rate": 2.878569443761442e-06, |
|
"loss": 1.0214, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.4279918864097363, |
|
"grad_norm": 5.718107503992589, |
|
"learning_rate": 2.863831472263904e-06, |
|
"loss": 1.0028, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.4334009465855306, |
|
"grad_norm": 5.279016485794886, |
|
"learning_rate": 2.8490805713925298e-06, |
|
"loss": 1.0827, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.4388100067613252, |
|
"grad_norm": 5.087811977719158, |
|
"learning_rate": 2.8343172653458194e-06, |
|
"loss": 1.0937, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.4442190669371198, |
|
"grad_norm": 5.4070064172712184, |
|
"learning_rate": 2.8195420787631113e-06, |
|
"loss": 1.123, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.4496281271129141, |
|
"grad_norm": 5.587801015021142, |
|
"learning_rate": 2.8047555367059404e-06, |
|
"loss": 1.0621, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.4550371872887085, |
|
"grad_norm": 5.552901382895493, |
|
"learning_rate": 2.7899581646393746e-06, |
|
"loss": 0.9631, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.460446247464503, |
|
"grad_norm": 5.4717166148353185, |
|
"learning_rate": 2.7751504884133484e-06, |
|
"loss": 1.0253, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4658553076402976, |
|
"grad_norm": 5.3779298143381356, |
|
"learning_rate": 2.7603330342439686e-06, |
|
"loss": 0.9938, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.471264367816092, |
|
"grad_norm": 6.033013211604236, |
|
"learning_rate": 2.745506328694822e-06, |
|
"loss": 1.0509, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.4766734279918863, |
|
"grad_norm": 5.832457552507952, |
|
"learning_rate": 2.730670898658255e-06, |
|
"loss": 1.1067, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.4820824881676808, |
|
"grad_norm": 5.7329943331650295, |
|
"learning_rate": 2.7158272713366573e-06, |
|
"loss": 1.0657, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.4874915483434754, |
|
"grad_norm": 5.506227442307808, |
|
"learning_rate": 2.700975974223719e-06, |
|
"loss": 1.1391, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.4929006085192698, |
|
"grad_norm": 4.978032312700087, |
|
"learning_rate": 2.6861175350856937e-06, |
|
"loss": 0.9931, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.4929006085192698, |
|
"eval_loss": 1.3746687173843384, |
|
"eval_runtime": 80.4525, |
|
"eval_samples_per_second": 15.487, |
|
"eval_steps_per_second": 1.939, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.498309668695064, |
|
"grad_norm": 4.962349655742751, |
|
"learning_rate": 2.6712524819426355e-06, |
|
"loss": 1.0006, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.5037187288708587, |
|
"grad_norm": 4.925015437741922, |
|
"learning_rate": 2.656381343049641e-06, |
|
"loss": 1.1016, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.5091277890466532, |
|
"grad_norm": 4.738818602054919, |
|
"learning_rate": 2.6415046468780726e-06, |
|
"loss": 1.0465, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.5145368492224476, |
|
"grad_norm": 4.961032669335967, |
|
"learning_rate": 2.626622922096782e-06, |
|
"loss": 0.9809, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.519945909398242, |
|
"grad_norm": 5.043527496081483, |
|
"learning_rate": 2.6117366975533187e-06, |
|
"loss": 1.0272, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.5253549695740365, |
|
"grad_norm": 5.913998311820069, |
|
"learning_rate": 2.596846502255142e-06, |
|
"loss": 1.0146, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.530764029749831, |
|
"grad_norm": 5.295260207722928, |
|
"learning_rate": 2.581952865350815e-06, |
|
"loss": 1.0956, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.5361730899256254, |
|
"grad_norm": 4.466395085273886, |
|
"learning_rate": 2.5670563161112073e-06, |
|
"loss": 1.0354, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.5415821501014197, |
|
"grad_norm": 5.399245625271399, |
|
"learning_rate": 2.5521573839106815e-06, |
|
"loss": 0.9433, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.5469912102772143, |
|
"grad_norm": 5.270037129845648, |
|
"learning_rate": 2.5372565982082843e-06, |
|
"loss": 0.9744, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.5524002704530089, |
|
"grad_norm": 5.161110168680388, |
|
"learning_rate": 2.5223544885289287e-06, |
|
"loss": 1.0077, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.5578093306288032, |
|
"grad_norm": 5.6674277481523045, |
|
"learning_rate": 2.5074515844445774e-06, |
|
"loss": 1.0805, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.5632183908045976, |
|
"grad_norm": 5.089380302532208, |
|
"learning_rate": 2.4925484155554235e-06, |
|
"loss": 0.9904, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.5686274509803921, |
|
"grad_norm": 4.6826224514211, |
|
"learning_rate": 2.477645511471073e-06, |
|
"loss": 1.0843, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5740365111561867, |
|
"grad_norm": 5.231440812973741, |
|
"learning_rate": 2.462743401791716e-06, |
|
"loss": 1.0343, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.579445571331981, |
|
"grad_norm": 6.063260236415264, |
|
"learning_rate": 2.4478426160893197e-06, |
|
"loss": 1.0377, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.5848546315077754, |
|
"grad_norm": 4.850759349852377, |
|
"learning_rate": 2.4329436838887936e-06, |
|
"loss": 0.984, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.59026369168357, |
|
"grad_norm": 5.356535713038874, |
|
"learning_rate": 2.4180471346491864e-06, |
|
"loss": 1.0006, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.5956727518593645, |
|
"grad_norm": 5.843575763516113, |
|
"learning_rate": 2.403153497744859e-06, |
|
"loss": 1.0172, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.6010818120351589, |
|
"grad_norm": 5.383170068467428, |
|
"learning_rate": 2.3882633024466813e-06, |
|
"loss": 0.9683, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.6064908722109532, |
|
"grad_norm": 5.754163901430788, |
|
"learning_rate": 2.3733770779032185e-06, |
|
"loss": 0.9569, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.6118999323867478, |
|
"grad_norm": 5.20527028666756, |
|
"learning_rate": 2.3584953531219278e-06, |
|
"loss": 1.0422, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.6173089925625423, |
|
"grad_norm": 7.36652597560724, |
|
"learning_rate": 2.3436186569503598e-06, |
|
"loss": 1.0485, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.6227180527383367, |
|
"grad_norm": 5.265722625187695, |
|
"learning_rate": 2.3287475180573653e-06, |
|
"loss": 1.0536, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.628127112914131, |
|
"grad_norm": 5.352738983760188, |
|
"learning_rate": 2.3138824649143076e-06, |
|
"loss": 1.0608, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.6335361730899256, |
|
"grad_norm": 5.069558233311071, |
|
"learning_rate": 2.2990240257762817e-06, |
|
"loss": 0.9955, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.6389452332657202, |
|
"grad_norm": 5.3724147320331435, |
|
"learning_rate": 2.2841727286633444e-06, |
|
"loss": 1.0888, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.6443542934415145, |
|
"grad_norm": 4.963379896141614, |
|
"learning_rate": 2.269329101341745e-06, |
|
"loss": 0.98, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.6497633536173089, |
|
"grad_norm": 5.132481781947378, |
|
"learning_rate": 2.254493671305179e-06, |
|
"loss": 0.9771, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 5.347290600611583, |
|
"learning_rate": 2.239666965756032e-06, |
|
"loss": 0.9705, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.660581473968898, |
|
"grad_norm": 5.722698577141815, |
|
"learning_rate": 2.224849511586652e-06, |
|
"loss": 0.9266, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.6659905341446923, |
|
"grad_norm": 5.733293031193421, |
|
"learning_rate": 2.2100418353606262e-06, |
|
"loss": 1.0429, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.6713995943204867, |
|
"grad_norm": 6.274637328200243, |
|
"learning_rate": 2.19524446329406e-06, |
|
"loss": 1.0208, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.6768086544962812, |
|
"grad_norm": 5.5296462714582955, |
|
"learning_rate": 2.180457921236889e-06, |
|
"loss": 0.9945, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6822177146720758, |
|
"grad_norm": 5.336860694103749, |
|
"learning_rate": 2.165682734654181e-06, |
|
"loss": 0.9847, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.6876267748478702, |
|
"grad_norm": 5.016859607786664, |
|
"learning_rate": 2.150919428607472e-06, |
|
"loss": 1.0305, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.6930358350236645, |
|
"grad_norm": 4.918636259586344, |
|
"learning_rate": 2.1361685277360973e-06, |
|
"loss": 1.0611, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.698444895199459, |
|
"grad_norm": 5.520829989468747, |
|
"learning_rate": 2.1214305562385592e-06, |
|
"loss": 0.8441, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.7038539553752536, |
|
"grad_norm": 5.327879216317322, |
|
"learning_rate": 2.106706037853887e-06, |
|
"loss": 1.0225, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.709263015551048, |
|
"grad_norm": 5.3843566459704135, |
|
"learning_rate": 2.0919954958430357e-06, |
|
"loss": 1.0082, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.7146720757268423, |
|
"grad_norm": 5.395175713065742, |
|
"learning_rate": 2.077299452970282e-06, |
|
"loss": 1.023, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.720081135902637, |
|
"grad_norm": 5.887472074041824, |
|
"learning_rate": 2.062618431484652e-06, |
|
"loss": 0.9278, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.7254901960784315, |
|
"grad_norm": 6.537709956987992, |
|
"learning_rate": 2.047952953101363e-06, |
|
"loss": 1.0788, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.7308992562542258, |
|
"grad_norm": 6.118159510578694, |
|
"learning_rate": 2.0333035389832795e-06, |
|
"loss": 1.1197, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.7363083164300201, |
|
"grad_norm": 5.2165328231189765, |
|
"learning_rate": 2.0186707097223952e-06, |
|
"loss": 1.0348, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.7417173766058147, |
|
"grad_norm": 5.79401876729717, |
|
"learning_rate": 2.0040549853213326e-06, |
|
"loss": 1.0581, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.7417173766058147, |
|
"eval_loss": 1.306036353111267, |
|
"eval_runtime": 80.4204, |
|
"eval_samples_per_second": 15.494, |
|
"eval_steps_per_second": 1.94, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.7471264367816093, |
|
"grad_norm": 5.1382450667316455, |
|
"learning_rate": 1.989456885174865e-06, |
|
"loss": 0.9931, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.7525354969574036, |
|
"grad_norm": 5.923288803726085, |
|
"learning_rate": 1.9748769280514544e-06, |
|
"loss": 1.048, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.757944557133198, |
|
"grad_norm": 5.27150872289333, |
|
"learning_rate": 1.960315632074824e-06, |
|
"loss": 0.9658, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.7633536173089925, |
|
"grad_norm": 5.594406875794453, |
|
"learning_rate": 1.945773514705537e-06, |
|
"loss": 0.9441, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.768762677484787, |
|
"grad_norm": 5.205336223607601, |
|
"learning_rate": 1.931251092722615e-06, |
|
"loss": 1.0016, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.7741717376605814, |
|
"grad_norm": 5.104727647984587, |
|
"learning_rate": 1.916748882205168e-06, |
|
"loss": 0.917, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.7795807978363758, |
|
"grad_norm": 4.929606382507894, |
|
"learning_rate": 1.9022673985140585e-06, |
|
"loss": 1.0204, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.7849898580121704, |
|
"grad_norm": 4.872805835810651, |
|
"learning_rate": 1.8878071562735873e-06, |
|
"loss": 0.9421, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.790398918187965, |
|
"grad_norm": 5.328166060135325, |
|
"learning_rate": 1.8733686693531986e-06, |
|
"loss": 1.0683, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.7958079783637593, |
|
"grad_norm": 5.20759899782578, |
|
"learning_rate": 1.8589524508492308e-06, |
|
"loss": 0.9228, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.8012170385395536, |
|
"grad_norm": 5.362139404873552, |
|
"learning_rate": 1.84455901306667e-06, |
|
"loss": 1.0189, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.8066260987153482, |
|
"grad_norm": 5.165887526933946, |
|
"learning_rate": 1.8301888675009554e-06, |
|
"loss": 1.0733, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.8120351588911427, |
|
"grad_norm": 5.69463106350885, |
|
"learning_rate": 1.8158425248197931e-06, |
|
"loss": 1.0076, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.817444219066937, |
|
"grad_norm": 5.001839035249502, |
|
"learning_rate": 1.8015204948450166e-06, |
|
"loss": 1.009, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.8228532792427314, |
|
"grad_norm": 5.335741327820695, |
|
"learning_rate": 1.787223286534463e-06, |
|
"loss": 0.9548, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.828262339418526, |
|
"grad_norm": 6.290040565183058, |
|
"learning_rate": 1.7729514079638915e-06, |
|
"loss": 1.091, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.8336713995943206, |
|
"grad_norm": 4.84945280688225, |
|
"learning_rate": 1.7587053663089233e-06, |
|
"loss": 0.9476, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"grad_norm": 4.954078085115975, |
|
"learning_rate": 1.7444856678270218e-06, |
|
"loss": 0.8989, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8444895199459093, |
|
"grad_norm": 5.322267912027798, |
|
"learning_rate": 1.7302928178395018e-06, |
|
"loss": 1.0021, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.8498985801217038, |
|
"grad_norm": 4.608936147804684, |
|
"learning_rate": 1.716127320713568e-06, |
|
"loss": 0.9097, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.8553076402974984, |
|
"grad_norm": 5.1258935953966605, |
|
"learning_rate": 1.7019896798443984e-06, |
|
"loss": 0.9417, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.8607167004732927, |
|
"grad_norm": 5.5152859470375475, |
|
"learning_rate": 1.6878803976372465e-06, |
|
"loss": 1.0601, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.866125760649087, |
|
"grad_norm": 8.364778472582735, |
|
"learning_rate": 1.6737999754895965e-06, |
|
"loss": 0.9963, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.8715348208248817, |
|
"grad_norm": 4.9035977247931575, |
|
"learning_rate": 1.6597489137733377e-06, |
|
"loss": 0.8301, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.8769438810006762, |
|
"grad_norm": 5.3057895439765845, |
|
"learning_rate": 1.6457277118169893e-06, |
|
"loss": 0.9862, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 5.656903858451251, |
|
"learning_rate": 1.6317368678879497e-06, |
|
"loss": 0.9841, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.887762001352265, |
|
"grad_norm": 5.054670138966832, |
|
"learning_rate": 1.6177768791747957e-06, |
|
"loss": 0.9873, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.8931710615280595, |
|
"grad_norm": 4.872926655885509, |
|
"learning_rate": 1.6038482417696095e-06, |
|
"loss": 1.0795, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.898580121703854, |
|
"grad_norm": 5.313971672771769, |
|
"learning_rate": 1.5899514506503499e-06, |
|
"loss": 0.9203, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.9039891818796484, |
|
"grad_norm": 4.964610105036094, |
|
"learning_rate": 1.5760869996632685e-06, |
|
"loss": 1.0121, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.9093982420554427, |
|
"grad_norm": 4.990950102894019, |
|
"learning_rate": 1.5622553815053476e-06, |
|
"loss": 0.9234, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.9148073022312373, |
|
"grad_norm": 5.392912217914297, |
|
"learning_rate": 1.5484570877068055e-06, |
|
"loss": 0.9205, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.9202163624070319, |
|
"grad_norm": 5.4989192390271935, |
|
"learning_rate": 1.5346926086136171e-06, |
|
"loss": 1.0099, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.9256254225828262, |
|
"grad_norm": 4.982181015464049, |
|
"learning_rate": 1.5209624333700985e-06, |
|
"loss": 0.937, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 5.49893209655002, |
|
"learning_rate": 1.5072670499015151e-06, |
|
"loss": 0.9491, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.9364435429344151, |
|
"grad_norm": 5.018154393664902, |
|
"learning_rate": 1.493606944896751e-06, |
|
"loss": 1.0217, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.9418526031102097, |
|
"grad_norm": 5.78990835618344, |
|
"learning_rate": 1.4799826037910082e-06, |
|
"loss": 0.9641, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.947261663286004, |
|
"grad_norm": 4.991442418184385, |
|
"learning_rate": 1.4663945107485567e-06, |
|
"loss": 0.9263, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.9526707234617984, |
|
"grad_norm": 6.0772021735310675, |
|
"learning_rate": 1.4528431486455311e-06, |
|
"loss": 1.0178, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.958079783637593, |
|
"grad_norm": 5.265699493164527, |
|
"learning_rate": 1.4393289990527665e-06, |
|
"loss": 0.9058, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.9634888438133875, |
|
"grad_norm": 5.332909399121638, |
|
"learning_rate": 1.425852542218692e-06, |
|
"loss": 0.8859, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.9688979039891819, |
|
"grad_norm": 5.145306432714561, |
|
"learning_rate": 1.412414257052256e-06, |
|
"loss": 0.9829, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.9743069641649762, |
|
"grad_norm": 5.392247944243994, |
|
"learning_rate": 1.3990146211059141e-06, |
|
"loss": 0.9439, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.9797160243407708, |
|
"grad_norm": 5.346185673554031, |
|
"learning_rate": 1.3856541105586545e-06, |
|
"loss": 0.9911, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.9851250845165653, |
|
"grad_norm": 5.1690668100026285, |
|
"learning_rate": 1.3723332001990774e-06, |
|
"loss": 0.8723, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.9905341446923597, |
|
"grad_norm": 5.00656915809359, |
|
"learning_rate": 1.3590523634085218e-06, |
|
"loss": 0.915, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.9905341446923597, |
|
"eval_loss": 1.247275710105896, |
|
"eval_runtime": 80.2409, |
|
"eval_samples_per_second": 15.528, |
|
"eval_steps_per_second": 1.944, |
|
"step": 368 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 552, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 184, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 526384229253120.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|