{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993853718500307, "eval_steps": 204, "global_step": 813, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001229256299938537, "grad_norm": 0.19411148130893707, "learning_rate": 2.0000000000000003e-06, "loss": 1.1612, "step": 1 }, { "epoch": 0.001229256299938537, "eval_loss": 2.1468453407287598, "eval_runtime": 66.4837, "eval_samples_per_second": 10.303, "eval_steps_per_second": 5.159, "step": 1 }, { "epoch": 0.002458512599877074, "grad_norm": 0.2264145463705063, "learning_rate": 4.000000000000001e-06, "loss": 1.4401, "step": 2 }, { "epoch": 0.0036877688998156115, "grad_norm": 0.2364473193883896, "learning_rate": 6e-06, "loss": 1.4676, "step": 3 }, { "epoch": 0.004917025199754148, "grad_norm": 0.24018821120262146, "learning_rate": 8.000000000000001e-06, "loss": 1.3851, "step": 4 }, { "epoch": 0.006146281499692686, "grad_norm": 0.23238497972488403, "learning_rate": 1e-05, "loss": 1.213, "step": 5 }, { "epoch": 0.007375537799631223, "grad_norm": 0.24634625017642975, "learning_rate": 1.2e-05, "loss": 1.2627, "step": 6 }, { "epoch": 0.008604794099569761, "grad_norm": 0.26495596766471863, "learning_rate": 1.4000000000000001e-05, "loss": 1.3908, "step": 7 }, { "epoch": 0.009834050399508297, "grad_norm": 0.2719455361366272, "learning_rate": 1.6000000000000003e-05, "loss": 1.3814, "step": 8 }, { "epoch": 0.011063306699446834, "grad_norm": 0.26454323530197144, "learning_rate": 1.8e-05, "loss": 1.2438, "step": 9 }, { "epoch": 0.012292562999385371, "grad_norm": 0.3004608750343323, "learning_rate": 2e-05, "loss": 1.3694, "step": 10 }, { "epoch": 0.013521819299323909, "grad_norm": 0.3035408854484558, "learning_rate": 2.2000000000000003e-05, "loss": 1.4792, "step": 11 }, { "epoch": 0.014751075599262446, "grad_norm": 0.4270775020122528, "learning_rate": 2.4e-05, "loss": 1.1673, "step": 12 }, { "epoch": 0.015980331899200985, "grad_norm": 0.4388391971588135, "learning_rate": 2.6000000000000002e-05, "loss": 1.5171, "step": 13 }, { "epoch": 0.017209588199139522, "grad_norm": 0.7133700847625732, "learning_rate": 2.8000000000000003e-05, "loss": 1.0732, "step": 14 }, { "epoch": 0.01843884449907806, "grad_norm": 1.026840329170227, "learning_rate": 3e-05, "loss": 1.1705, "step": 15 }, { "epoch": 0.019668100799016593, "grad_norm": 0.7934454679489136, "learning_rate": 3.2000000000000005e-05, "loss": 1.3509, "step": 16 }, { "epoch": 0.02089735709895513, "grad_norm": 0.8138520121574402, "learning_rate": 3.4000000000000007e-05, "loss": 1.181, "step": 17 }, { "epoch": 0.022126613398893668, "grad_norm": 1.7830528020858765, "learning_rate": 3.6e-05, "loss": 2.1836, "step": 18 }, { "epoch": 0.023355869698832205, "grad_norm": 10.527496337890625, "learning_rate": 3.8e-05, "loss": 3.3749, "step": 19 }, { "epoch": 0.024585125998770743, "grad_norm": 6.364173889160156, "learning_rate": 4e-05, "loss": 2.9191, "step": 20 }, { "epoch": 0.02581438229870928, "grad_norm": 7.087876796722412, "learning_rate": 4.2e-05, "loss": 3.0788, "step": 21 }, { "epoch": 0.027043638598647817, "grad_norm": 5.370169639587402, "learning_rate": 4.4000000000000006e-05, "loss": 2.7809, "step": 22 }, { "epoch": 0.028272894898586354, "grad_norm": 4.118806838989258, "learning_rate": 4.600000000000001e-05, "loss": 2.7475, "step": 23 }, { "epoch": 0.02950215119852489, "grad_norm": 4.46057653427124, "learning_rate": 4.8e-05, "loss": 2.6906, "step": 24 }, { "epoch": 0.03073140749846343, "grad_norm": 3.8601913452148438, "learning_rate": 5e-05, "loss": 2.652, "step": 25 }, { "epoch": 0.03196066379840197, "grad_norm": 0.19972144067287445, "learning_rate": 5.2000000000000004e-05, "loss": 1.1845, "step": 26 }, { "epoch": 0.03318992009834051, "grad_norm": 0.21230019629001617, "learning_rate": 5.4000000000000005e-05, "loss": 1.2875, "step": 27 }, { "epoch": 0.034419176398279044, "grad_norm": 0.22694356739521027, "learning_rate": 5.6000000000000006e-05, "loss": 1.1829, "step": 28 }, { "epoch": 0.03564843269821758, "grad_norm": 0.2587474584579468, "learning_rate": 5.8e-05, "loss": 1.3909, "step": 29 }, { "epoch": 0.03687768899815612, "grad_norm": 0.24409259855747223, "learning_rate": 6e-05, "loss": 1.1671, "step": 30 }, { "epoch": 0.03810694529809465, "grad_norm": 0.26323097944259644, "learning_rate": 6.2e-05, "loss": 1.2055, "step": 31 }, { "epoch": 0.03933620159803319, "grad_norm": 0.2842409908771515, "learning_rate": 6.400000000000001e-05, "loss": 1.3051, "step": 32 }, { "epoch": 0.040565457897971724, "grad_norm": 0.32476744055747986, "learning_rate": 6.6e-05, "loss": 1.0828, "step": 33 }, { "epoch": 0.04179471419791026, "grad_norm": 0.32893380522727966, "learning_rate": 6.800000000000001e-05, "loss": 1.3896, "step": 34 }, { "epoch": 0.0430239704978488, "grad_norm": 0.3359004855155945, "learning_rate": 7e-05, "loss": 1.3713, "step": 35 }, { "epoch": 0.044253226797787336, "grad_norm": 0.5471646189689636, "learning_rate": 7.2e-05, "loss": 1.2572, "step": 36 }, { "epoch": 0.04548248309772587, "grad_norm": 0.5404387712478638, "learning_rate": 7.4e-05, "loss": 1.2321, "step": 37 }, { "epoch": 0.04671173939766441, "grad_norm": 1.0199828147888184, "learning_rate": 7.6e-05, "loss": 0.8434, "step": 38 }, { "epoch": 0.04794099569760295, "grad_norm": 1.5890088081359863, "learning_rate": 7.800000000000001e-05, "loss": 0.7781, "step": 39 }, { "epoch": 0.049170251997541485, "grad_norm": 0.7897126078605652, "learning_rate": 8e-05, "loss": 1.1982, "step": 40 }, { "epoch": 0.05039950829748002, "grad_norm": 0.7874982953071594, "learning_rate": 8.2e-05, "loss": 1.3432, "step": 41 }, { "epoch": 0.05162876459741856, "grad_norm": 1.3902230262756348, "learning_rate": 8.4e-05, "loss": 1.7817, "step": 42 }, { "epoch": 0.0528580208973571, "grad_norm": 4.006369590759277, "learning_rate": 8.6e-05, "loss": 1.8832, "step": 43 }, { "epoch": 0.054087277197295634, "grad_norm": 7.2500996589660645, "learning_rate": 8.800000000000001e-05, "loss": 1.3256, "step": 44 }, { "epoch": 0.05531653349723417, "grad_norm": 5.088122844696045, "learning_rate": 9e-05, "loss": 1.4272, "step": 45 }, { "epoch": 0.05654578979717271, "grad_norm": 2.9680252075195312, "learning_rate": 9.200000000000001e-05, "loss": 1.8063, "step": 46 }, { "epoch": 0.057775046097111246, "grad_norm": 3.4886820316314697, "learning_rate": 9.4e-05, "loss": 1.7842, "step": 47 }, { "epoch": 0.05900430239704978, "grad_norm": 2.635120153427124, "learning_rate": 9.6e-05, "loss": 1.7775, "step": 48 }, { "epoch": 0.06023355869698832, "grad_norm": 2.7715940475463867, "learning_rate": 9.8e-05, "loss": 1.6795, "step": 49 }, { "epoch": 0.06146281499692686, "grad_norm": 4.598182678222656, "learning_rate": 0.0001, "loss": 2.0141, "step": 50 }, { "epoch": 0.0626920712968654, "grad_norm": 0.4595154821872711, "learning_rate": 9.999957617159031e-05, "loss": 1.1302, "step": 51 }, { "epoch": 0.06392132759680394, "grad_norm": 0.3996050953865051, "learning_rate": 9.999830469354645e-05, "loss": 1.3499, "step": 52 }, { "epoch": 0.06515058389674247, "grad_norm": 0.4248620867729187, "learning_rate": 9.999618558742398e-05, "loss": 1.4393, "step": 53 }, { "epoch": 0.06637984019668101, "grad_norm": 0.37063130736351013, "learning_rate": 9.999321888914836e-05, "loss": 1.4761, "step": 54 }, { "epoch": 0.06760909649661954, "grad_norm": 0.3327302038669586, "learning_rate": 9.998940464901447e-05, "loss": 1.1365, "step": 55 }, { "epoch": 0.06883835279655809, "grad_norm": 0.3424387276172638, "learning_rate": 9.998474293168562e-05, "loss": 1.2037, "step": 56 }, { "epoch": 0.07006760909649662, "grad_norm": 0.34453633427619934, "learning_rate": 9.997923381619256e-05, "loss": 0.9586, "step": 57 }, { "epoch": 0.07129686539643516, "grad_norm": 0.3327544033527374, "learning_rate": 9.997287739593206e-05, "loss": 1.3026, "step": 58 }, { "epoch": 0.0725261216963737, "grad_norm": 0.33542299270629883, "learning_rate": 9.996567377866537e-05, "loss": 1.1601, "step": 59 }, { "epoch": 0.07375537799631224, "grad_norm": 0.5743572115898132, "learning_rate": 9.99576230865164e-05, "loss": 1.3892, "step": 60 }, { "epoch": 0.07498463429625077, "grad_norm": 0.4624180495738983, "learning_rate": 9.994872545596966e-05, "loss": 1.2519, "step": 61 }, { "epoch": 0.0762138905961893, "grad_norm": 0.6259918808937073, "learning_rate": 9.993898103786786e-05, "loss": 1.315, "step": 62 }, { "epoch": 0.07744314689612784, "grad_norm": 0.6186118125915527, "learning_rate": 9.992838999740947e-05, "loss": 0.877, "step": 63 }, { "epoch": 0.07867240319606637, "grad_norm": 0.6788893342018127, "learning_rate": 9.991695251414583e-05, "loss": 0.886, "step": 64 }, { "epoch": 0.07990165949600492, "grad_norm": 0.7688488960266113, "learning_rate": 9.990466878197817e-05, "loss": 0.7427, "step": 65 }, { "epoch": 0.08113091579594345, "grad_norm": 0.6739158630371094, "learning_rate": 9.989153900915427e-05, "loss": 1.091, "step": 66 }, { "epoch": 0.08236017209588199, "grad_norm": 1.0515763759613037, "learning_rate": 9.987756341826493e-05, "loss": 1.4195, "step": 67 }, { "epoch": 0.08358942839582052, "grad_norm": 2.324380397796631, "learning_rate": 9.98627422462403e-05, "loss": 1.8108, "step": 68 }, { "epoch": 0.08481868469575907, "grad_norm": 4.131134510040283, "learning_rate": 9.98470757443457e-05, "loss": 1.2769, "step": 69 }, { "epoch": 0.0860479409956976, "grad_norm": 6.158152103424072, "learning_rate": 9.983056417817747e-05, "loss": 1.609, "step": 70 }, { "epoch": 0.08727719729563614, "grad_norm": 2.710057020187378, "learning_rate": 9.981320782765846e-05, "loss": 1.6382, "step": 71 }, { "epoch": 0.08850645359557467, "grad_norm": 2.729590654373169, "learning_rate": 9.979500698703323e-05, "loss": 1.8179, "step": 72 }, { "epoch": 0.08973570989551322, "grad_norm": 2.1861114501953125, "learning_rate": 9.977596196486314e-05, "loss": 1.7416, "step": 73 }, { "epoch": 0.09096496619545175, "grad_norm": 2.614532947540283, "learning_rate": 9.975607308402101e-05, "loss": 1.8413, "step": 74 }, { "epoch": 0.09219422249539029, "grad_norm": 3.3295183181762695, "learning_rate": 9.973534068168579e-05, "loss": 2.1946, "step": 75 }, { "epoch": 0.09342347879532882, "grad_norm": 0.3009834885597229, "learning_rate": 9.97137651093367e-05, "loss": 1.1058, "step": 76 }, { "epoch": 0.09465273509526737, "grad_norm": 0.2889084815979004, "learning_rate": 9.969134673274738e-05, "loss": 1.0812, "step": 77 }, { "epoch": 0.0958819913952059, "grad_norm": 0.26639047265052795, "learning_rate": 9.966808593197959e-05, "loss": 1.2787, "step": 78 }, { "epoch": 0.09711124769514444, "grad_norm": 0.2839871048927307, "learning_rate": 9.964398310137688e-05, "loss": 1.2314, "step": 79 }, { "epoch": 0.09834050399508297, "grad_norm": 0.29856863617897034, "learning_rate": 9.961903864955783e-05, "loss": 1.1781, "step": 80 }, { "epoch": 0.09956976029502151, "grad_norm": 0.3113296329975128, "learning_rate": 9.959325299940914e-05, "loss": 1.1297, "step": 81 }, { "epoch": 0.10079901659496004, "grad_norm": 0.3259466290473938, "learning_rate": 9.956662658807842e-05, "loss": 1.3892, "step": 82 }, { "epoch": 0.10202827289489859, "grad_norm": 0.3366626501083374, "learning_rate": 9.95391598669669e-05, "loss": 1.1833, "step": 83 }, { "epoch": 0.10325752919483712, "grad_norm": 0.3032483458518982, "learning_rate": 9.95108533017216e-05, "loss": 1.1729, "step": 84 }, { "epoch": 0.10448678549477566, "grad_norm": 0.4028280973434448, "learning_rate": 9.948170737222762e-05, "loss": 1.1019, "step": 85 }, { "epoch": 0.1057160417947142, "grad_norm": 0.3796052932739258, "learning_rate": 9.945172257259986e-05, "loss": 1.3822, "step": 86 }, { "epoch": 0.10694529809465274, "grad_norm": 0.3956368565559387, "learning_rate": 9.942089941117472e-05, "loss": 1.2101, "step": 87 }, { "epoch": 0.10817455439459127, "grad_norm": 0.5040555596351624, "learning_rate": 9.938923841050147e-05, "loss": 1.059, "step": 88 }, { "epoch": 0.10940381069452981, "grad_norm": 0.7209507822990417, "learning_rate": 9.935674010733336e-05, "loss": 0.9387, "step": 89 }, { "epoch": 0.11063306699446834, "grad_norm": 0.6711410284042358, "learning_rate": 9.932340505261855e-05, "loss": 0.9325, "step": 90 }, { "epoch": 0.11186232329440689, "grad_norm": 0.670559823513031, "learning_rate": 9.928923381149078e-05, "loss": 1.1188, "step": 91 }, { "epoch": 0.11309157959434542, "grad_norm": 1.4009896516799927, "learning_rate": 9.925422696325975e-05, "loss": 1.4021, "step": 92 }, { "epoch": 0.11432083589428396, "grad_norm": 2.7449545860290527, "learning_rate": 9.921838510140135e-05, "loss": 1.7181, "step": 93 }, { "epoch": 0.11555009219422249, "grad_norm": 3.5462844371795654, "learning_rate": 9.918170883354755e-05, "loss": 1.4934, "step": 94 }, { "epoch": 0.11677934849416104, "grad_norm": 3.204674005508423, "learning_rate": 9.914419878147611e-05, "loss": 1.2952, "step": 95 }, { "epoch": 0.11800860479409957, "grad_norm": 2.583436965942383, "learning_rate": 9.910585558110006e-05, "loss": 1.418, "step": 96 }, { "epoch": 0.11923786109403811, "grad_norm": 3.0214803218841553, "learning_rate": 9.906667988245692e-05, "loss": 1.8579, "step": 97 }, { "epoch": 0.12046711739397664, "grad_norm": 2.359790325164795, "learning_rate": 9.902667234969764e-05, "loss": 1.2705, "step": 98 }, { "epoch": 0.12169637369391519, "grad_norm": 2.093607187271118, "learning_rate": 9.898583366107538e-05, "loss": 1.4241, "step": 99 }, { "epoch": 0.12292562999385372, "grad_norm": 2.613720655441284, "learning_rate": 9.8944164508934e-05, "loss": 1.7558, "step": 100 }, { "epoch": 0.12415488629379226, "grad_norm": 0.29464319348335266, "learning_rate": 9.890166559969631e-05, "loss": 1.1966, "step": 101 }, { "epoch": 0.1253841425937308, "grad_norm": 0.27224430441856384, "learning_rate": 9.885833765385212e-05, "loss": 1.3172, "step": 102 }, { "epoch": 0.12661339889366932, "grad_norm": 0.2738960385322571, "learning_rate": 9.881418140594603e-05, "loss": 1.2875, "step": 103 }, { "epoch": 0.12784265519360788, "grad_norm": 0.274746298789978, "learning_rate": 9.876919760456492e-05, "loss": 1.3156, "step": 104 }, { "epoch": 0.1290719114935464, "grad_norm": 0.3050672113895416, "learning_rate": 9.872338701232526e-05, "loss": 1.2426, "step": 105 }, { "epoch": 0.13030116779348494, "grad_norm": 0.2726648449897766, "learning_rate": 9.867675040586034e-05, "loss": 1.1997, "step": 106 }, { "epoch": 0.13153042409342347, "grad_norm": 0.2615199685096741, "learning_rate": 9.862928857580687e-05, "loss": 1.1518, "step": 107 }, { "epoch": 0.13275968039336203, "grad_norm": 0.27568066120147705, "learning_rate": 9.858100232679175e-05, "loss": 0.9874, "step": 108 }, { "epoch": 0.13398893669330056, "grad_norm": 0.29168951511383057, "learning_rate": 9.853189247741833e-05, "loss": 1.2147, "step": 109 }, { "epoch": 0.1352181929932391, "grad_norm": 0.30630671977996826, "learning_rate": 9.848195986025257e-05, "loss": 1.2474, "step": 110 }, { "epoch": 0.13644744929317762, "grad_norm": 0.3246194124221802, "learning_rate": 9.843120532180896e-05, "loss": 1.1839, "step": 111 }, { "epoch": 0.13767670559311618, "grad_norm": 0.34899017214775085, "learning_rate": 9.837962972253612e-05, "loss": 1.2389, "step": 112 }, { "epoch": 0.1389059618930547, "grad_norm": 0.3848627805709839, "learning_rate": 9.83272339368022e-05, "loss": 1.1833, "step": 113 }, { "epoch": 0.14013521819299324, "grad_norm": 0.4109489917755127, "learning_rate": 9.827401885288013e-05, "loss": 1.1026, "step": 114 }, { "epoch": 0.14136447449293177, "grad_norm": 0.6600728034973145, "learning_rate": 9.821998537293245e-05, "loss": 1.4073, "step": 115 }, { "epoch": 0.14259373079287033, "grad_norm": 0.5556017756462097, "learning_rate": 9.816513441299613e-05, "loss": 0.6878, "step": 116 }, { "epoch": 0.14382298709280886, "grad_norm": 0.5937761068344116, "learning_rate": 9.810946690296698e-05, "loss": 0.7988, "step": 117 }, { "epoch": 0.1450522433927474, "grad_norm": 0.6892157196998596, "learning_rate": 9.80529837865839e-05, "loss": 1.2152, "step": 118 }, { "epoch": 0.14628149969268592, "grad_norm": 1.1046031713485718, "learning_rate": 9.799568602141283e-05, "loss": 1.4396, "step": 119 }, { "epoch": 0.14751075599262448, "grad_norm": 3.366898536682129, "learning_rate": 9.793757457883062e-05, "loss": 1.6062, "step": 120 }, { "epoch": 0.148740012292563, "grad_norm": 4.46527624130249, "learning_rate": 9.787865044400848e-05, "loss": 1.041, "step": 121 }, { "epoch": 0.14996926859250154, "grad_norm": 3.8992013931274414, "learning_rate": 9.781891461589531e-05, "loss": 1.6166, "step": 122 }, { "epoch": 0.15119852489244007, "grad_norm": 2.6794042587280273, "learning_rate": 9.775836810720074e-05, "loss": 1.5444, "step": 123 }, { "epoch": 0.1524277811923786, "grad_norm": 2.1487152576446533, "learning_rate": 9.769701194437799e-05, "loss": 1.4051, "step": 124 }, { "epoch": 0.15365703749231716, "grad_norm": 2.6264848709106445, "learning_rate": 9.763484716760649e-05, "loss": 1.7286, "step": 125 }, { "epoch": 0.15488629379225569, "grad_norm": 0.2960408329963684, "learning_rate": 9.757187483077413e-05, "loss": 1.1932, "step": 126 }, { "epoch": 0.15611555009219422, "grad_norm": 0.2633897364139557, "learning_rate": 9.750809600145954e-05, "loss": 1.2997, "step": 127 }, { "epoch": 0.15734480639213275, "grad_norm": 0.2459549605846405, "learning_rate": 9.744351176091393e-05, "loss": 1.0985, "step": 128 }, { "epoch": 0.1585740626920713, "grad_norm": 0.30462849140167236, "learning_rate": 9.737812320404271e-05, "loss": 1.4303, "step": 129 }, { "epoch": 0.15980331899200984, "grad_norm": 0.27317526936531067, "learning_rate": 9.731193143938704e-05, "loss": 1.224, "step": 130 }, { "epoch": 0.16103257529194837, "grad_norm": 0.26538556814193726, "learning_rate": 9.724493758910491e-05, "loss": 1.2667, "step": 131 }, { "epoch": 0.1622618315918869, "grad_norm": 0.28112831711769104, "learning_rate": 9.71771427889522e-05, "loss": 1.1212, "step": 132 }, { "epoch": 0.16349108789182545, "grad_norm": 0.2989320755004883, "learning_rate": 9.71085481882634e-05, "loss": 1.0484, "step": 133 }, { "epoch": 0.16472034419176398, "grad_norm": 0.2814895212650299, "learning_rate": 9.703915494993215e-05, "loss": 0.7544, "step": 134 }, { "epoch": 0.16594960049170251, "grad_norm": 0.3104398846626282, "learning_rate": 9.696896425039146e-05, "loss": 1.0323, "step": 135 }, { "epoch": 0.16717885679164105, "grad_norm": 0.4948181211948395, "learning_rate": 9.689797727959387e-05, "loss": 1.2073, "step": 136 }, { "epoch": 0.1684081130915796, "grad_norm": 0.4018343985080719, "learning_rate": 9.682619524099112e-05, "loss": 1.2409, "step": 137 }, { "epoch": 0.16963736939151813, "grad_norm": 0.5637558102607727, "learning_rate": 9.675361935151395e-05, "loss": 1.3184, "step": 138 }, { "epoch": 0.17086662569145666, "grad_norm": 0.7405252456665039, "learning_rate": 9.66802508415513e-05, "loss": 1.0983, "step": 139 }, { "epoch": 0.1720958819913952, "grad_norm": 0.6686736345291138, "learning_rate": 9.660609095492952e-05, "loss": 1.0025, "step": 140 }, { "epoch": 0.17332513829133375, "grad_norm": 0.7121345400810242, "learning_rate": 9.653114094889127e-05, "loss": 0.9337, "step": 141 }, { "epoch": 0.17455439459127228, "grad_norm": 1.06205153465271, "learning_rate": 9.645540209407425e-05, "loss": 1.2931, "step": 142 }, { "epoch": 0.1757836508912108, "grad_norm": 2.3874034881591797, "learning_rate": 9.637887567448959e-05, "loss": 1.5124, "step": 143 }, { "epoch": 0.17701290719114934, "grad_norm": 2.6609811782836914, "learning_rate": 9.630156298750011e-05, "loss": 1.4161, "step": 144 }, { "epoch": 0.1782421634910879, "grad_norm": 2.413705587387085, "learning_rate": 9.622346534379833e-05, "loss": 1.2768, "step": 145 }, { "epoch": 0.17947141979102643, "grad_norm": 2.920910120010376, "learning_rate": 9.614458406738427e-05, "loss": 1.0866, "step": 146 }, { "epoch": 0.18070067609096496, "grad_norm": 2.389439582824707, "learning_rate": 9.606492049554297e-05, "loss": 1.4862, "step": 147 }, { "epoch": 0.1819299323909035, "grad_norm": 2.03515887260437, "learning_rate": 9.598447597882181e-05, "loss": 1.3503, "step": 148 }, { "epoch": 0.18315918869084205, "grad_norm": 2.016889810562134, "learning_rate": 9.590325188100768e-05, "loss": 1.2565, "step": 149 }, { "epoch": 0.18438844499078058, "grad_norm": 2.1591711044311523, "learning_rate": 9.582124957910375e-05, "loss": 1.1261, "step": 150 }, { "epoch": 0.1856177012907191, "grad_norm": 0.2707172632217407, "learning_rate": 9.573847046330628e-05, "loss": 1.1045, "step": 151 }, { "epoch": 0.18684695759065764, "grad_norm": 0.25980842113494873, "learning_rate": 9.565491593698086e-05, "loss": 1.274, "step": 152 }, { "epoch": 0.1880762138905962, "grad_norm": 0.25503602623939514, "learning_rate": 9.55705874166388e-05, "loss": 1.0971, "step": 153 }, { "epoch": 0.18930547019053473, "grad_norm": 0.27756351232528687, "learning_rate": 9.548548633191299e-05, "loss": 1.215, "step": 154 }, { "epoch": 0.19053472649047326, "grad_norm": 0.2732703387737274, "learning_rate": 9.539961412553375e-05, "loss": 1.1326, "step": 155 }, { "epoch": 0.1917639827904118, "grad_norm": 0.28855475783348083, "learning_rate": 9.531297225330429e-05, "loss": 1.2862, "step": 156 }, { "epoch": 0.19299323909035035, "grad_norm": 0.3158769905567169, "learning_rate": 9.522556218407608e-05, "loss": 1.2254, "step": 157 }, { "epoch": 0.19422249539028888, "grad_norm": 0.30355289578437805, "learning_rate": 9.513738539972394e-05, "loss": 1.062, "step": 158 }, { "epoch": 0.1954517516902274, "grad_norm": 0.3448358178138733, "learning_rate": 9.504844339512095e-05, "loss": 0.9856, "step": 159 }, { "epoch": 0.19668100799016594, "grad_norm": 0.3306958079338074, "learning_rate": 9.495873767811305e-05, "loss": 1.2696, "step": 160 }, { "epoch": 0.1979102642901045, "grad_norm": 0.4231187105178833, "learning_rate": 9.486826976949345e-05, "loss": 1.1711, "step": 161 }, { "epoch": 0.19913952059004303, "grad_norm": 0.5289990901947021, "learning_rate": 9.477704120297697e-05, "loss": 1.4088, "step": 162 }, { "epoch": 0.20036877688998156, "grad_norm": 0.5111967921257019, "learning_rate": 9.468505352517394e-05, "loss": 1.1683, "step": 163 }, { "epoch": 0.2015980331899201, "grad_norm": 0.7477207779884338, "learning_rate": 9.459230829556401e-05, "loss": 0.995, "step": 164 }, { "epoch": 0.20282728948985865, "grad_norm": 0.7836649417877197, "learning_rate": 9.449880708646971e-05, "loss": 0.8027, "step": 165 }, { "epoch": 0.20405654578979718, "grad_norm": 0.6803653240203857, "learning_rate": 9.440455148302977e-05, "loss": 0.9725, "step": 166 }, { "epoch": 0.2052858020897357, "grad_norm": 0.8779723048210144, "learning_rate": 9.430954308317233e-05, "loss": 1.1995, "step": 167 }, { "epoch": 0.20651505838967424, "grad_norm": 1.3584879636764526, "learning_rate": 9.421378349758769e-05, "loss": 1.4558, "step": 168 }, { "epoch": 0.2077443146896128, "grad_norm": 2.1976521015167236, "learning_rate": 9.411727434970121e-05, "loss": 1.0717, "step": 169 }, { "epoch": 0.20897357098955133, "grad_norm": 3.9302353858947754, "learning_rate": 9.402001727564565e-05, "loss": 1.5138, "step": 170 }, { "epoch": 0.21020282728948986, "grad_norm": 3.9594686031341553, "learning_rate": 9.392201392423342e-05, "loss": 1.4295, "step": 171 }, { "epoch": 0.2114320835894284, "grad_norm": 3.2994837760925293, "learning_rate": 9.382326595692868e-05, "loss": 1.8676, "step": 172 }, { "epoch": 0.21266133988936695, "grad_norm": 2.219341993331909, "learning_rate": 9.372377504781924e-05, "loss": 1.3185, "step": 173 }, { "epoch": 0.21389059618930548, "grad_norm": 2.3389649391174316, "learning_rate": 9.362354288358803e-05, "loss": 0.9969, "step": 174 }, { "epoch": 0.215119852489244, "grad_norm": 3.8493995666503906, "learning_rate": 9.35225711634846e-05, "loss": 1.2903, "step": 175 }, { "epoch": 0.21634910878918254, "grad_norm": 0.24931700527668, "learning_rate": 9.34208615992963e-05, "loss": 1.051, "step": 176 }, { "epoch": 0.2175783650891211, "grad_norm": 0.2944095730781555, "learning_rate": 9.331841591531922e-05, "loss": 1.3364, "step": 177 }, { "epoch": 0.21880762138905963, "grad_norm": 0.26118403673171997, "learning_rate": 9.321523584832905e-05, "loss": 1.1487, "step": 178 }, { "epoch": 0.22003687768899816, "grad_norm": 0.29458168148994446, "learning_rate": 9.311132314755149e-05, "loss": 1.365, "step": 179 }, { "epoch": 0.2212661339889367, "grad_norm": 0.2739919424057007, "learning_rate": 9.300667957463278e-05, "loss": 1.2595, "step": 180 }, { "epoch": 0.22249539028887522, "grad_norm": 0.25647538900375366, "learning_rate": 9.290130690360965e-05, "loss": 0.9865, "step": 181 }, { "epoch": 0.22372464658881377, "grad_norm": 0.27343517541885376, "learning_rate": 9.279520692087938e-05, "loss": 1.1263, "step": 182 }, { "epoch": 0.2249539028887523, "grad_norm": 0.3220975697040558, "learning_rate": 9.268838142516943e-05, "loss": 1.3404, "step": 183 }, { "epoch": 0.22618315918869084, "grad_norm": 0.3012546896934509, "learning_rate": 9.258083222750703e-05, "loss": 0.934, "step": 184 }, { "epoch": 0.22741241548862937, "grad_norm": 0.3433031439781189, "learning_rate": 9.247256115118835e-05, "loss": 1.1895, "step": 185 }, { "epoch": 0.22864167178856792, "grad_norm": 0.3515290915966034, "learning_rate": 9.236357003174775e-05, "loss": 1.3236, "step": 186 }, { "epoch": 0.22987092808850645, "grad_norm": 0.4033795893192291, "learning_rate": 9.225386071692654e-05, "loss": 1.2089, "step": 187 }, { "epoch": 0.23110018438844498, "grad_norm": 0.42729562520980835, "learning_rate": 9.214343506664168e-05, "loss": 1.1346, "step": 188 }, { "epoch": 0.23232944068838352, "grad_norm": 0.6692906618118286, "learning_rate": 9.203229495295429e-05, "loss": 1.0211, "step": 189 }, { "epoch": 0.23355869698832207, "grad_norm": 0.6882857084274292, "learning_rate": 9.192044226003789e-05, "loss": 0.8235, "step": 190 }, { "epoch": 0.2347879532882606, "grad_norm": 0.6821665167808533, "learning_rate": 9.18078788841464e-05, "loss": 0.8171, "step": 191 }, { "epoch": 0.23601720958819913, "grad_norm": 0.7368921041488647, "learning_rate": 9.169460673358212e-05, "loss": 0.9993, "step": 192 }, { "epoch": 0.23724646588813766, "grad_norm": 0.9759008884429932, "learning_rate": 9.158062772866325e-05, "loss": 1.2029, "step": 193 }, { "epoch": 0.23847572218807622, "grad_norm": 2.167100667953491, "learning_rate": 9.146594380169143e-05, "loss": 1.1393, "step": 194 }, { "epoch": 0.23970497848801475, "grad_norm": 2.76292085647583, "learning_rate": 9.135055689691888e-05, "loss": 0.946, "step": 195 }, { "epoch": 0.24093423478795328, "grad_norm": 3.504427671432495, "learning_rate": 9.123446897051555e-05, "loss": 1.7001, "step": 196 }, { "epoch": 0.2421634910878918, "grad_norm": 2.606448173522949, "learning_rate": 9.111768199053588e-05, "loss": 1.6293, "step": 197 }, { "epoch": 0.24339274738783037, "grad_norm": 2.1803855895996094, "learning_rate": 9.100019793688549e-05, "loss": 1.2392, "step": 198 }, { "epoch": 0.2446220036877689, "grad_norm": 2.3470633029937744, "learning_rate": 9.088201880128755e-05, "loss": 1.0844, "step": 199 }, { "epoch": 0.24585125998770743, "grad_norm": 2.47255802154541, "learning_rate": 9.076314658724906e-05, "loss": 1.19, "step": 200 }, { "epoch": 0.24708051628764596, "grad_norm": 0.2115241140127182, "learning_rate": 9.064358331002691e-05, "loss": 0.9038, "step": 201 }, { "epoch": 0.24830977258758452, "grad_norm": 0.2693980038166046, "learning_rate": 9.05233309965936e-05, "loss": 1.0014, "step": 202 }, { "epoch": 0.24953902888752305, "grad_norm": 0.28890225291252136, "learning_rate": 9.040239168560303e-05, "loss": 1.1698, "step": 203 }, { "epoch": 0.2507682851874616, "grad_norm": 0.27143335342407227, "learning_rate": 9.028076742735583e-05, "loss": 1.1856, "step": 204 }, { "epoch": 0.2507682851874616, "eval_loss": 1.0315037965774536, "eval_runtime": 65.4064, "eval_samples_per_second": 10.473, "eval_steps_per_second": 5.244, "step": 204 }, { "epoch": 0.2519975414874001, "grad_norm": 0.3105545938014984, "learning_rate": 9.015846028376462e-05, "loss": 1.2827, "step": 205 }, { "epoch": 0.25322679778733864, "grad_norm": 0.2826372981071472, "learning_rate": 9.00354723283191e-05, "loss": 1.1159, "step": 206 }, { "epoch": 0.2544560540872772, "grad_norm": 0.2823708951473236, "learning_rate": 8.991180564605086e-05, "loss": 1.0368, "step": 207 }, { "epoch": 0.25568531038721576, "grad_norm": 0.28265297412872314, "learning_rate": 8.978746233349802e-05, "loss": 1.1583, "step": 208 }, { "epoch": 0.2569145666871543, "grad_norm": 0.3202212452888489, "learning_rate": 8.966244449866973e-05, "loss": 1.2069, "step": 209 }, { "epoch": 0.2581438229870928, "grad_norm": 0.30576291680336, "learning_rate": 8.953675426101038e-05, "loss": 1.1588, "step": 210 }, { "epoch": 0.25937307928703135, "grad_norm": 0.3853960633277893, "learning_rate": 8.941039375136371e-05, "loss": 1.1947, "step": 211 }, { "epoch": 0.2606023355869699, "grad_norm": 0.4404067099094391, "learning_rate": 8.928336511193669e-05, "loss": 1.0786, "step": 212 }, { "epoch": 0.2618315918869084, "grad_norm": 0.422333300113678, "learning_rate": 8.915567049626315e-05, "loss": 1.1454, "step": 213 }, { "epoch": 0.26306084818684694, "grad_norm": 0.5277565121650696, "learning_rate": 8.902731206916734e-05, "loss": 0.7775, "step": 214 }, { "epoch": 0.26429010448678547, "grad_norm": 0.7032243609428406, "learning_rate": 8.889829200672719e-05, "loss": 0.5771, "step": 215 }, { "epoch": 0.26551936078672406, "grad_norm": 0.6663339734077454, "learning_rate": 8.876861249623739e-05, "loss": 0.616, "step": 216 }, { "epoch": 0.2667486170866626, "grad_norm": 0.8129518628120422, "learning_rate": 8.863827573617238e-05, "loss": 1.1483, "step": 217 }, { "epoch": 0.2679778733866011, "grad_norm": 1.0273211002349854, "learning_rate": 8.850728393614902e-05, "loss": 1.2066, "step": 218 }, { "epoch": 0.26920712968653965, "grad_norm": 1.5424954891204834, "learning_rate": 8.837563931688919e-05, "loss": 1.247, "step": 219 }, { "epoch": 0.2704363859864782, "grad_norm": 2.9167752265930176, "learning_rate": 8.824334411018204e-05, "loss": 1.3413, "step": 220 }, { "epoch": 0.2716656422864167, "grad_norm": 5.498292446136475, "learning_rate": 8.811040055884629e-05, "loss": 1.0072, "step": 221 }, { "epoch": 0.27289489858635524, "grad_norm": 3.1687686443328857, "learning_rate": 8.797681091669206e-05, "loss": 1.3309, "step": 222 }, { "epoch": 0.27412415488629377, "grad_norm": 2.760160446166992, "learning_rate": 8.784257744848279e-05, "loss": 1.5268, "step": 223 }, { "epoch": 0.27535341118623236, "grad_norm": 2.3323326110839844, "learning_rate": 8.770770242989679e-05, "loss": 1.27, "step": 224 }, { "epoch": 0.2765826674861709, "grad_norm": 2.150510549545288, "learning_rate": 8.75721881474886e-05, "loss": 1.0602, "step": 225 }, { "epoch": 0.2778119237861094, "grad_norm": 0.23049846291542053, "learning_rate": 8.743603689865039e-05, "loss": 1.0067, "step": 226 }, { "epoch": 0.27904118008604795, "grad_norm": 0.2650708556175232, "learning_rate": 8.729925099157281e-05, "loss": 1.1932, "step": 227 }, { "epoch": 0.2802704363859865, "grad_norm": 0.2723963260650635, "learning_rate": 8.7161832745206e-05, "loss": 1.2495, "step": 228 }, { "epoch": 0.281499692685925, "grad_norm": 0.26627010107040405, "learning_rate": 8.702378448922026e-05, "loss": 1.2837, "step": 229 }, { "epoch": 0.28272894898586354, "grad_norm": 0.2728361189365387, "learning_rate": 8.688510856396648e-05, "loss": 1.2969, "step": 230 }, { "epoch": 0.28395820528580207, "grad_norm": 0.26788559556007385, "learning_rate": 8.674580732043656e-05, "loss": 1.0944, "step": 231 }, { "epoch": 0.28518746158574065, "grad_norm": 0.3129604160785675, "learning_rate": 8.660588312022344e-05, "loss": 1.3591, "step": 232 }, { "epoch": 0.2864167178856792, "grad_norm": 0.32250627875328064, "learning_rate": 8.646533833548119e-05, "loss": 1.1469, "step": 233 }, { "epoch": 0.2876459741856177, "grad_norm": 0.32614386081695557, "learning_rate": 8.632417534888473e-05, "loss": 1.3551, "step": 234 }, { "epoch": 0.28887523048555624, "grad_norm": 0.3620636463165283, "learning_rate": 8.61823965535894e-05, "loss": 1.1427, "step": 235 }, { "epoch": 0.2901044867854948, "grad_norm": 0.39082473516464233, "learning_rate": 8.604000435319047e-05, "loss": 1.0041, "step": 236 }, { "epoch": 0.2913337430854333, "grad_norm": 0.3823097050189972, "learning_rate": 8.589700116168232e-05, "loss": 1.1756, "step": 237 }, { "epoch": 0.29256299938537184, "grad_norm": 0.5359341502189636, "learning_rate": 8.575338940341757e-05, "loss": 1.1814, "step": 238 }, { "epoch": 0.29379225568531037, "grad_norm": 0.6902546286582947, "learning_rate": 8.560917151306593e-05, "loss": 0.9253, "step": 239 }, { "epoch": 0.29502151198524895, "grad_norm": 0.7236252427101135, "learning_rate": 8.5464349935573e-05, "loss": 0.6398, "step": 240 }, { "epoch": 0.2962507682851875, "grad_norm": 0.7172759175300598, "learning_rate": 8.53189271261187e-05, "loss": 0.9061, "step": 241 }, { "epoch": 0.297480024585126, "grad_norm": 0.7999723553657532, "learning_rate": 8.517290555007578e-05, "loss": 1.0691, "step": 242 }, { "epoch": 0.29870928088506454, "grad_norm": 1.235872745513916, "learning_rate": 8.502628768296788e-05, "loss": 1.5235, "step": 243 }, { "epoch": 0.2999385371850031, "grad_norm": 1.9676207304000854, "learning_rate": 8.487907601042777e-05, "loss": 1.5859, "step": 244 }, { "epoch": 0.3011677934849416, "grad_norm": 3.5035860538482666, "learning_rate": 8.473127302815496e-05, "loss": 1.1743, "step": 245 }, { "epoch": 0.30239704978488013, "grad_norm": 4.519472599029541, "learning_rate": 8.458288124187359e-05, "loss": 0.7165, "step": 246 }, { "epoch": 0.30362630608481866, "grad_norm": 2.3718838691711426, "learning_rate": 8.443390316728987e-05, "loss": 1.1449, "step": 247 }, { "epoch": 0.3048555623847572, "grad_norm": 2.1668829917907715, "learning_rate": 8.428434133004937e-05, "loss": 1.0383, "step": 248 }, { "epoch": 0.3060848186846958, "grad_norm": 3.2350733280181885, "learning_rate": 8.413419826569435e-05, "loss": 1.2341, "step": 249 }, { "epoch": 0.3073140749846343, "grad_norm": 2.3541886806488037, "learning_rate": 8.398347651962064e-05, "loss": 1.0355, "step": 250 }, { "epoch": 0.30854333128457284, "grad_norm": 0.2730487883090973, "learning_rate": 8.383217864703456e-05, "loss": 1.2813, "step": 251 }, { "epoch": 0.30977258758451137, "grad_norm": 0.2517383098602295, "learning_rate": 8.36803072129096e-05, "loss": 1.1793, "step": 252 }, { "epoch": 0.3110018438844499, "grad_norm": 0.28486472368240356, "learning_rate": 8.352786479194288e-05, "loss": 1.4065, "step": 253 }, { "epoch": 0.31223110018438843, "grad_norm": 0.3247184455394745, "learning_rate": 8.337485396851155e-05, "loss": 1.4863, "step": 254 }, { "epoch": 0.31346035648432696, "grad_norm": 0.26896461844444275, "learning_rate": 8.322127733662897e-05, "loss": 1.1373, "step": 255 }, { "epoch": 0.3146896127842655, "grad_norm": 0.29333245754241943, "learning_rate": 8.306713749990072e-05, "loss": 1.0615, "step": 256 }, { "epoch": 0.3159188690842041, "grad_norm": 0.2958793640136719, "learning_rate": 8.291243707148048e-05, "loss": 0.9392, "step": 257 }, { "epoch": 0.3171481253841426, "grad_norm": 0.3320540487766266, "learning_rate": 8.275717867402575e-05, "loss": 1.2935, "step": 258 }, { "epoch": 0.31837738168408114, "grad_norm": 0.3567339777946472, "learning_rate": 8.260136493965326e-05, "loss": 1.0954, "step": 259 }, { "epoch": 0.31960663798401967, "grad_norm": 0.38393881916999817, "learning_rate": 8.244499850989452e-05, "loss": 1.045, "step": 260 }, { "epoch": 0.3208358942839582, "grad_norm": 0.41993001103401184, "learning_rate": 8.228808203565095e-05, "loss": 1.2225, "step": 261 }, { "epoch": 0.32206515058389673, "grad_norm": 0.6547941565513611, "learning_rate": 8.213061817714893e-05, "loss": 0.9286, "step": 262 }, { "epoch": 0.32329440688383526, "grad_norm": 0.7117279767990112, "learning_rate": 8.197260960389474e-05, "loss": 0.5088, "step": 263 }, { "epoch": 0.3245236631837738, "grad_norm": 0.7041743993759155, "learning_rate": 8.181405899462926e-05, "loss": 0.8899, "step": 264 }, { "epoch": 0.3257529194837124, "grad_norm": 0.7142787575721741, "learning_rate": 8.16549690372826e-05, "loss": 0.7447, "step": 265 }, { "epoch": 0.3269821757836509, "grad_norm": 0.8879908323287964, "learning_rate": 8.14953424289285e-05, "loss": 1.2607, "step": 266 }, { "epoch": 0.32821143208358944, "grad_norm": 0.9387282133102417, "learning_rate": 8.133518187573862e-05, "loss": 1.1611, "step": 267 }, { "epoch": 0.32944068838352797, "grad_norm": 1.4039078950881958, "learning_rate": 8.117449009293668e-05, "loss": 0.9947, "step": 268 }, { "epoch": 0.3306699446834665, "grad_norm": 3.3686740398406982, "learning_rate": 8.101326980475237e-05, "loss": 1.0783, "step": 269 }, { "epoch": 0.33189920098340503, "grad_norm": 2.8384785652160645, "learning_rate": 8.085152374437525e-05, "loss": 0.9008, "step": 270 }, { "epoch": 0.33312845728334356, "grad_norm": 2.453441619873047, "learning_rate": 8.06892546539083e-05, "loss": 0.5504, "step": 271 }, { "epoch": 0.3343577135832821, "grad_norm": 2.592667579650879, "learning_rate": 8.052646528432158e-05, "loss": 0.7489, "step": 272 }, { "epoch": 0.3355869698832207, "grad_norm": 1.9753395318984985, "learning_rate": 8.036315839540545e-05, "loss": 0.9747, "step": 273 }, { "epoch": 0.3368162261831592, "grad_norm": 3.042698860168457, "learning_rate": 8.019933675572389e-05, "loss": 1.6841, "step": 274 }, { "epoch": 0.33804548248309774, "grad_norm": 2.4343316555023193, "learning_rate": 8.00350031425675e-05, "loss": 0.869, "step": 275 }, { "epoch": 0.33927473878303627, "grad_norm": 0.2026144415140152, "learning_rate": 7.98701603419064e-05, "loss": 0.8867, "step": 276 }, { "epoch": 0.3405039950829748, "grad_norm": 0.24370141327381134, "learning_rate": 7.970481114834312e-05, "loss": 1.3135, "step": 277 }, { "epoch": 0.34173325138291333, "grad_norm": 0.22894087433815002, "learning_rate": 7.953895836506508e-05, "loss": 1.0986, "step": 278 }, { "epoch": 0.34296250768285186, "grad_norm": 0.2533970773220062, "learning_rate": 7.937260480379712e-05, "loss": 1.1821, "step": 279 }, { "epoch": 0.3441917639827904, "grad_norm": 0.25789350271224976, "learning_rate": 7.920575328475385e-05, "loss": 1.1414, "step": 280 }, { "epoch": 0.345421020282729, "grad_norm": 0.28820541501045227, "learning_rate": 7.903840663659186e-05, "loss": 1.3332, "step": 281 }, { "epoch": 0.3466502765826675, "grad_norm": 0.28611505031585693, "learning_rate": 7.887056769636165e-05, "loss": 1.0901, "step": 282 }, { "epoch": 0.34787953288260604, "grad_norm": 0.28022873401641846, "learning_rate": 7.870223930945972e-05, "loss": 0.8461, "step": 283 }, { "epoch": 0.34910878918254457, "grad_norm": 0.3246136009693146, "learning_rate": 7.853342432958013e-05, "loss": 0.9325, "step": 284 }, { "epoch": 0.3503380454824831, "grad_norm": 0.3149406611919403, "learning_rate": 7.836412561866629e-05, "loss": 1.013, "step": 285 }, { "epoch": 0.3515673017824216, "grad_norm": 0.3745490610599518, "learning_rate": 7.819434604686228e-05, "loss": 1.2624, "step": 286 }, { "epoch": 0.35279655808236016, "grad_norm": 0.4822925329208374, "learning_rate": 7.802408849246442e-05, "loss": 1.2424, "step": 287 }, { "epoch": 0.3540258143822987, "grad_norm": 0.6210641264915466, "learning_rate": 7.785335584187219e-05, "loss": 1.2527, "step": 288 }, { "epoch": 0.3552550706822373, "grad_norm": 0.6488444805145264, "learning_rate": 7.768215098953952e-05, "loss": 0.7986, "step": 289 }, { "epoch": 0.3564843269821758, "grad_norm": 0.760388195514679, "learning_rate": 7.751047683792561e-05, "loss": 1.0136, "step": 290 }, { "epoch": 0.35771358328211433, "grad_norm": 0.7666548490524292, "learning_rate": 7.73383362974458e-05, "loss": 0.8205, "step": 291 }, { "epoch": 0.35894283958205286, "grad_norm": 0.7492078542709351, "learning_rate": 7.71657322864221e-05, "loss": 0.9254, "step": 292 }, { "epoch": 0.3601720958819914, "grad_norm": 0.9061193466186523, "learning_rate": 7.699266773103389e-05, "loss": 1.3013, "step": 293 }, { "epoch": 0.3614013521819299, "grad_norm": 2.1404013633728027, "learning_rate": 7.681914556526817e-05, "loss": 1.5957, "step": 294 }, { "epoch": 0.36263060848186845, "grad_norm": 2.647864580154419, "learning_rate": 7.664516873086987e-05, "loss": 1.1658, "step": 295 }, { "epoch": 0.363859864781807, "grad_norm": 3.0906460285186768, "learning_rate": 7.647074017729202e-05, "loss": 1.1344, "step": 296 }, { "epoch": 0.36508912108174557, "grad_norm": 2.2348814010620117, "learning_rate": 7.629586286164565e-05, "loss": 0.8813, "step": 297 }, { "epoch": 0.3663183773816841, "grad_norm": 2.937446117401123, "learning_rate": 7.612053974864976e-05, "loss": 1.0414, "step": 298 }, { "epoch": 0.36754763368162263, "grad_norm": 2.5343546867370605, "learning_rate": 7.594477381058098e-05, "loss": 1.1847, "step": 299 }, { "epoch": 0.36877688998156116, "grad_norm": 2.8971638679504395, "learning_rate": 7.576856802722325e-05, "loss": 0.9029, "step": 300 }, { "epoch": 0.3700061462814997, "grad_norm": 0.1982557773590088, "learning_rate": 7.559192538581722e-05, "loss": 0.9314, "step": 301 }, { "epoch": 0.3712354025814382, "grad_norm": 0.24721381068229675, "learning_rate": 7.541484888100974e-05, "loss": 1.2432, "step": 302 }, { "epoch": 0.37246465888137675, "grad_norm": 0.24999506771564484, "learning_rate": 7.523734151480289e-05, "loss": 1.285, "step": 303 }, { "epoch": 0.3736939151813153, "grad_norm": 0.267764151096344, "learning_rate": 7.505940629650326e-05, "loss": 1.198, "step": 304 }, { "epoch": 0.3749231714812538, "grad_norm": 0.26003679633140564, "learning_rate": 7.488104624267091e-05, "loss": 1.2001, "step": 305 }, { "epoch": 0.3761524277811924, "grad_norm": 0.28197526931762695, "learning_rate": 7.470226437706813e-05, "loss": 1.1687, "step": 306 }, { "epoch": 0.37738168408113093, "grad_norm": 0.29367661476135254, "learning_rate": 7.452306373060829e-05, "loss": 1.211, "step": 307 }, { "epoch": 0.37861094038106946, "grad_norm": 0.2982727885246277, "learning_rate": 7.434344734130437e-05, "loss": 1.151, "step": 308 }, { "epoch": 0.379840196681008, "grad_norm": 0.3283758759498596, "learning_rate": 7.416341825421754e-05, "loss": 0.9875, "step": 309 }, { "epoch": 0.3810694529809465, "grad_norm": 0.32420334219932556, "learning_rate": 7.398297952140544e-05, "loss": 1.0796, "step": 310 }, { "epoch": 0.38229870928088505, "grad_norm": 0.4046980142593384, "learning_rate": 7.380213420187055e-05, "loss": 1.1158, "step": 311 }, { "epoch": 0.3835279655808236, "grad_norm": 0.391736775636673, "learning_rate": 7.36208853615082e-05, "loss": 1.1682, "step": 312 }, { "epoch": 0.3847572218807621, "grad_norm": 0.6027556657791138, "learning_rate": 7.343923607305471e-05, "loss": 1.0696, "step": 313 }, { "epoch": 0.3859864781807007, "grad_norm": 0.6483603119850159, "learning_rate": 7.325718941603527e-05, "loss": 0.7843, "step": 314 }, { "epoch": 0.38721573448063923, "grad_norm": 0.6711483001708984, "learning_rate": 7.307474847671168e-05, "loss": 0.7247, "step": 315 }, { "epoch": 0.38844499078057776, "grad_norm": 0.7372632026672363, "learning_rate": 7.289191634803003e-05, "loss": 1.0535, "step": 316 }, { "epoch": 0.3896742470805163, "grad_norm": 0.7427420020103455, "learning_rate": 7.270869612956835e-05, "loss": 1.0563, "step": 317 }, { "epoch": 0.3909035033804548, "grad_norm": 2.6449501514434814, "learning_rate": 7.252509092748401e-05, "loss": 1.3099, "step": 318 }, { "epoch": 0.39213275968039335, "grad_norm": 3.1938464641571045, "learning_rate": 7.234110385446103e-05, "loss": 1.1728, "step": 319 }, { "epoch": 0.3933620159803319, "grad_norm": 2.584103584289551, "learning_rate": 7.215673802965734e-05, "loss": 0.792, "step": 320 }, { "epoch": 0.3945912722802704, "grad_norm": 2.358025074005127, "learning_rate": 7.197199657865195e-05, "loss": 1.0462, "step": 321 }, { "epoch": 0.395820528580209, "grad_norm": 2.9621617794036865, "learning_rate": 7.178688263339184e-05, "loss": 1.4222, "step": 322 }, { "epoch": 0.3970497848801475, "grad_norm": 2.5362660884857178, "learning_rate": 7.160139933213898e-05, "loss": 1.1527, "step": 323 }, { "epoch": 0.39827904118008606, "grad_norm": 2.4901375770568848, "learning_rate": 7.141554981941709e-05, "loss": 1.1712, "step": 324 }, { "epoch": 0.3995082974800246, "grad_norm": 2.9214236736297607, "learning_rate": 7.12293372459583e-05, "loss": 1.1977, "step": 325 }, { "epoch": 0.4007375537799631, "grad_norm": 0.24753543734550476, "learning_rate": 7.104276476864974e-05, "loss": 1.2176, "step": 326 }, { "epoch": 0.40196681007990165, "grad_norm": 0.25986090302467346, "learning_rate": 7.085583555048008e-05, "loss": 1.2854, "step": 327 }, { "epoch": 0.4031960663798402, "grad_norm": 0.2640175521373749, "learning_rate": 7.066855276048587e-05, "loss": 1.2204, "step": 328 }, { "epoch": 0.4044253226797787, "grad_norm": 0.2603614330291748, "learning_rate": 7.048091957369776e-05, "loss": 1.2621, "step": 329 }, { "epoch": 0.4056545789797173, "grad_norm": 0.2921195924282074, "learning_rate": 7.029293917108678e-05, "loss": 1.281, "step": 330 }, { "epoch": 0.4068838352796558, "grad_norm": 0.2984941899776459, "learning_rate": 7.010461473951033e-05, "loss": 1.071, "step": 331 }, { "epoch": 0.40811309157959436, "grad_norm": 0.31219175457954407, "learning_rate": 6.991594947165818e-05, "loss": 1.3161, "step": 332 }, { "epoch": 0.4093423478795329, "grad_norm": 0.31329602003097534, "learning_rate": 6.972694656599834e-05, "loss": 0.9854, "step": 333 }, { "epoch": 0.4105716041794714, "grad_norm": 0.3356671929359436, "learning_rate": 6.953760922672286e-05, "loss": 1.02, "step": 334 }, { "epoch": 0.41180086047940995, "grad_norm": 0.3843994438648224, "learning_rate": 6.934794066369348e-05, "loss": 1.2173, "step": 335 }, { "epoch": 0.4130301167793485, "grad_norm": 0.45338544249534607, "learning_rate": 6.915794409238718e-05, "loss": 1.3614, "step": 336 }, { "epoch": 0.414259373079287, "grad_norm": 0.4857298731803894, "learning_rate": 6.896762273384178e-05, "loss": 1.0175, "step": 337 }, { "epoch": 0.4154886293792256, "grad_norm": 0.6512896418571472, "learning_rate": 6.877697981460125e-05, "loss": 0.6555, "step": 338 }, { "epoch": 0.4167178856791641, "grad_norm": 0.6744720935821533, "learning_rate": 6.858601856666094e-05, "loss": 0.6057, "step": 339 }, { "epoch": 0.41794714197910265, "grad_norm": 0.6527014374732971, "learning_rate": 6.839474222741299e-05, "loss": 0.9116, "step": 340 }, { "epoch": 0.4191763982790412, "grad_norm": 0.6935631036758423, "learning_rate": 6.820315403959123e-05, "loss": 0.9876, "step": 341 }, { "epoch": 0.4204056545789797, "grad_norm": 0.6856899261474609, "learning_rate": 6.801125725121636e-05, "loss": 0.9591, "step": 342 }, { "epoch": 0.42163491087891825, "grad_norm": 1.2577812671661377, "learning_rate": 6.781905511554079e-05, "loss": 1.3174, "step": 343 }, { "epoch": 0.4228641671788568, "grad_norm": 2.421950578689575, "learning_rate": 6.762655089099353e-05, "loss": 1.6442, "step": 344 }, { "epoch": 0.4240934234787953, "grad_norm": 2.6432454586029053, "learning_rate": 6.743374784112501e-05, "loss": 1.0468, "step": 345 }, { "epoch": 0.4253226797787339, "grad_norm": 2.7061827182769775, "learning_rate": 6.724064923455155e-05, "loss": 1.1526, "step": 346 }, { "epoch": 0.4265519360786724, "grad_norm": 2.466057777404785, "learning_rate": 6.704725834490024e-05, "loss": 1.1463, "step": 347 }, { "epoch": 0.42778119237861095, "grad_norm": 2.753512144088745, "learning_rate": 6.685357845075315e-05, "loss": 0.9492, "step": 348 }, { "epoch": 0.4290104486785495, "grad_norm": 2.76118803024292, "learning_rate": 6.665961283559197e-05, "loss": 0.8543, "step": 349 }, { "epoch": 0.430239704978488, "grad_norm": 2.295574426651001, "learning_rate": 6.646536478774222e-05, "loss": 0.9564, "step": 350 }, { "epoch": 0.43146896127842654, "grad_norm": 0.22731368243694305, "learning_rate": 6.627083760031754e-05, "loss": 0.9719, "step": 351 }, { "epoch": 0.4326982175783651, "grad_norm": 0.20097078382968903, "learning_rate": 6.60760345711639e-05, "loss": 1.0094, "step": 352 }, { "epoch": 0.4339274738783036, "grad_norm": 0.23321934044361115, "learning_rate": 6.58809590028036e-05, "loss": 1.101, "step": 353 }, { "epoch": 0.4351567301782422, "grad_norm": 0.27995625138282776, "learning_rate": 6.568561420237935e-05, "loss": 1.3545, "step": 354 }, { "epoch": 0.4363859864781807, "grad_norm": 0.259082168340683, "learning_rate": 6.54900034815982e-05, "loss": 1.1598, "step": 355 }, { "epoch": 0.43761524277811925, "grad_norm": 0.2688703238964081, "learning_rate": 6.52941301566754e-05, "loss": 1.1141, "step": 356 }, { "epoch": 0.4388444990780578, "grad_norm": 0.34018442034721375, "learning_rate": 6.50979975482781e-05, "loss": 1.2811, "step": 357 }, { "epoch": 0.4400737553779963, "grad_norm": 0.2925175130367279, "learning_rate": 6.490160898146918e-05, "loss": 0.9025, "step": 358 }, { "epoch": 0.44130301167793484, "grad_norm": 0.30208972096443176, "learning_rate": 6.470496778565082e-05, "loss": 1.0301, "step": 359 }, { "epoch": 0.4425322679778734, "grad_norm": 0.3110770285129547, "learning_rate": 6.4508077294508e-05, "loss": 1.0911, "step": 360 }, { "epoch": 0.4437615242778119, "grad_norm": 0.426252543926239, "learning_rate": 6.431094084595209e-05, "loss": 1.1214, "step": 361 }, { "epoch": 0.44499078057775043, "grad_norm": 0.4019356966018677, "learning_rate": 6.411356178206419e-05, "loss": 1.3063, "step": 362 }, { "epoch": 0.446220036877689, "grad_norm": 0.4622703194618225, "learning_rate": 6.391594344903848e-05, "loss": 1.1208, "step": 363 }, { "epoch": 0.44744929317762755, "grad_norm": 0.5752270817756653, "learning_rate": 6.371808919712549e-05, "loss": 0.9653, "step": 364 }, { "epoch": 0.4486785494775661, "grad_norm": 0.6309720277786255, "learning_rate": 6.35200023805754e-05, "loss": 0.5664, "step": 365 }, { "epoch": 0.4499078057775046, "grad_norm": 0.612684965133667, "learning_rate": 6.332168635758097e-05, "loss": 1.0443, "step": 366 }, { "epoch": 0.45113706207744314, "grad_norm": 0.6797056794166565, "learning_rate": 6.31231444902208e-05, "loss": 0.8389, "step": 367 }, { "epoch": 0.45236631837738167, "grad_norm": 1.222960352897644, "learning_rate": 6.292438014440227e-05, "loss": 1.4688, "step": 368 }, { "epoch": 0.4535955746773202, "grad_norm": 2.9443516731262207, "learning_rate": 6.272539668980441e-05, "loss": 1.0079, "step": 369 }, { "epoch": 0.45482483097725873, "grad_norm": 3.0168612003326416, "learning_rate": 6.252619749982089e-05, "loss": 0.9232, "step": 370 }, { "epoch": 0.4560540872771973, "grad_norm": 1.9470983743667603, "learning_rate": 6.232678595150275e-05, "loss": 0.8126, "step": 371 }, { "epoch": 0.45728334357713585, "grad_norm": 2.4769911766052246, "learning_rate": 6.212716542550112e-05, "loss": 0.7786, "step": 372 }, { "epoch": 0.4585125998770744, "grad_norm": 2.849158525466919, "learning_rate": 6.192733930601005e-05, "loss": 1.1914, "step": 373 }, { "epoch": 0.4597418561770129, "grad_norm": 2.6154119968414307, "learning_rate": 6.172731098070899e-05, "loss": 0.9171, "step": 374 }, { "epoch": 0.46097111247695144, "grad_norm": 3.5901479721069336, "learning_rate": 6.152708384070541e-05, "loss": 1.1269, "step": 375 }, { "epoch": 0.46220036877688997, "grad_norm": 0.23536159098148346, "learning_rate": 6.132666128047732e-05, "loss": 0.8768, "step": 376 }, { "epoch": 0.4634296250768285, "grad_norm": 0.24834086000919342, "learning_rate": 6.112604669781572e-05, "loss": 1.0644, "step": 377 }, { "epoch": 0.46465888137676703, "grad_norm": 0.3041445016860962, "learning_rate": 6.0925243493767016e-05, "loss": 1.2779, "step": 378 }, { "epoch": 0.4658881376767056, "grad_norm": 0.3158765137195587, "learning_rate": 6.0724255072575275e-05, "loss": 1.352, "step": 379 }, { "epoch": 0.46711739397664415, "grad_norm": 0.2845201790332794, "learning_rate": 6.0523084841624635e-05, "loss": 1.2567, "step": 380 }, { "epoch": 0.4683466502765827, "grad_norm": 0.2909673750400543, "learning_rate": 6.0321736211381464e-05, "loss": 1.1735, "step": 381 }, { "epoch": 0.4695759065765212, "grad_norm": 0.2946690022945404, "learning_rate": 6.0120212595336545e-05, "loss": 1.1514, "step": 382 }, { "epoch": 0.47080516287645974, "grad_norm": 0.302846223115921, "learning_rate": 5.9918517409947215e-05, "loss": 1.0621, "step": 383 }, { "epoch": 0.47203441917639827, "grad_norm": 0.3197241425514221, "learning_rate": 5.971665407457948e-05, "loss": 1.0299, "step": 384 }, { "epoch": 0.4732636754763368, "grad_norm": 0.342777281999588, "learning_rate": 5.951462601144998e-05, "loss": 1.0858, "step": 385 }, { "epoch": 0.47449293177627533, "grad_norm": 0.3554008901119232, "learning_rate": 5.931243664556803e-05, "loss": 1.1441, "step": 386 }, { "epoch": 0.4757221880762139, "grad_norm": 0.36057665944099426, "learning_rate": 5.9110089404677524e-05, "loss": 1.1836, "step": 387 }, { "epoch": 0.47695144437615244, "grad_norm": 0.5004509091377258, "learning_rate": 5.890758771919884e-05, "loss": 1.4109, "step": 388 }, { "epoch": 0.478180700676091, "grad_norm": 0.5211744904518127, "learning_rate": 5.8704935022170684e-05, "loss": 1.0097, "step": 389 }, { "epoch": 0.4794099569760295, "grad_norm": 0.7474620938301086, "learning_rate": 5.8502134749191816e-05, "loss": 0.8777, "step": 390 }, { "epoch": 0.48063921327596804, "grad_norm": 0.7044636011123657, "learning_rate": 5.8299190338362996e-05, "loss": 0.9007, "step": 391 }, { "epoch": 0.48186846957590657, "grad_norm": 0.6484948992729187, "learning_rate": 5.8096105230228435e-05, "loss": 0.8261, "step": 392 }, { "epoch": 0.4830977258758451, "grad_norm": 0.672816812992096, "learning_rate": 5.78928828677177e-05, "loss": 1.0531, "step": 393 }, { "epoch": 0.4843269821757836, "grad_norm": 1.1637803316116333, "learning_rate": 5.768952669608724e-05, "loss": 1.1586, "step": 394 }, { "epoch": 0.4855562384757222, "grad_norm": 3.1862003803253174, "learning_rate": 5.748604016286192e-05, "loss": 1.6232, "step": 395 }, { "epoch": 0.48678549477566074, "grad_norm": 3.3833253383636475, "learning_rate": 5.728242671777672e-05, "loss": 1.0918, "step": 396 }, { "epoch": 0.4880147510755993, "grad_norm": 3.116319417953491, "learning_rate": 5.707868981271815e-05, "loss": 0.8615, "step": 397 }, { "epoch": 0.4892440073755378, "grad_norm": 2.5967965126037598, "learning_rate": 5.687483290166573e-05, "loss": 0.8579, "step": 398 }, { "epoch": 0.49047326367547633, "grad_norm": 3.7683048248291016, "learning_rate": 5.6670859440633486e-05, "loss": 1.0777, "step": 399 }, { "epoch": 0.49170251997541486, "grad_norm": 3.182317018508911, "learning_rate": 5.646677288761132e-05, "loss": 0.8932, "step": 400 }, { "epoch": 0.4929317762753534, "grad_norm": 0.19372360408306122, "learning_rate": 5.6262576702506406e-05, "loss": 0.8516, "step": 401 }, { "epoch": 0.4941610325752919, "grad_norm": 0.223338320851326, "learning_rate": 5.6058274347084504e-05, "loss": 1.1287, "step": 402 }, { "epoch": 0.4953902888752305, "grad_norm": 0.23367400467395782, "learning_rate": 5.585386928491134e-05, "loss": 1.1128, "step": 403 }, { "epoch": 0.49661954517516904, "grad_norm": 0.2717371881008148, "learning_rate": 5.5649364981293786e-05, "loss": 1.2813, "step": 404 }, { "epoch": 0.49784880147510757, "grad_norm": 0.25909724831581116, "learning_rate": 5.54447649032212e-05, "loss": 1.2149, "step": 405 }, { "epoch": 0.4990780577750461, "grad_norm": 0.25411197543144226, "learning_rate": 5.5240072519306606e-05, "loss": 1.0679, "step": 406 }, { "epoch": 0.5003073140749846, "grad_norm": 0.2962128520011902, "learning_rate": 5.503529129972792e-05, "loss": 1.1156, "step": 407 }, { "epoch": 0.5015365703749232, "grad_norm": 0.29252949357032776, "learning_rate": 5.483042471616908e-05, "loss": 1.125, "step": 408 }, { "epoch": 0.5015365703749232, "eval_loss": 0.9695894122123718, "eval_runtime": 65.3254, "eval_samples_per_second": 10.486, "eval_steps_per_second": 5.251, "step": 408 }, { "epoch": 0.5027658266748617, "grad_norm": 0.31012120842933655, "learning_rate": 5.4625476241761196e-05, "loss": 1.1491, "step": 409 }, { "epoch": 0.5039950829748002, "grad_norm": 0.3182609975337982, "learning_rate": 5.442044935102375e-05, "loss": 1.0786, "step": 410 }, { "epoch": 0.5052243392747388, "grad_norm": 0.35413217544555664, "learning_rate": 5.421534751980556e-05, "loss": 1.2194, "step": 411 }, { "epoch": 0.5064535955746773, "grad_norm": 0.39505264163017273, "learning_rate": 5.401017422522594e-05, "loss": 1.0296, "step": 412 }, { "epoch": 0.5076828518746158, "grad_norm": 0.4267808496952057, "learning_rate": 5.380493294561573e-05, "loss": 1.0906, "step": 413 }, { "epoch": 0.5089121081745543, "grad_norm": 0.5495923161506653, "learning_rate": 5.359962716045835e-05, "loss": 1.2171, "step": 414 }, { "epoch": 0.510141364474493, "grad_norm": 0.6385165452957153, "learning_rate": 5.3394260350330796e-05, "loss": 0.8179, "step": 415 }, { "epoch": 0.5113706207744315, "grad_norm": 0.7211401462554932, "learning_rate": 5.318883599684456e-05, "loss": 0.7624, "step": 416 }, { "epoch": 0.51259987707437, "grad_norm": 0.800375759601593, "learning_rate": 5.298335758258678e-05, "loss": 0.9597, "step": 417 }, { "epoch": 0.5138291333743086, "grad_norm": 0.9896557927131653, "learning_rate": 5.2777828591060984e-05, "loss": 1.12, "step": 418 }, { "epoch": 0.5150583896742471, "grad_norm": 2.869112968444824, "learning_rate": 5.257225250662823e-05, "loss": 1.2992, "step": 419 }, { "epoch": 0.5162876459741856, "grad_norm": 3.394543409347534, "learning_rate": 5.236663281444791e-05, "loss": 1.298, "step": 420 }, { "epoch": 0.5175169022741242, "grad_norm": 3.3043065071105957, "learning_rate": 5.21609730004187e-05, "loss": 1.1924, "step": 421 }, { "epoch": 0.5187461585740627, "grad_norm": 2.603189706802368, "learning_rate": 5.1955276551119495e-05, "loss": 1.2155, "step": 422 }, { "epoch": 0.5199754148740012, "grad_norm": 3.018608808517456, "learning_rate": 5.174954695375023e-05, "loss": 1.2001, "step": 423 }, { "epoch": 0.5212046711739398, "grad_norm": 1.8929078578948975, "learning_rate": 5.154378769607286e-05, "loss": 0.7124, "step": 424 }, { "epoch": 0.5224339274738783, "grad_norm": 2.4811208248138428, "learning_rate": 5.1338002266352106e-05, "loss": 0.8491, "step": 425 }, { "epoch": 0.5236631837738168, "grad_norm": 0.22651559114456177, "learning_rate": 5.113219415329645e-05, "loss": 0.9983, "step": 426 }, { "epoch": 0.5248924400737554, "grad_norm": 0.2817627787590027, "learning_rate": 5.0926366845998904e-05, "loss": 1.1314, "step": 427 }, { "epoch": 0.5261216963736939, "grad_norm": 0.27301520109176636, "learning_rate": 5.072052383387786e-05, "loss": 1.2722, "step": 428 }, { "epoch": 0.5273509526736324, "grad_norm": 0.2770383059978485, "learning_rate": 5.0514668606618e-05, "loss": 1.1654, "step": 429 }, { "epoch": 0.5285802089735709, "grad_norm": 0.3147628903388977, "learning_rate": 5.0308804654111056e-05, "loss": 1.0896, "step": 430 }, { "epoch": 0.5298094652735095, "grad_norm": 0.305074542760849, "learning_rate": 5.01029354663967e-05, "loss": 1.1754, "step": 431 }, { "epoch": 0.5310387215734481, "grad_norm": 0.3240547776222229, "learning_rate": 4.9897064533603315e-05, "loss": 1.2413, "step": 432 }, { "epoch": 0.5322679778733866, "grad_norm": 0.34143832325935364, "learning_rate": 4.9691195345888956e-05, "loss": 1.1504, "step": 433 }, { "epoch": 0.5334972341733252, "grad_norm": 0.3945367634296417, "learning_rate": 4.948533139338202e-05, "loss": 1.2914, "step": 434 }, { "epoch": 0.5347264904732637, "grad_norm": 0.371367871761322, "learning_rate": 4.927947616612215e-05, "loss": 1.3269, "step": 435 }, { "epoch": 0.5359557467732022, "grad_norm": 0.4348413646221161, "learning_rate": 4.90736331540011e-05, "loss": 1.2061, "step": 436 }, { "epoch": 0.5371850030731408, "grad_norm": 0.5112064480781555, "learning_rate": 4.886780584670356e-05, "loss": 1.2283, "step": 437 }, { "epoch": 0.5384142593730793, "grad_norm": 0.6409791707992554, "learning_rate": 4.866199773364789e-05, "loss": 0.7663, "step": 438 }, { "epoch": 0.5396435156730178, "grad_norm": 0.7076222896575928, "learning_rate": 4.845621230392716e-05, "loss": 0.8073, "step": 439 }, { "epoch": 0.5408727719729564, "grad_norm": 0.6545782089233398, "learning_rate": 4.825045304624978e-05, "loss": 0.762, "step": 440 }, { "epoch": 0.5421020282728949, "grad_norm": 0.6853161454200745, "learning_rate": 4.804472344888052e-05, "loss": 0.9905, "step": 441 }, { "epoch": 0.5433312845728334, "grad_norm": 0.6758946776390076, "learning_rate": 4.7839026999581296e-05, "loss": 0.9353, "step": 442 }, { "epoch": 0.544560540872772, "grad_norm": 1.1472845077514648, "learning_rate": 4.7633367185552095e-05, "loss": 0.9796, "step": 443 }, { "epoch": 0.5457897971727105, "grad_norm": 2.543483018875122, "learning_rate": 4.742774749337179e-05, "loss": 1.5897, "step": 444 }, { "epoch": 0.547019053472649, "grad_norm": 3.029282331466675, "learning_rate": 4.7222171408939034e-05, "loss": 1.2527, "step": 445 }, { "epoch": 0.5482483097725875, "grad_norm": 3.1332318782806396, "learning_rate": 4.701664241741323e-05, "loss": 1.1093, "step": 446 }, { "epoch": 0.5494775660725261, "grad_norm": 2.407163381576538, "learning_rate": 4.681116400315544e-05, "loss": 0.75, "step": 447 }, { "epoch": 0.5507068223724647, "grad_norm": 2.622605085372925, "learning_rate": 4.6605739649669236e-05, "loss": 0.8924, "step": 448 }, { "epoch": 0.5519360786724032, "grad_norm": 2.5338971614837646, "learning_rate": 4.640037283954165e-05, "loss": 0.681, "step": 449 }, { "epoch": 0.5531653349723418, "grad_norm": 2.4635331630706787, "learning_rate": 4.619506705438428e-05, "loss": 0.7866, "step": 450 }, { "epoch": 0.5543945912722803, "grad_norm": 0.18258269131183624, "learning_rate": 4.598982577477408e-05, "loss": 0.8069, "step": 451 }, { "epoch": 0.5556238475722188, "grad_norm": 0.23453976213932037, "learning_rate": 4.578465248019445e-05, "loss": 1.1606, "step": 452 }, { "epoch": 0.5568531038721574, "grad_norm": 0.25334998965263367, "learning_rate": 4.557955064897626e-05, "loss": 1.2323, "step": 453 }, { "epoch": 0.5580823601720959, "grad_norm": 0.2757891118526459, "learning_rate": 4.537452375823881e-05, "loss": 1.4173, "step": 454 }, { "epoch": 0.5593116164720344, "grad_norm": 0.2755081355571747, "learning_rate": 4.5169575283830936e-05, "loss": 1.2255, "step": 455 }, { "epoch": 0.560540872771973, "grad_norm": 0.29173940420150757, "learning_rate": 4.496470870027209e-05, "loss": 1.4528, "step": 456 }, { "epoch": 0.5617701290719115, "grad_norm": 0.27483510971069336, "learning_rate": 4.475992748069339e-05, "loss": 0.7703, "step": 457 }, { "epoch": 0.56299938537185, "grad_norm": 0.3180837333202362, "learning_rate": 4.455523509677882e-05, "loss": 1.025, "step": 458 }, { "epoch": 0.5642286416717885, "grad_norm": 0.3103027939796448, "learning_rate": 4.435063501870622e-05, "loss": 1.0503, "step": 459 }, { "epoch": 0.5654578979717271, "grad_norm": 0.3448195457458496, "learning_rate": 4.4146130715088676e-05, "loss": 1.0366, "step": 460 }, { "epoch": 0.5666871542716656, "grad_norm": 0.3892152011394501, "learning_rate": 4.3941725652915494e-05, "loss": 1.1834, "step": 461 }, { "epoch": 0.5679164105716041, "grad_norm": 0.4770027697086334, "learning_rate": 4.373742329749362e-05, "loss": 1.4524, "step": 462 }, { "epoch": 0.5691456668715427, "grad_norm": 0.5856226682662964, "learning_rate": 4.3533227112388694e-05, "loss": 1.2538, "step": 463 }, { "epoch": 0.5703749231714813, "grad_norm": 0.6203077435493469, "learning_rate": 4.332914055936653e-05, "loss": 0.6276, "step": 464 }, { "epoch": 0.5716041794714198, "grad_norm": 0.6294959187507629, "learning_rate": 4.3125167098334286e-05, "loss": 0.642, "step": 465 }, { "epoch": 0.5728334357713584, "grad_norm": 0.7135624885559082, "learning_rate": 4.2921310187281864e-05, "loss": 1.1486, "step": 466 }, { "epoch": 0.5740626920712969, "grad_norm": 0.7505180835723877, "learning_rate": 4.27175732822233e-05, "loss": 0.9109, "step": 467 }, { "epoch": 0.5752919483712354, "grad_norm": 1.0719517469406128, "learning_rate": 4.251395983713809e-05, "loss": 1.2909, "step": 468 }, { "epoch": 0.576521204671174, "grad_norm": 2.5531132221221924, "learning_rate": 4.231047330391278e-05, "loss": 1.1723, "step": 469 }, { "epoch": 0.5777504609711125, "grad_norm": 3.104837656021118, "learning_rate": 4.21071171322823e-05, "loss": 1.3296, "step": 470 }, { "epoch": 0.578979717271051, "grad_norm": 3.239119052886963, "learning_rate": 4.190389476977156e-05, "loss": 0.8768, "step": 471 }, { "epoch": 0.5802089735709896, "grad_norm": 2.681823968887329, "learning_rate": 4.170080966163702e-05, "loss": 0.8809, "step": 472 }, { "epoch": 0.5814382298709281, "grad_norm": 2.557533025741577, "learning_rate": 4.149786525080819e-05, "loss": 0.8249, "step": 473 }, { "epoch": 0.5826674861708666, "grad_norm": 2.4591336250305176, "learning_rate": 4.1295064977829334e-05, "loss": 0.7384, "step": 474 }, { "epoch": 0.5838967424708051, "grad_norm": 2.414395809173584, "learning_rate": 4.109241228080115e-05, "loss": 0.7307, "step": 475 }, { "epoch": 0.5851259987707437, "grad_norm": 0.2034798115491867, "learning_rate": 4.088991059532248e-05, "loss": 0.8326, "step": 476 }, { "epoch": 0.5863552550706822, "grad_norm": 0.24682076275348663, "learning_rate": 4.0687563354431984e-05, "loss": 1.3382, "step": 477 }, { "epoch": 0.5875845113706207, "grad_norm": 0.2430254966020584, "learning_rate": 4.048537398855003e-05, "loss": 1.2718, "step": 478 }, { "epoch": 0.5888137676705593, "grad_norm": 0.27249017357826233, "learning_rate": 4.028334592542054e-05, "loss": 1.2328, "step": 479 }, { "epoch": 0.5900430239704979, "grad_norm": 0.28996631503105164, "learning_rate": 4.008148259005279e-05, "loss": 1.3963, "step": 480 }, { "epoch": 0.5912722802704364, "grad_norm": 0.28658372163772583, "learning_rate": 3.9879787404663474e-05, "loss": 1.2326, "step": 481 }, { "epoch": 0.592501536570375, "grad_norm": 0.3246002495288849, "learning_rate": 3.967826378861854e-05, "loss": 1.1071, "step": 482 }, { "epoch": 0.5937307928703135, "grad_norm": 0.33610400557518005, "learning_rate": 3.947691515837537e-05, "loss": 1.1798, "step": 483 }, { "epoch": 0.594960049170252, "grad_norm": 0.35427945852279663, "learning_rate": 3.927574492742473e-05, "loss": 1.3277, "step": 484 }, { "epoch": 0.5961893054701906, "grad_norm": 0.3942146599292755, "learning_rate": 3.907475650623299e-05, "loss": 1.0499, "step": 485 }, { "epoch": 0.5974185617701291, "grad_norm": 0.36504921317100525, "learning_rate": 3.887395330218429e-05, "loss": 1.1876, "step": 486 }, { "epoch": 0.5986478180700676, "grad_norm": 0.45135965943336487, "learning_rate": 3.867333871952269e-05, "loss": 1.1775, "step": 487 }, { "epoch": 0.5998770743700061, "grad_norm": 0.6297984719276428, "learning_rate": 3.84729161592946e-05, "loss": 0.9756, "step": 488 }, { "epoch": 0.6011063306699447, "grad_norm": 0.6099340915679932, "learning_rate": 3.827268901929102e-05, "loss": 0.7669, "step": 489 }, { "epoch": 0.6023355869698832, "grad_norm": 0.7176752686500549, "learning_rate": 3.8072660693989967e-05, "loss": 0.5387, "step": 490 }, { "epoch": 0.6035648432698217, "grad_norm": 0.6262489557266235, "learning_rate": 3.78728345744989e-05, "loss": 0.9306, "step": 491 }, { "epoch": 0.6047940995697603, "grad_norm": 1.0194954872131348, "learning_rate": 3.767321404849727e-05, "loss": 1.1677, "step": 492 }, { "epoch": 0.6060233558696988, "grad_norm": 1.6236090660095215, "learning_rate": 3.7473802500179114e-05, "loss": 1.0458, "step": 493 }, { "epoch": 0.6072526121696373, "grad_norm": 2.5714359283447266, "learning_rate": 3.727460331019559e-05, "loss": 1.2519, "step": 494 }, { "epoch": 0.6084818684695759, "grad_norm": 2.4615423679351807, "learning_rate": 3.7075619855597744e-05, "loss": 0.5842, "step": 495 }, { "epoch": 0.6097111247695144, "grad_norm": 3.4131109714508057, "learning_rate": 3.6876855509779206e-05, "loss": 1.3071, "step": 496 }, { "epoch": 0.610940381069453, "grad_norm": 3.461987257003784, "learning_rate": 3.667831364241904e-05, "loss": 1.1426, "step": 497 }, { "epoch": 0.6121696373693916, "grad_norm": 2.462989330291748, "learning_rate": 3.6479997619424605e-05, "loss": 0.8421, "step": 498 }, { "epoch": 0.6133988936693301, "grad_norm": 2.423410654067993, "learning_rate": 3.628191080287451e-05, "loss": 0.7845, "step": 499 }, { "epoch": 0.6146281499692686, "grad_norm": 2.5741212368011475, "learning_rate": 3.608405655096154e-05, "loss": 0.6916, "step": 500 }, { "epoch": 0.6158574062692072, "grad_norm": 0.2071269154548645, "learning_rate": 3.588643821793582e-05, "loss": 0.9648, "step": 501 }, { "epoch": 0.6170866625691457, "grad_norm": 0.23868902027606964, "learning_rate": 3.5689059154047915e-05, "loss": 1.261, "step": 502 }, { "epoch": 0.6183159188690842, "grad_norm": 0.2465473711490631, "learning_rate": 3.5491922705492e-05, "loss": 1.2999, "step": 503 }, { "epoch": 0.6195451751690227, "grad_norm": 0.25372591614723206, "learning_rate": 3.5295032214349196e-05, "loss": 1.3633, "step": 504 }, { "epoch": 0.6207744314689613, "grad_norm": 0.24923810362815857, "learning_rate": 3.5098391018530816e-05, "loss": 1.1408, "step": 505 }, { "epoch": 0.6220036877688998, "grad_norm": 0.2594156563282013, "learning_rate": 3.4902002451721916e-05, "loss": 1.158, "step": 506 }, { "epoch": 0.6232329440688383, "grad_norm": 0.28343746066093445, "learning_rate": 3.4705869843324614e-05, "loss": 1.1717, "step": 507 }, { "epoch": 0.6244622003687769, "grad_norm": 0.29203376173973083, "learning_rate": 3.450999651840179e-05, "loss": 1.0693, "step": 508 }, { "epoch": 0.6256914566687154, "grad_norm": 0.3236950933933258, "learning_rate": 3.431438579762066e-05, "loss": 1.0098, "step": 509 }, { "epoch": 0.6269207129686539, "grad_norm": 0.3463621139526367, "learning_rate": 3.411904099719642e-05, "loss": 1.1123, "step": 510 }, { "epoch": 0.6281499692685925, "grad_norm": 0.4081440269947052, "learning_rate": 3.3923965428836105e-05, "loss": 1.1303, "step": 511 }, { "epoch": 0.629379225568531, "grad_norm": 0.38460928201675415, "learning_rate": 3.3729162399682456e-05, "loss": 1.0858, "step": 512 }, { "epoch": 0.6306084818684696, "grad_norm": 0.474970281124115, "learning_rate": 3.35346352122578e-05, "loss": 1.1176, "step": 513 }, { "epoch": 0.6318377381684082, "grad_norm": 0.7059311270713806, "learning_rate": 3.3340387164408046e-05, "loss": 1.1786, "step": 514 }, { "epoch": 0.6330669944683467, "grad_norm": 0.7394426465034485, "learning_rate": 3.314642154924686e-05, "loss": 0.7836, "step": 515 }, { "epoch": 0.6342962507682852, "grad_norm": 0.766044020652771, "learning_rate": 3.295274165509979e-05, "loss": 0.8919, "step": 516 }, { "epoch": 0.6355255070682237, "grad_norm": 0.7920165061950684, "learning_rate": 3.275935076544845e-05, "loss": 1.0439, "step": 517 }, { "epoch": 0.6367547633681623, "grad_norm": 1.224188208580017, "learning_rate": 3.256625215887502e-05, "loss": 1.3103, "step": 518 }, { "epoch": 0.6379840196681008, "grad_norm": 2.0128793716430664, "learning_rate": 3.237344910900648e-05, "loss": 1.0804, "step": 519 }, { "epoch": 0.6392132759680393, "grad_norm": 3.370530366897583, "learning_rate": 3.218094488445923e-05, "loss": 0.5978, "step": 520 }, { "epoch": 0.6404425322679779, "grad_norm": 3.095421314239502, "learning_rate": 3.198874274878365e-05, "loss": 0.847, "step": 521 }, { "epoch": 0.6416717885679164, "grad_norm": 3.4100868701934814, "learning_rate": 3.179684596040878e-05, "loss": 0.8489, "step": 522 }, { "epoch": 0.6429010448678549, "grad_norm": 2.608668088912964, "learning_rate": 3.1605257772587035e-05, "loss": 0.8378, "step": 523 }, { "epoch": 0.6441303011677935, "grad_norm": 2.300096273422241, "learning_rate": 3.141398143333907e-05, "loss": 0.7825, "step": 524 }, { "epoch": 0.645359557467732, "grad_norm": 2.652704954147339, "learning_rate": 3.1223020185398765e-05, "loss": 0.6505, "step": 525 }, { "epoch": 0.6465888137676705, "grad_norm": 0.19123272597789764, "learning_rate": 3.103237726615822e-05, "loss": 0.8317, "step": 526 }, { "epoch": 0.647818070067609, "grad_norm": 0.22544066607952118, "learning_rate": 3.084205590761284e-05, "loss": 1.1174, "step": 527 }, { "epoch": 0.6490473263675476, "grad_norm": 0.25559771060943604, "learning_rate": 3.065205933630655e-05, "loss": 1.258, "step": 528 }, { "epoch": 0.6502765826674862, "grad_norm": 0.2453518509864807, "learning_rate": 3.0462390773277154e-05, "loss": 1.1197, "step": 529 }, { "epoch": 0.6515058389674248, "grad_norm": 0.26897767186164856, "learning_rate": 3.0273053434001662e-05, "loss": 1.2231, "step": 530 }, { "epoch": 0.6527350952673633, "grad_norm": 0.2956353425979614, "learning_rate": 3.0084050528341824e-05, "loss": 0.9809, "step": 531 }, { "epoch": 0.6539643515673018, "grad_norm": 0.3043264150619507, "learning_rate": 2.989538526048968e-05, "loss": 1.2625, "step": 532 }, { "epoch": 0.6551936078672403, "grad_norm": 0.3115137219429016, "learning_rate": 2.9707060828913225e-05, "loss": 1.1544, "step": 533 }, { "epoch": 0.6564228641671789, "grad_norm": 0.29051673412323, "learning_rate": 2.9519080426302238e-05, "loss": 0.8664, "step": 534 }, { "epoch": 0.6576521204671174, "grad_norm": 0.31704720854759216, "learning_rate": 2.933144723951414e-05, "loss": 0.8553, "step": 535 }, { "epoch": 0.6588813767670559, "grad_norm": 0.3343563675880432, "learning_rate": 2.9144164449519917e-05, "loss": 1.042, "step": 536 }, { "epoch": 0.6601106330669945, "grad_norm": 0.411081999540329, "learning_rate": 2.895723523135028e-05, "loss": 1.2587, "step": 537 }, { "epoch": 0.661339889366933, "grad_norm": 0.4190918505191803, "learning_rate": 2.877066275404172e-05, "loss": 1.1546, "step": 538 }, { "epoch": 0.6625691456668715, "grad_norm": 0.5606351494789124, "learning_rate": 2.8584450180582912e-05, "loss": 0.9889, "step": 539 }, { "epoch": 0.6637984019668101, "grad_norm": 0.6339558959007263, "learning_rate": 2.839860066786103e-05, "loss": 0.5278, "step": 540 }, { "epoch": 0.6650276582667486, "grad_norm": 0.6629421710968018, "learning_rate": 2.8213117366608188e-05, "loss": 0.752, "step": 541 }, { "epoch": 0.6662569145666871, "grad_norm": 0.7566842436790466, "learning_rate": 2.802800342134807e-05, "loss": 1.0395, "step": 542 }, { "epoch": 0.6674861708666256, "grad_norm": 1.169986605644226, "learning_rate": 2.784326197034266e-05, "loss": 1.1942, "step": 543 }, { "epoch": 0.6687154271665642, "grad_norm": 2.78249192237854, "learning_rate": 2.7658896145538983e-05, "loss": 1.1694, "step": 544 }, { "epoch": 0.6699446834665027, "grad_norm": 2.5933189392089844, "learning_rate": 2.7474909072515993e-05, "loss": 0.9325, "step": 545 }, { "epoch": 0.6711739397664414, "grad_norm": 2.37711238861084, "learning_rate": 2.7291303870431662e-05, "loss": 0.9246, "step": 546 }, { "epoch": 0.6724031960663799, "grad_norm": 3.3568241596221924, "learning_rate": 2.710808365197e-05, "loss": 0.9492, "step": 547 }, { "epoch": 0.6736324523663184, "grad_norm": 2.975985527038574, "learning_rate": 2.6925251523288346e-05, "loss": 0.8018, "step": 548 }, { "epoch": 0.6748617086662569, "grad_norm": 2.78823184967041, "learning_rate": 2.674281058396473e-05, "loss": 0.843, "step": 549 }, { "epoch": 0.6760909649661955, "grad_norm": 4.179988384246826, "learning_rate": 2.6560763926945275e-05, "loss": 1.0288, "step": 550 }, { "epoch": 0.677320221266134, "grad_norm": 0.21164661645889282, "learning_rate": 2.6379114638491807e-05, "loss": 0.9468, "step": 551 }, { "epoch": 0.6785494775660725, "grad_norm": 0.2241944521665573, "learning_rate": 2.6197865798129462e-05, "loss": 1.1578, "step": 552 }, { "epoch": 0.6797787338660111, "grad_norm": 0.25557973980903625, "learning_rate": 2.601702047859455e-05, "loss": 1.2432, "step": 553 }, { "epoch": 0.6810079901659496, "grad_norm": 0.2733787000179291, "learning_rate": 2.5836581745782475e-05, "loss": 1.2624, "step": 554 }, { "epoch": 0.6822372464658881, "grad_norm": 0.29645049571990967, "learning_rate": 2.5656552658695642e-05, "loss": 1.2544, "step": 555 }, { "epoch": 0.6834665027658267, "grad_norm": 0.29211127758026123, "learning_rate": 2.5476936269391726e-05, "loss": 1.0548, "step": 556 }, { "epoch": 0.6846957590657652, "grad_norm": 0.3013113737106323, "learning_rate": 2.5297735622931874e-05, "loss": 1.0941, "step": 557 }, { "epoch": 0.6859250153657037, "grad_norm": 0.33897635340690613, "learning_rate": 2.5118953757329088e-05, "loss": 1.0855, "step": 558 }, { "epoch": 0.6871542716656422, "grad_norm": 0.36230790615081787, "learning_rate": 2.494059370349673e-05, "loss": 1.0817, "step": 559 }, { "epoch": 0.6883835279655808, "grad_norm": 0.36360612511634827, "learning_rate": 2.4762658485197123e-05, "loss": 1.0868, "step": 560 }, { "epoch": 0.6896127842655193, "grad_norm": 0.45407038927078247, "learning_rate": 2.4585151118990286e-05, "loss": 1.1186, "step": 561 }, { "epoch": 0.690842040565458, "grad_norm": 0.5940976142883301, "learning_rate": 2.4408074614182773e-05, "loss": 1.1448, "step": 562 }, { "epoch": 0.6920712968653965, "grad_norm": 0.6550815105438232, "learning_rate": 2.4231431972776758e-05, "loss": 0.931, "step": 563 }, { "epoch": 0.693300553165335, "grad_norm": 0.6394757032394409, "learning_rate": 2.4055226189419018e-05, "loss": 0.8543, "step": 564 }, { "epoch": 0.6945298094652735, "grad_norm": 0.6903315186500549, "learning_rate": 2.3879460251350255e-05, "loss": 0.8152, "step": 565 }, { "epoch": 0.6957590657652121, "grad_norm": 0.9499403238296509, "learning_rate": 2.3704137138354355e-05, "loss": 1.2532, "step": 566 }, { "epoch": 0.6969883220651506, "grad_norm": 1.2395949363708496, "learning_rate": 2.3529259822708e-05, "loss": 1.3153, "step": 567 }, { "epoch": 0.6982175783650891, "grad_norm": 2.5209338665008545, "learning_rate": 2.3354831269130133e-05, "loss": 1.4484, "step": 568 }, { "epoch": 0.6994468346650277, "grad_norm": 3.072936773300171, "learning_rate": 2.318085443473185e-05, "loss": 0.9566, "step": 569 }, { "epoch": 0.7006760909649662, "grad_norm": 2.40030574798584, "learning_rate": 2.300733226896612e-05, "loss": 0.6958, "step": 570 }, { "epoch": 0.7019053472649047, "grad_norm": 3.1356310844421387, "learning_rate": 2.2834267713577904e-05, "loss": 1.261, "step": 571 }, { "epoch": 0.7031346035648433, "grad_norm": 2.5506911277770996, "learning_rate": 2.2661663702554208e-05, "loss": 0.7489, "step": 572 }, { "epoch": 0.7043638598647818, "grad_norm": 2.605987548828125, "learning_rate": 2.2489523162074393e-05, "loss": 0.8642, "step": 573 }, { "epoch": 0.7055931161647203, "grad_norm": 2.3878705501556396, "learning_rate": 2.2317849010460507e-05, "loss": 0.559, "step": 574 }, { "epoch": 0.7068223724646588, "grad_norm": 4.309600353240967, "learning_rate": 2.2146644158127827e-05, "loss": 1.3898, "step": 575 }, { "epoch": 0.7080516287645974, "grad_norm": 0.2340036779642105, "learning_rate": 2.197591150753559e-05, "loss": 1.1237, "step": 576 }, { "epoch": 0.7092808850645359, "grad_norm": 0.2404404878616333, "learning_rate": 2.1805653953137707e-05, "loss": 1.2235, "step": 577 }, { "epoch": 0.7105101413644745, "grad_norm": 0.2546350657939911, "learning_rate": 2.1635874381333714e-05, "loss": 1.1931, "step": 578 }, { "epoch": 0.7117393976644131, "grad_norm": 0.28081899881362915, "learning_rate": 2.1466575670419876e-05, "loss": 1.1851, "step": 579 }, { "epoch": 0.7129686539643516, "grad_norm": 0.27532532811164856, "learning_rate": 2.1297760690540302e-05, "loss": 1.0733, "step": 580 }, { "epoch": 0.7141979102642901, "grad_norm": 0.26407289505004883, "learning_rate": 2.1129432303638352e-05, "loss": 1.0863, "step": 581 }, { "epoch": 0.7154271665642287, "grad_norm": 0.31387853622436523, "learning_rate": 2.0961593363408156e-05, "loss": 1.1808, "step": 582 }, { "epoch": 0.7166564228641672, "grad_norm": 0.3309866487979889, "learning_rate": 2.079424671524616e-05, "loss": 1.0741, "step": 583 }, { "epoch": 0.7178856791641057, "grad_norm": 0.300689697265625, "learning_rate": 2.0627395196202898e-05, "loss": 0.9483, "step": 584 }, { "epoch": 0.7191149354640443, "grad_norm": 0.3478316366672516, "learning_rate": 2.046104163493493e-05, "loss": 1.144, "step": 585 }, { "epoch": 0.7203441917639828, "grad_norm": 0.44858890771865845, "learning_rate": 2.0295188851656892e-05, "loss": 1.1384, "step": 586 }, { "epoch": 0.7215734480639213, "grad_norm": 0.4991127550601959, "learning_rate": 2.0129839658093607e-05, "loss": 1.3527, "step": 587 }, { "epoch": 0.7228027043638598, "grad_norm": 0.5499400496482849, "learning_rate": 1.996499685743254e-05, "loss": 1.2116, "step": 588 }, { "epoch": 0.7240319606637984, "grad_norm": 0.6611432433128357, "learning_rate": 1.980066324427613e-05, "loss": 0.5055, "step": 589 }, { "epoch": 0.7252612169637369, "grad_norm": 0.7008240222930908, "learning_rate": 1.9636841604594557e-05, "loss": 0.7072, "step": 590 }, { "epoch": 0.7264904732636754, "grad_norm": 0.7243725061416626, "learning_rate": 1.9473534715678427e-05, "loss": 0.8031, "step": 591 }, { "epoch": 0.727719729563614, "grad_norm": 0.8577756285667419, "learning_rate": 1.9310745346091714e-05, "loss": 1.1383, "step": 592 }, { "epoch": 0.7289489858635525, "grad_norm": 1.2171465158462524, "learning_rate": 1.9148476255624764e-05, "loss": 0.9733, "step": 593 }, { "epoch": 0.7301782421634911, "grad_norm": 3.537208318710327, "learning_rate": 1.898673019524764e-05, "loss": 1.1197, "step": 594 }, { "epoch": 0.7314074984634297, "grad_norm": 2.420771360397339, "learning_rate": 1.8825509907063327e-05, "loss": 0.7744, "step": 595 }, { "epoch": 0.7326367547633682, "grad_norm": 3.1013336181640625, "learning_rate": 1.8664818124261374e-05, "loss": 0.7931, "step": 596 }, { "epoch": 0.7338660110633067, "grad_norm": 3.5393359661102295, "learning_rate": 1.8504657571071515e-05, "loss": 1.0527, "step": 597 }, { "epoch": 0.7350952673632453, "grad_norm": 2.782148838043213, "learning_rate": 1.8345030962717407e-05, "loss": 0.8034, "step": 598 }, { "epoch": 0.7363245236631838, "grad_norm": 2.12009596824646, "learning_rate": 1.8185941005370745e-05, "loss": 0.4618, "step": 599 }, { "epoch": 0.7375537799631223, "grad_norm": 3.6186935901641846, "learning_rate": 1.802739039610527e-05, "loss": 0.793, "step": 600 }, { "epoch": 0.7387830362630609, "grad_norm": 0.22188998758792877, "learning_rate": 1.786938182285107e-05, "loss": 1.1901, "step": 601 }, { "epoch": 0.7400122925629994, "grad_norm": 0.2288677990436554, "learning_rate": 1.7711917964349062e-05, "loss": 1.1755, "step": 602 }, { "epoch": 0.7412415488629379, "grad_norm": 0.25471025705337524, "learning_rate": 1.7555001490105488e-05, "loss": 1.2616, "step": 603 }, { "epoch": 0.7424708051628764, "grad_norm": 0.23840656876564026, "learning_rate": 1.7398635060346746e-05, "loss": 1.0882, "step": 604 }, { "epoch": 0.743700061462815, "grad_norm": 0.2534352242946625, "learning_rate": 1.7242821325974258e-05, "loss": 1.1328, "step": 605 }, { "epoch": 0.7449293177627535, "grad_norm": 0.279732346534729, "learning_rate": 1.7087562928519514e-05, "loss": 1.1703, "step": 606 }, { "epoch": 0.746158574062692, "grad_norm": 0.3084731698036194, "learning_rate": 1.69328625000993e-05, "loss": 1.1646, "step": 607 }, { "epoch": 0.7473878303626306, "grad_norm": 0.3052552044391632, "learning_rate": 1.6778722663371053e-05, "loss": 1.1481, "step": 608 }, { "epoch": 0.7486170866625691, "grad_norm": 0.3502384126186371, "learning_rate": 1.662514603148847e-05, "loss": 1.0333, "step": 609 }, { "epoch": 0.7498463429625076, "grad_norm": 0.3393804728984833, "learning_rate": 1.6472135208057126e-05, "loss": 1.0056, "step": 610 }, { "epoch": 0.7510755992624463, "grad_norm": 0.3970140814781189, "learning_rate": 1.631969278709041e-05, "loss": 1.1957, "step": 611 }, { "epoch": 0.7523048555623848, "grad_norm": 0.4345152974128723, "learning_rate": 1.616782135296544e-05, "loss": 1.0145, "step": 612 }, { "epoch": 0.7523048555623848, "eval_loss": 0.9307632446289062, "eval_runtime": 64.7426, "eval_samples_per_second": 10.58, "eval_steps_per_second": 5.298, "step": 612 }, { "epoch": 0.7535341118623233, "grad_norm": 0.45276275277137756, "learning_rate": 1.6016523480379382e-05, "loss": 0.8856, "step": 613 }, { "epoch": 0.7547633681622619, "grad_norm": 0.6476858854293823, "learning_rate": 1.5865801734305668e-05, "loss": 0.8316, "step": 614 }, { "epoch": 0.7559926244622004, "grad_norm": 0.6815754771232605, "learning_rate": 1.5715658669950634e-05, "loss": 0.6487, "step": 615 }, { "epoch": 0.7572218807621389, "grad_norm": 0.7536165118217468, "learning_rate": 1.5566096832710154e-05, "loss": 0.9835, "step": 616 }, { "epoch": 0.7584511370620775, "grad_norm": 1.0182470083236694, "learning_rate": 1.541711875812641e-05, "loss": 1.172, "step": 617 }, { "epoch": 0.759680393362016, "grad_norm": 2.240806818008423, "learning_rate": 1.5268726971845037e-05, "loss": 1.3291, "step": 618 }, { "epoch": 0.7609096496619545, "grad_norm": 3.0141806602478027, "learning_rate": 1.5120923989572244e-05, "loss": 0.8101, "step": 619 }, { "epoch": 0.762138905961893, "grad_norm": 3.7101502418518066, "learning_rate": 1.4973712317032135e-05, "loss": 1.0581, "step": 620 }, { "epoch": 0.7633681622618316, "grad_norm": 3.892627000808716, "learning_rate": 1.482709444992425e-05, "loss": 1.007, "step": 621 }, { "epoch": 0.7645974185617701, "grad_norm": 3.3281090259552, "learning_rate": 1.4681072873881312e-05, "loss": 0.7403, "step": 622 }, { "epoch": 0.7658266748617086, "grad_norm": 3.755115509033203, "learning_rate": 1.4535650064427003e-05, "loss": 0.9939, "step": 623 }, { "epoch": 0.7670559311616472, "grad_norm": 2.45129132270813, "learning_rate": 1.439082848693406e-05, "loss": 0.6902, "step": 624 }, { "epoch": 0.7682851874615857, "grad_norm": 3.0480661392211914, "learning_rate": 1.4246610596582444e-05, "loss": 0.6467, "step": 625 }, { "epoch": 0.7695144437615242, "grad_norm": 0.2060498744249344, "learning_rate": 1.41029988383177e-05, "loss": 0.9195, "step": 626 }, { "epoch": 0.7707437000614629, "grad_norm": 0.21947383880615234, "learning_rate": 1.3959995646809549e-05, "loss": 1.0644, "step": 627 }, { "epoch": 0.7719729563614014, "grad_norm": 0.26282572746276855, "learning_rate": 1.381760344641061e-05, "loss": 1.3907, "step": 628 }, { "epoch": 0.7732022126613399, "grad_norm": 0.2563353180885315, "learning_rate": 1.3675824651115276e-05, "loss": 1.2281, "step": 629 }, { "epoch": 0.7744314689612785, "grad_norm": 0.3003043532371521, "learning_rate": 1.3534661664518817e-05, "loss": 1.2247, "step": 630 }, { "epoch": 0.775660725261217, "grad_norm": 0.30270689725875854, "learning_rate": 1.339411687977657e-05, "loss": 1.3467, "step": 631 }, { "epoch": 0.7768899815611555, "grad_norm": 0.3199015259742737, "learning_rate": 1.325419267956346e-05, "loss": 1.1064, "step": 632 }, { "epoch": 0.778119237861094, "grad_norm": 0.33701401948928833, "learning_rate": 1.3114891436033522e-05, "loss": 1.1541, "step": 633 }, { "epoch": 0.7793484941610326, "grad_norm": 0.3596295714378357, "learning_rate": 1.2976215510779755e-05, "loss": 1.0702, "step": 634 }, { "epoch": 0.7805777504609711, "grad_norm": 0.4676484167575836, "learning_rate": 1.2838167254794004e-05, "loss": 0.9612, "step": 635 }, { "epoch": 0.7818070067609096, "grad_norm": 0.43727314472198486, "learning_rate": 1.2700749008427205e-05, "loss": 1.171, "step": 636 }, { "epoch": 0.7830362630608482, "grad_norm": 0.6096376180648804, "learning_rate": 1.2563963101349619e-05, "loss": 1.2183, "step": 637 }, { "epoch": 0.7842655193607867, "grad_norm": 0.7051495909690857, "learning_rate": 1.2427811852511395e-05, "loss": 0.7788, "step": 638 }, { "epoch": 0.7854947756607252, "grad_norm": 0.6735429167747498, "learning_rate": 1.2292297570103229e-05, "loss": 0.7001, "step": 639 }, { "epoch": 0.7867240319606638, "grad_norm": 0.740139901638031, "learning_rate": 1.2157422551517228e-05, "loss": 0.8659, "step": 640 }, { "epoch": 0.7879532882606023, "grad_norm": 0.8181758522987366, "learning_rate": 1.202318908330795e-05, "loss": 1.1304, "step": 641 }, { "epoch": 0.7891825445605408, "grad_norm": 1.1049834489822388, "learning_rate": 1.188959944115372e-05, "loss": 1.0549, "step": 642 }, { "epoch": 0.7904118008604795, "grad_norm": 2.664985418319702, "learning_rate": 1.1756655889817953e-05, "loss": 1.2242, "step": 643 }, { "epoch": 0.791641057160418, "grad_norm": 3.3801634311676025, "learning_rate": 1.1624360683110819e-05, "loss": 0.5464, "step": 644 }, { "epoch": 0.7928703134603565, "grad_norm": 3.139307975769043, "learning_rate": 1.1492716063850973e-05, "loss": 0.9791, "step": 645 }, { "epoch": 0.794099569760295, "grad_norm": 3.501133918762207, "learning_rate": 1.1361724263827633e-05, "loss": 1.0201, "step": 646 }, { "epoch": 0.7953288260602336, "grad_norm": 2.9995079040527344, "learning_rate": 1.123138750376262e-05, "loss": 1.0347, "step": 647 }, { "epoch": 0.7965580823601721, "grad_norm": 3.3414182662963867, "learning_rate": 1.1101707993272825e-05, "loss": 0.912, "step": 648 }, { "epoch": 0.7977873386601106, "grad_norm": 2.3033857345581055, "learning_rate": 1.097268793083266e-05, "loss": 0.551, "step": 649 }, { "epoch": 0.7990165949600492, "grad_norm": 2.474606990814209, "learning_rate": 1.084432950373685e-05, "loss": 0.4882, "step": 650 }, { "epoch": 0.8002458512599877, "grad_norm": 0.21653778851032257, "learning_rate": 1.071663488806331e-05, "loss": 1.1304, "step": 651 }, { "epoch": 0.8014751075599262, "grad_norm": 0.19837996363639832, "learning_rate": 1.0589606248636292e-05, "loss": 0.8167, "step": 652 }, { "epoch": 0.8027043638598648, "grad_norm": 0.2383168786764145, "learning_rate": 1.0463245738989636e-05, "loss": 1.0729, "step": 653 }, { "epoch": 0.8039336201598033, "grad_norm": 0.2854044437408447, "learning_rate": 1.0337555501330281e-05, "loss": 1.1909, "step": 654 }, { "epoch": 0.8051628764597418, "grad_norm": 0.2671623229980469, "learning_rate": 1.0212537666501976e-05, "loss": 1.2883, "step": 655 }, { "epoch": 0.8063921327596804, "grad_norm": 0.2792122960090637, "learning_rate": 1.0088194353949137e-05, "loss": 1.0991, "step": 656 }, { "epoch": 0.8076213890596189, "grad_norm": 0.2957600951194763, "learning_rate": 9.96452767168089e-06, "loss": 1.0984, "step": 657 }, { "epoch": 0.8088506453595574, "grad_norm": 0.31414932012557983, "learning_rate": 9.841539716235387e-06, "loss": 1.0569, "step": 658 }, { "epoch": 0.8100799016594961, "grad_norm": 0.3483067452907562, "learning_rate": 9.719232572644187e-06, "loss": 1.0767, "step": 659 }, { "epoch": 0.8113091579594346, "grad_norm": 0.3564923107624054, "learning_rate": 9.597608314396978e-06, "loss": 1.2318, "step": 660 }, { "epoch": 0.8125384142593731, "grad_norm": 0.40873420238494873, "learning_rate": 9.476669003406403e-06, "loss": 1.2112, "step": 661 }, { "epoch": 0.8137676705593117, "grad_norm": 0.4222586452960968, "learning_rate": 9.356416689973108e-06, "loss": 1.066, "step": 662 }, { "epoch": 0.8149969268592502, "grad_norm": 0.5865674614906311, "learning_rate": 9.236853412750935e-06, "loss": 1.0605, "step": 663 }, { "epoch": 0.8162261831591887, "grad_norm": 0.7420896291732788, "learning_rate": 9.11798119871245e-06, "loss": 0.7511, "step": 664 }, { "epoch": 0.8174554394591272, "grad_norm": 0.6792820692062378, "learning_rate": 8.99980206311452e-06, "loss": 0.9264, "step": 665 }, { "epoch": 0.8186846957590658, "grad_norm": 0.6937956213951111, "learning_rate": 8.882318009464125e-06, "loss": 0.6352, "step": 666 }, { "epoch": 0.8199139520590043, "grad_norm": 0.7615432739257812, "learning_rate": 8.765531029484476e-06, "loss": 0.9749, "step": 667 }, { "epoch": 0.8211432083589428, "grad_norm": 1.0325685739517212, "learning_rate": 8.64944310308114e-06, "loss": 1.2544, "step": 668 }, { "epoch": 0.8223724646588814, "grad_norm": 3.312368631362915, "learning_rate": 8.534056198308582e-06, "loss": 1.1743, "step": 669 }, { "epoch": 0.8236017209588199, "grad_norm": 2.5028300285339355, "learning_rate": 8.419372271336745e-06, "loss": 0.6261, "step": 670 }, { "epoch": 0.8248309772587584, "grad_norm": 2.5545358657836914, "learning_rate": 8.305393266417887e-06, "loss": 0.6315, "step": 671 }, { "epoch": 0.826060233558697, "grad_norm": 2.4186129570007324, "learning_rate": 8.192121115853602e-06, "loss": 0.6056, "step": 672 }, { "epoch": 0.8272894898586355, "grad_norm": 2.6672956943511963, "learning_rate": 8.079557739962128e-06, "loss": 0.644, "step": 673 }, { "epoch": 0.828518746158574, "grad_norm": 3.801270008087158, "learning_rate": 7.967705047045715e-06, "loss": 1.0116, "step": 674 }, { "epoch": 0.8297480024585125, "grad_norm": 2.186417579650879, "learning_rate": 7.856564933358324e-06, "loss": 0.4153, "step": 675 }, { "epoch": 0.8309772587584512, "grad_norm": 0.2179417610168457, "learning_rate": 7.746139283073473e-06, "loss": 1.0057, "step": 676 }, { "epoch": 0.8322065150583897, "grad_norm": 0.2457554191350937, "learning_rate": 7.636429968252257e-06, "loss": 1.2948, "step": 677 }, { "epoch": 0.8334357713583282, "grad_norm": 0.2506256401538849, "learning_rate": 7.527438848811652e-06, "loss": 1.1521, "step": 678 }, { "epoch": 0.8346650276582668, "grad_norm": 0.25727221369743347, "learning_rate": 7.4191677724929906e-06, "loss": 1.1952, "step": 679 }, { "epoch": 0.8358942839582053, "grad_norm": 0.265831857919693, "learning_rate": 7.31161857483057e-06, "loss": 1.1952, "step": 680 }, { "epoch": 0.8371235402581438, "grad_norm": 0.27371159195899963, "learning_rate": 7.204793079120636e-06, "loss": 1.1563, "step": 681 }, { "epoch": 0.8383527965580824, "grad_norm": 0.30959704518318176, "learning_rate": 7.0986930963903575e-06, "loss": 1.122, "step": 682 }, { "epoch": 0.8395820528580209, "grad_norm": 0.335997074842453, "learning_rate": 6.993320425367222e-06, "loss": 1.0265, "step": 683 }, { "epoch": 0.8408113091579594, "grad_norm": 0.3063446879386902, "learning_rate": 6.8886768524485e-06, "loss": 0.8463, "step": 684 }, { "epoch": 0.842040565457898, "grad_norm": 0.34628668427467346, "learning_rate": 6.7847641516709635e-06, "loss": 0.9597, "step": 685 }, { "epoch": 0.8432698217578365, "grad_norm": 0.38880714774131775, "learning_rate": 6.681584084680787e-06, "loss": 0.9935, "step": 686 }, { "epoch": 0.844499078057775, "grad_norm": 0.45289453864097595, "learning_rate": 6.579138400703716e-06, "loss": 0.911, "step": 687 }, { "epoch": 0.8457283343577136, "grad_norm": 0.697723388671875, "learning_rate": 6.4774288365154035e-06, "loss": 0.9489, "step": 688 }, { "epoch": 0.8469575906576521, "grad_norm": 0.7846165895462036, "learning_rate": 6.376457116411971e-06, "loss": 0.9332, "step": 689 }, { "epoch": 0.8481868469575906, "grad_norm": 0.6980641484260559, "learning_rate": 6.2762249521807645e-06, "loss": 0.8282, "step": 690 }, { "epoch": 0.8494161032575291, "grad_norm": 0.6430233716964722, "learning_rate": 6.17673404307132e-06, "loss": 0.7686, "step": 691 }, { "epoch": 0.8506453595574678, "grad_norm": 0.7281519174575806, "learning_rate": 6.077986075766612e-06, "loss": 0.9116, "step": 692 }, { "epoch": 0.8518746158574063, "grad_norm": 1.0536761283874512, "learning_rate": 5.979982724354366e-06, "loss": 1.1009, "step": 693 }, { "epoch": 0.8531038721573448, "grad_norm": 1.9653775691986084, "learning_rate": 5.882725650298787e-06, "loss": 1.1204, "step": 694 }, { "epoch": 0.8543331284572834, "grad_norm": 3.2125988006591797, "learning_rate": 5.7862165024123175e-06, "loss": 1.3958, "step": 695 }, { "epoch": 0.8555623847572219, "grad_norm": 3.528750419616699, "learning_rate": 5.690456916827691e-06, "loss": 0.9974, "step": 696 }, { "epoch": 0.8567916410571604, "grad_norm": 3.617098808288574, "learning_rate": 5.5954485169702306e-06, "loss": 1.1263, "step": 697 }, { "epoch": 0.858020897357099, "grad_norm": 2.575523614883423, "learning_rate": 5.501192913530301e-06, "loss": 0.6504, "step": 698 }, { "epoch": 0.8592501536570375, "grad_norm": 3.0249316692352295, "learning_rate": 5.407691704435991e-06, "loss": 0.6346, "step": 699 }, { "epoch": 0.860479409956976, "grad_norm": 2.3461408615112305, "learning_rate": 5.314946474826066e-06, "loss": 0.4619, "step": 700 }, { "epoch": 0.8617086662569146, "grad_norm": 0.2000708281993866, "learning_rate": 5.222958797023036e-06, "loss": 0.9165, "step": 701 }, { "epoch": 0.8629379225568531, "grad_norm": 0.23434923589229584, "learning_rate": 5.13173023050656e-06, "loss": 1.1903, "step": 702 }, { "epoch": 0.8641671788567916, "grad_norm": 0.25243085622787476, "learning_rate": 5.041262321886958e-06, "loss": 1.2185, "step": 703 }, { "epoch": 0.8653964351567301, "grad_norm": 0.25832071900367737, "learning_rate": 4.951556604879048e-06, "loss": 1.2881, "step": 704 }, { "epoch": 0.8666256914566687, "grad_norm": 0.25066548585891724, "learning_rate": 4.862614600276061e-06, "loss": 1.1272, "step": 705 }, { "epoch": 0.8678549477566072, "grad_norm": 0.27538540959358215, "learning_rate": 4.774437815923938e-06, "loss": 1.003, "step": 706 }, { "epoch": 0.8690842040565457, "grad_norm": 0.27678731083869934, "learning_rate": 4.687027746695727e-06, "loss": 1.0579, "step": 707 }, { "epoch": 0.8703134603564844, "grad_norm": 0.3305651843547821, "learning_rate": 4.600385874466256e-06, "loss": 1.0734, "step": 708 }, { "epoch": 0.8715427166564229, "grad_norm": 0.3679453134536743, "learning_rate": 4.514513668087011e-06, "loss": 1.1202, "step": 709 }, { "epoch": 0.8727719729563614, "grad_norm": 0.32898518443107605, "learning_rate": 4.429412583361209e-06, "loss": 1.1176, "step": 710 }, { "epoch": 0.8740012292563, "grad_norm": 0.37726449966430664, "learning_rate": 4.34508406301915e-06, "loss": 1.1331, "step": 711 }, { "epoch": 0.8752304855562385, "grad_norm": 0.44210806488990784, "learning_rate": 4.261529536693737e-06, "loss": 0.9606, "step": 712 }, { "epoch": 0.876459741856177, "grad_norm": 0.5424479246139526, "learning_rate": 4.178750420896255e-06, "loss": 0.9004, "step": 713 }, { "epoch": 0.8776889981561156, "grad_norm": 0.7757157683372498, "learning_rate": 4.0967481189923384e-06, "loss": 0.8007, "step": 714 }, { "epoch": 0.8789182544560541, "grad_norm": 0.6526496410369873, "learning_rate": 4.015524021178196e-06, "loss": 0.7106, "step": 715 }, { "epoch": 0.8801475107559926, "grad_norm": 0.6553987264633179, "learning_rate": 3.935079504457034e-06, "loss": 0.668, "step": 716 }, { "epoch": 0.8813767670559312, "grad_norm": 0.795111358165741, "learning_rate": 3.8554159326157304e-06, "loss": 1.0273, "step": 717 }, { "epoch": 0.8826060233558697, "grad_norm": 1.3710993528366089, "learning_rate": 3.7765346562016744e-06, "loss": 1.0677, "step": 718 }, { "epoch": 0.8838352796558082, "grad_norm": 2.689974308013916, "learning_rate": 3.6984370124999058e-06, "loss": 1.3152, "step": 719 }, { "epoch": 0.8850645359557467, "grad_norm": 3.0526108741760254, "learning_rate": 3.621124325510422e-06, "loss": 0.7108, "step": 720 }, { "epoch": 0.8862937922556853, "grad_norm": 2.8882553577423096, "learning_rate": 3.5445979059257505e-06, "loss": 0.7952, "step": 721 }, { "epoch": 0.8875230485556238, "grad_norm": 2.545741319656372, "learning_rate": 3.4688590511087304e-06, "loss": 0.5186, "step": 722 }, { "epoch": 0.8887523048555623, "grad_norm": 3.898621082305908, "learning_rate": 3.3939090450704925e-06, "loss": 0.921, "step": 723 }, { "epoch": 0.8899815611555009, "grad_norm": 2.621055841445923, "learning_rate": 3.3197491584487093e-06, "loss": 0.7413, "step": 724 }, { "epoch": 0.8912108174554395, "grad_norm": 2.9799768924713135, "learning_rate": 3.246380648486058e-06, "loss": 0.5268, "step": 725 }, { "epoch": 0.892440073755378, "grad_norm": 0.2122289389371872, "learning_rate": 3.1738047590088803e-06, "loss": 1.0725, "step": 726 }, { "epoch": 0.8936693300553166, "grad_norm": 0.2591450810432434, "learning_rate": 3.10202272040615e-06, "loss": 1.4448, "step": 727 }, { "epoch": 0.8948985863552551, "grad_norm": 0.22906170785427094, "learning_rate": 3.0310357496085405e-06, "loss": 1.0041, "step": 728 }, { "epoch": 0.8961278426551936, "grad_norm": 0.2589828073978424, "learning_rate": 2.9608450500678565e-06, "loss": 1.2252, "step": 729 }, { "epoch": 0.8973570989551322, "grad_norm": 0.27138885855674744, "learning_rate": 2.8914518117366006e-06, "loss": 1.2409, "step": 730 }, { "epoch": 0.8985863552550707, "grad_norm": 0.2877761423587799, "learning_rate": 2.8228572110478133e-06, "loss": 1.1005, "step": 731 }, { "epoch": 0.8998156115550092, "grad_norm": 0.3168583810329437, "learning_rate": 2.755062410895104e-06, "loss": 0.9683, "step": 732 }, { "epoch": 0.9010448678549478, "grad_norm": 0.3234322667121887, "learning_rate": 2.6880685606129664e-06, "loss": 0.979, "step": 733 }, { "epoch": 0.9022741241548863, "grad_norm": 0.33790323138237, "learning_rate": 2.62187679595729e-06, "loss": 1.1439, "step": 734 }, { "epoch": 0.9035033804548248, "grad_norm": 0.34143367409706116, "learning_rate": 2.55648823908608e-06, "loss": 1.1412, "step": 735 }, { "epoch": 0.9047326367547633, "grad_norm": 0.41084036231040955, "learning_rate": 2.4919039985404626e-06, "loss": 1.1742, "step": 736 }, { "epoch": 0.9059618930547019, "grad_norm": 0.5593938231468201, "learning_rate": 2.428125169225881e-06, "loss": 1.0794, "step": 737 }, { "epoch": 0.9071911493546404, "grad_norm": 0.7228390574455261, "learning_rate": 2.36515283239353e-06, "loss": 0.7684, "step": 738 }, { "epoch": 0.9084204056545789, "grad_norm": 0.6799226999282837, "learning_rate": 2.3029880556220074e-06, "loss": 1.0537, "step": 739 }, { "epoch": 0.9096496619545175, "grad_norm": 0.6442016959190369, "learning_rate": 2.241631892799262e-06, "loss": 0.498, "step": 740 }, { "epoch": 0.9108789182544561, "grad_norm": 0.8138104677200317, "learning_rate": 2.181085384104703e-06, "loss": 0.9415, "step": 741 }, { "epoch": 0.9121081745543946, "grad_norm": 0.864895224571228, "learning_rate": 2.121349555991525e-06, "loss": 1.0131, "step": 742 }, { "epoch": 0.9133374308543332, "grad_norm": 1.3241431713104248, "learning_rate": 2.0624254211693894e-06, "loss": 1.0531, "step": 743 }, { "epoch": 0.9145666871542717, "grad_norm": 2.810883045196533, "learning_rate": 2.004313978587186e-06, "loss": 0.8746, "step": 744 }, { "epoch": 0.9157959434542102, "grad_norm": 3.3806657791137695, "learning_rate": 1.9470162134161143e-06, "loss": 0.804, "step": 745 }, { "epoch": 0.9170251997541488, "grad_norm": 3.9051411151885986, "learning_rate": 1.8905330970330259e-06, "loss": 1.1265, "step": 746 }, { "epoch": 0.9182544560540873, "grad_norm": 2.50091290473938, "learning_rate": 1.83486558700387e-06, "loss": 0.8042, "step": 747 }, { "epoch": 0.9194837123540258, "grad_norm": 2.444403648376465, "learning_rate": 1.78001462706755e-06, "loss": 0.7185, "step": 748 }, { "epoch": 0.9207129686539643, "grad_norm": 2.565976619720459, "learning_rate": 1.7259811471198706e-06, "loss": 0.5854, "step": 749 }, { "epoch": 0.9219422249539029, "grad_norm": 3.5841028690338135, "learning_rate": 1.6727660631977893e-06, "loss": 1.1298, "step": 750 }, { "epoch": 0.9231714812538414, "grad_norm": 0.18373464047908783, "learning_rate": 1.620370277463884e-06, "loss": 0.798, "step": 751 }, { "epoch": 0.9244007375537799, "grad_norm": 0.23574425280094147, "learning_rate": 1.5687946781910378e-06, "loss": 1.1075, "step": 752 }, { "epoch": 0.9256299938537185, "grad_norm": 0.25111323595046997, "learning_rate": 1.5180401397474343e-06, "loss": 1.2677, "step": 753 }, { "epoch": 0.926859250153657, "grad_norm": 0.23998814821243286, "learning_rate": 1.4681075225816854e-06, "loss": 1.0985, "step": 754 }, { "epoch": 0.9280885064535955, "grad_norm": 0.2664302587509155, "learning_rate": 1.4189976732082666e-06, "loss": 1.0119, "step": 755 }, { "epoch": 0.9293177627535341, "grad_norm": 0.28511667251586914, "learning_rate": 1.3707114241931328e-06, "loss": 1.2079, "step": 756 }, { "epoch": 0.9305470190534727, "grad_norm": 0.278188556432724, "learning_rate": 1.3232495941396639e-06, "loss": 1.1176, "step": 757 }, { "epoch": 0.9317762753534112, "grad_norm": 0.3081459105014801, "learning_rate": 1.2766129876747413e-06, "loss": 0.9467, "step": 758 }, { "epoch": 0.9330055316533498, "grad_norm": 0.34005647897720337, "learning_rate": 1.2308023954351043e-06, "loss": 1.0071, "step": 759 }, { "epoch": 0.9342347879532883, "grad_norm": 0.3434988856315613, "learning_rate": 1.1858185940539779e-06, "loss": 1.1407, "step": 760 }, { "epoch": 0.9354640442532268, "grad_norm": 0.38441202044487, "learning_rate": 1.1416623461478704e-06, "loss": 1.316, "step": 761 }, { "epoch": 0.9366933005531654, "grad_norm": 0.42989110946655273, "learning_rate": 1.0983344003036912e-06, "loss": 1.2446, "step": 762 }, { "epoch": 0.9379225568531039, "grad_norm": 0.4977233111858368, "learning_rate": 1.055835491066004e-06, "loss": 0.7431, "step": 763 }, { "epoch": 0.9391518131530424, "grad_norm": 0.6383669376373291, "learning_rate": 1.014166338924627e-06, "loss": 0.5211, "step": 764 }, { "epoch": 0.9403810694529809, "grad_norm": 0.7088152766227722, "learning_rate": 9.733276503023692e-07, "loss": 0.537, "step": 765 }, { "epoch": 0.9416103257529195, "grad_norm": 0.6624244451522827, "learning_rate": 9.33320117543085e-07, "loss": 0.8375, "step": 766 }, { "epoch": 0.942839582052858, "grad_norm": 0.7807971239089966, "learning_rate": 8.941444188999393e-07, "loss": 0.7689, "step": 767 }, { "epoch": 0.9440688383527965, "grad_norm": 1.2817997932434082, "learning_rate": 8.558012185238939e-07, "loss": 1.1247, "step": 768 }, { "epoch": 0.9452980946527351, "grad_norm": 3.324708938598633, "learning_rate": 8.182911664524562e-07, "loss": 1.0981, "step": 769 }, { "epoch": 0.9465273509526736, "grad_norm": 3.574130058288574, "learning_rate": 7.816148985986483e-07, "loss": 0.6504, "step": 770 }, { "epoch": 0.9477566072526121, "grad_norm": 3.618056297302246, "learning_rate": 7.457730367402549e-07, "loss": 0.6916, "step": 771 }, { "epoch": 0.9489858635525507, "grad_norm": 2.9001710414886475, "learning_rate": 7.107661885092321e-07, "loss": 0.8261, "step": 772 }, { "epoch": 0.9502151198524893, "grad_norm": 2.4319024085998535, "learning_rate": 6.765949473814648e-07, "loss": 0.9126, "step": 773 }, { "epoch": 0.9514443761524278, "grad_norm": 3.393937110900879, "learning_rate": 6.432598926666589e-07, "loss": 0.8197, "step": 774 }, { "epoch": 0.9526736324523664, "grad_norm": 3.406257390975952, "learning_rate": 6.107615894985375e-07, "loss": 0.9015, "step": 775 }, { "epoch": 0.9539028887523049, "grad_norm": 0.21785248816013336, "learning_rate": 5.791005888252765e-07, "loss": 0.9666, "step": 776 }, { "epoch": 0.9551321450522434, "grad_norm": 0.22744600474834442, "learning_rate": 5.482774274001401e-07, "loss": 1.1927, "step": 777 }, { "epoch": 0.956361401352182, "grad_norm": 0.26558175683021545, "learning_rate": 5.18292627772382e-07, "loss": 1.3098, "step": 778 }, { "epoch": 0.9575906576521205, "grad_norm": 0.25804319977760315, "learning_rate": 4.891466982783977e-07, "loss": 1.2578, "step": 779 }, { "epoch": 0.958819913952059, "grad_norm": 0.2447444498538971, "learning_rate": 4.60840133033108e-07, "loss": 0.9595, "step": 780 }, { "epoch": 0.9600491702519975, "grad_norm": 0.2831968069076538, "learning_rate": 4.3337341192157265e-07, "loss": 1.0676, "step": 781 }, { "epoch": 0.9612784265519361, "grad_norm": 0.2845923602581024, "learning_rate": 4.067470005908625e-07, "loss": 0.9645, "step": 782 }, { "epoch": 0.9625076828518746, "grad_norm": 0.3282490372657776, "learning_rate": 3.809613504421661e-07, "loss": 0.8996, "step": 783 }, { "epoch": 0.9637369391518131, "grad_norm": 0.3805837631225586, "learning_rate": 3.5601689862311826e-07, "loss": 0.9756, "step": 784 }, { "epoch": 0.9649661954517517, "grad_norm": 0.3583146631717682, "learning_rate": 3.3191406802041693e-07, "loss": 1.0574, "step": 785 }, { "epoch": 0.9661954517516902, "grad_norm": 0.4697728753089905, "learning_rate": 3.0865326725263435e-07, "loss": 1.4301, "step": 786 }, { "epoch": 0.9674247080516287, "grad_norm": 0.6786931157112122, "learning_rate": 2.8623489066329503e-07, "loss": 1.0623, "step": 787 }, { "epoch": 0.9686539643515673, "grad_norm": 0.6815045475959778, "learning_rate": 2.646593183142088e-07, "loss": 0.7782, "step": 788 }, { "epoch": 0.9698832206515058, "grad_norm": 0.7787659764289856, "learning_rate": 2.4392691597898143e-07, "loss": 0.8097, "step": 789 }, { "epoch": 0.9711124769514444, "grad_norm": 0.7595354318618774, "learning_rate": 2.2403803513686428e-07, "loss": 0.9502, "step": 790 }, { "epoch": 0.972341733251383, "grad_norm": 0.8005998730659485, "learning_rate": 2.0499301296676432e-07, "loss": 0.7545, "step": 791 }, { "epoch": 0.9735709895513215, "grad_norm": 1.0289498567581177, "learning_rate": 1.8679217234154334e-07, "loss": 1.2025, "step": 792 }, { "epoch": 0.97480024585126, "grad_norm": 1.7563585042953491, "learning_rate": 1.6943582182253336e-07, "loss": 1.1175, "step": 793 }, { "epoch": 0.9760295021511985, "grad_norm": 3.998744487762451, "learning_rate": 1.5292425565430757e-07, "loss": 1.2125, "step": 794 }, { "epoch": 0.9772587584511371, "grad_norm": 2.2866718769073486, "learning_rate": 1.372577537597064e-07, "loss": 0.3866, "step": 795 }, { "epoch": 0.9784880147510756, "grad_norm": 3.594325542449951, "learning_rate": 1.224365817350692e-07, "loss": 0.8022, "step": 796 }, { "epoch": 0.9797172710510141, "grad_norm": 2.57466197013855, "learning_rate": 1.0846099084574346e-07, "loss": 0.8229, "step": 797 }, { "epoch": 0.9809465273509527, "grad_norm": 2.2920215129852295, "learning_rate": 9.533121802183797e-08, "loss": 0.509, "step": 798 }, { "epoch": 0.9821757836508912, "grad_norm": 2.825840711593628, "learning_rate": 8.304748585417078e-08, "loss": 0.9473, "step": 799 }, { "epoch": 0.9834050399508297, "grad_norm": 3.6677334308624268, "learning_rate": 7.161000259053308e-08, "loss": 0.7938, "step": 800 }, { "epoch": 0.9846342962507683, "grad_norm": 0.2495596706867218, "learning_rate": 6.10189621321422e-08, "loss": 1.2466, "step": 801 }, { "epoch": 0.9858635525507068, "grad_norm": 0.2586044371128082, "learning_rate": 5.127454403034415e-08, "loss": 1.3163, "step": 802 }, { "epoch": 0.9870928088506453, "grad_norm": 0.2908371388912201, "learning_rate": 4.2376913483599404e-08, "loss": 1.2477, "step": 803 }, { "epoch": 0.9883220651505839, "grad_norm": 0.3172759413719177, "learning_rate": 3.4326221334640695e-08, "loss": 1.1196, "step": 804 }, { "epoch": 0.9895513214505224, "grad_norm": 0.33819660544395447, "learning_rate": 2.712260406795286e-08, "loss": 1.0077, "step": 805 }, { "epoch": 0.990780577750461, "grad_norm": 0.353371262550354, "learning_rate": 2.076618380744133e-08, "loss": 1.0194, "step": 806 }, { "epoch": 0.9920098340503996, "grad_norm": 0.4687137007713318, "learning_rate": 1.525706831437268e-08, "loss": 1.199, "step": 807 }, { "epoch": 0.9932390903503381, "grad_norm": 0.6552625298500061, "learning_rate": 1.0595350985526109e-08, "loss": 0.5601, "step": 808 }, { "epoch": 0.9944683466502766, "grad_norm": 0.6662526726722717, "learning_rate": 6.781110851633576e-09, "loss": 0.8824, "step": 809 }, { "epoch": 0.9956976029502151, "grad_norm": 1.0151597261428833, "learning_rate": 3.814412576025328e-09, "loss": 1.1593, "step": 810 }, { "epoch": 0.9969268592501537, "grad_norm": 2.71447491645813, "learning_rate": 1.6953064535474295e-09, "loss": 0.7048, "step": 811 }, { "epoch": 0.9981561155500922, "grad_norm": 3.1842808723449707, "learning_rate": 4.238284096902412e-10, "loss": 0.8335, "step": 812 }, { "epoch": 0.9993853718500307, "grad_norm": 2.6523120403289795, "learning_rate": 0.0, "loss": 0.6896, "step": 813 } ], "logging_steps": 1, "max_steps": 813, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 204, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6756836906985062e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }