{ "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 71610, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04189359028068706, "grad_norm": 1.1499052047729492, "learning_rate": 4.993017734953219e-05, "loss": 9.973, "step": 100 }, { "epoch": 0.08378718056137412, "grad_norm": 1.1211209297180176, "learning_rate": 4.986035469906438e-05, "loss": 9.315, "step": 200 }, { "epoch": 0.12568077084206117, "grad_norm": 1.0281990766525269, "learning_rate": 4.979053204859657e-05, "loss": 8.8239, "step": 300 }, { "epoch": 0.16757436112274823, "grad_norm": 1.1534616947174072, "learning_rate": 4.972070939812876e-05, "loss": 8.4254, "step": 400 }, { "epoch": 0.20946795140343527, "grad_norm": 0.9906216859817505, "learning_rate": 4.9650886747660944e-05, "loss": 8.1225, "step": 500 }, { "epoch": 0.25136154168412234, "grad_norm": 0.9559820294380188, "learning_rate": 4.958106409719313e-05, "loss": 7.8557, "step": 600 }, { "epoch": 0.2932551319648094, "grad_norm": 0.7887147665023804, "learning_rate": 4.951124144672532e-05, "loss": 7.7282, "step": 700 }, { "epoch": 0.33514872224549647, "grad_norm": 0.6530473232269287, "learning_rate": 4.944141879625751e-05, "loss": 7.6586, "step": 800 }, { "epoch": 0.3770423125261835, "grad_norm": 0.6546779870986938, "learning_rate": 4.9371596145789694e-05, "loss": 7.6438, "step": 900 }, { "epoch": 0.41893590280687054, "grad_norm": 0.5660322308540344, "learning_rate": 4.9301773495321885e-05, "loss": 7.6237, "step": 1000 }, { "epoch": 0.4608294930875576, "grad_norm": 0.6094719171524048, "learning_rate": 4.9231950844854076e-05, "loss": 7.5937, "step": 1100 }, { "epoch": 0.5027230833682447, "grad_norm": 0.6101346015930176, "learning_rate": 4.916212819438626e-05, "loss": 7.5661, "step": 1200 }, { "epoch": 0.5446166736489317, "grad_norm": 0.5943477749824524, "learning_rate": 4.909230554391845e-05, "loss": 7.6007, "step": 1300 }, { "epoch": 0.5865102639296188, "grad_norm": 0.6604776382446289, "learning_rate": 4.902248289345064e-05, "loss": 7.568, "step": 1400 }, { "epoch": 0.6284038542103059, "grad_norm": 0.6151777505874634, "learning_rate": 4.8952660242982826e-05, "loss": 7.5521, "step": 1500 }, { "epoch": 0.6702974444909929, "grad_norm": 0.6381381750106812, "learning_rate": 4.888283759251501e-05, "loss": 7.5478, "step": 1600 }, { "epoch": 0.7121910347716799, "grad_norm": 0.6552081108093262, "learning_rate": 4.88130149420472e-05, "loss": 7.5293, "step": 1700 }, { "epoch": 0.754084625052367, "grad_norm": 0.6973659992218018, "learning_rate": 4.874319229157939e-05, "loss": 7.4983, "step": 1800 }, { "epoch": 0.795978215333054, "grad_norm": 0.8130584955215454, "learning_rate": 4.8673369641111576e-05, "loss": 7.5213, "step": 1900 }, { "epoch": 0.8378718056137411, "grad_norm": 0.8530446887016296, "learning_rate": 4.860354699064377e-05, "loss": 7.5105, "step": 2000 }, { "epoch": 0.8797653958944281, "grad_norm": 0.8059477210044861, "learning_rate": 4.853372434017596e-05, "loss": 7.508, "step": 2100 }, { "epoch": 0.9216589861751152, "grad_norm": 0.7378331422805786, "learning_rate": 4.846390168970814e-05, "loss": 7.482, "step": 2200 }, { "epoch": 0.9635525764558023, "grad_norm": 1.1823194026947021, "learning_rate": 4.839407903924033e-05, "loss": 7.4778, "step": 2300 }, { "epoch": 1.0054461667364893, "grad_norm": 0.8547298908233643, "learning_rate": 4.832425638877252e-05, "loss": 7.4529, "step": 2400 }, { "epoch": 1.0473397570171763, "grad_norm": 0.8535734415054321, "learning_rate": 4.825443373830471e-05, "loss": 7.4374, "step": 2500 }, { "epoch": 1.0892333472978635, "grad_norm": 0.994597852230072, "learning_rate": 4.818461108783689e-05, "loss": 7.4306, "step": 2600 }, { "epoch": 1.1311269375785504, "grad_norm": 1.2056490182876587, "learning_rate": 4.8114788437369084e-05, "loss": 7.4322, "step": 2700 }, { "epoch": 1.1730205278592376, "grad_norm": 1.2451157569885254, "learning_rate": 4.8044965786901275e-05, "loss": 7.4205, "step": 2800 }, { "epoch": 1.2149141181399246, "grad_norm": 0.9964780211448669, "learning_rate": 4.797514313643346e-05, "loss": 7.4031, "step": 2900 }, { "epoch": 1.2568077084206117, "grad_norm": 0.8989804983139038, "learning_rate": 4.790532048596565e-05, "loss": 7.4043, "step": 3000 }, { "epoch": 1.2987012987012987, "grad_norm": 1.1330469846725464, "learning_rate": 4.783549783549784e-05, "loss": 7.4031, "step": 3100 }, { "epoch": 1.3405948889819856, "grad_norm": 0.9531299471855164, "learning_rate": 4.7765675185030025e-05, "loss": 7.3866, "step": 3200 }, { "epoch": 1.3824884792626728, "grad_norm": 1.0342323780059814, "learning_rate": 4.7695852534562216e-05, "loss": 7.3477, "step": 3300 }, { "epoch": 1.42438206954336, "grad_norm": 1.0523111820220947, "learning_rate": 4.76260298840944e-05, "loss": 7.3726, "step": 3400 }, { "epoch": 1.466275659824047, "grad_norm": 1.298751711845398, "learning_rate": 4.755620723362659e-05, "loss": 7.3484, "step": 3500 }, { "epoch": 1.508169250104734, "grad_norm": 1.0065233707427979, "learning_rate": 4.7486384583158775e-05, "loss": 7.3567, "step": 3600 }, { "epoch": 1.550062840385421, "grad_norm": 1.2989579439163208, "learning_rate": 4.7416561932690966e-05, "loss": 7.3275, "step": 3700 }, { "epoch": 1.591956430666108, "grad_norm": 1.0343406200408936, "learning_rate": 4.734673928222316e-05, "loss": 7.3353, "step": 3800 }, { "epoch": 1.6338500209467952, "grad_norm": 0.9944115281105042, "learning_rate": 4.727691663175534e-05, "loss": 7.3304, "step": 3900 }, { "epoch": 1.6757436112274822, "grad_norm": 1.102974534034729, "learning_rate": 4.720709398128753e-05, "loss": 7.3128, "step": 4000 }, { "epoch": 1.7176372015081691, "grad_norm": 1.112282156944275, "learning_rate": 4.713727133081972e-05, "loss": 7.333, "step": 4100 }, { "epoch": 1.7595307917888563, "grad_norm": 1.2143328189849854, "learning_rate": 4.7067448680351914e-05, "loss": 7.3342, "step": 4200 }, { "epoch": 1.8014243820695435, "grad_norm": 1.1656922101974487, "learning_rate": 4.69976260298841e-05, "loss": 7.2995, "step": 4300 }, { "epoch": 1.8433179723502304, "grad_norm": 1.2085694074630737, "learning_rate": 4.692780337941628e-05, "loss": 7.2939, "step": 4400 }, { "epoch": 1.8852115626309174, "grad_norm": 1.1366217136383057, "learning_rate": 4.685798072894847e-05, "loss": 7.3079, "step": 4500 }, { "epoch": 1.9271051529116046, "grad_norm": 1.5368098020553589, "learning_rate": 4.678815807848066e-05, "loss": 7.2682, "step": 4600 }, { "epoch": 1.9689987431922917, "grad_norm": 1.1548763513565063, "learning_rate": 4.671833542801285e-05, "loss": 7.2708, "step": 4700 }, { "epoch": 2.0108923334729787, "grad_norm": 1.1510928869247437, "learning_rate": 4.664851277754504e-05, "loss": 7.2794, "step": 4800 }, { "epoch": 2.0527859237536656, "grad_norm": 1.103461503982544, "learning_rate": 4.6578690127077224e-05, "loss": 7.2612, "step": 4900 }, { "epoch": 2.0946795140343526, "grad_norm": 1.2767215967178345, "learning_rate": 4.6508867476609414e-05, "loss": 7.2436, "step": 5000 }, { "epoch": 2.13657310431504, "grad_norm": 1.3062710762023926, "learning_rate": 4.6439044826141605e-05, "loss": 7.2162, "step": 5100 }, { "epoch": 2.178466694595727, "grad_norm": 1.2461299896240234, "learning_rate": 4.6369222175673796e-05, "loss": 7.2387, "step": 5200 }, { "epoch": 2.220360284876414, "grad_norm": 1.5427358150482178, "learning_rate": 4.629939952520598e-05, "loss": 7.2004, "step": 5300 }, { "epoch": 2.262253875157101, "grad_norm": 1.390331506729126, "learning_rate": 4.6229576874738165e-05, "loss": 7.2403, "step": 5400 }, { "epoch": 2.3041474654377883, "grad_norm": 1.4087032079696655, "learning_rate": 4.6159754224270356e-05, "loss": 7.2157, "step": 5500 }, { "epoch": 2.346041055718475, "grad_norm": 1.2417359352111816, "learning_rate": 4.608993157380254e-05, "loss": 7.2277, "step": 5600 }, { "epoch": 2.387934645999162, "grad_norm": 1.4267281293869019, "learning_rate": 4.602010892333473e-05, "loss": 7.1999, "step": 5700 }, { "epoch": 2.429828236279849, "grad_norm": 1.3897684812545776, "learning_rate": 4.595028627286692e-05, "loss": 7.2155, "step": 5800 }, { "epoch": 2.471721826560536, "grad_norm": 1.326821208000183, "learning_rate": 4.5880463622399106e-05, "loss": 7.1705, "step": 5900 }, { "epoch": 2.5136154168412235, "grad_norm": 1.2585749626159668, "learning_rate": 4.58106409719313e-05, "loss": 7.1787, "step": 6000 }, { "epoch": 2.5555090071219104, "grad_norm": 1.4856244325637817, "learning_rate": 4.574081832146349e-05, "loss": 7.1923, "step": 6100 }, { "epoch": 2.5974025974025974, "grad_norm": 1.2883421182632446, "learning_rate": 4.567099567099568e-05, "loss": 7.1776, "step": 6200 }, { "epoch": 2.6392961876832843, "grad_norm": 1.4935518503189087, "learning_rate": 4.560117302052786e-05, "loss": 7.1711, "step": 6300 }, { "epoch": 2.6811897779639713, "grad_norm": 1.3920152187347412, "learning_rate": 4.553135037006005e-05, "loss": 7.1292, "step": 6400 }, { "epoch": 2.7230833682446587, "grad_norm": 1.2802495956420898, "learning_rate": 4.546152771959224e-05, "loss": 7.1558, "step": 6500 }, { "epoch": 2.7649769585253456, "grad_norm": 1.4111789464950562, "learning_rate": 4.5392403295629106e-05, "loss": 7.172, "step": 6600 }, { "epoch": 2.8068705488060326, "grad_norm": 1.6390964984893799, "learning_rate": 4.53225806451613e-05, "loss": 7.1263, "step": 6700 }, { "epoch": 2.84876413908672, "grad_norm": 1.4132812023162842, "learning_rate": 4.525275799469348e-05, "loss": 7.1259, "step": 6800 }, { "epoch": 2.890657729367407, "grad_norm": 1.4943978786468506, "learning_rate": 4.518293534422567e-05, "loss": 7.1252, "step": 6900 }, { "epoch": 2.932551319648094, "grad_norm": 1.3022414445877075, "learning_rate": 4.5113112693757856e-05, "loss": 7.107, "step": 7000 }, { "epoch": 2.974444909928781, "grad_norm": 1.4270446300506592, "learning_rate": 4.504329004329004e-05, "loss": 7.1278, "step": 7100 }, { "epoch": 3.016338500209468, "grad_norm": 1.3672137260437012, "learning_rate": 4.497346739282223e-05, "loss": 7.1282, "step": 7200 }, { "epoch": 3.058232090490155, "grad_norm": 1.955368995666504, "learning_rate": 4.490364474235442e-05, "loss": 7.1123, "step": 7300 }, { "epoch": 3.100125680770842, "grad_norm": 1.3990498781204224, "learning_rate": 4.483382209188661e-05, "loss": 7.1117, "step": 7400 }, { "epoch": 3.142019271051529, "grad_norm": 1.6294671297073364, "learning_rate": 4.47639994414188e-05, "loss": 7.0721, "step": 7500 }, { "epoch": 3.183912861332216, "grad_norm": 1.3939063549041748, "learning_rate": 4.469417679095099e-05, "loss": 7.0599, "step": 7600 }, { "epoch": 3.225806451612903, "grad_norm": 1.918155312538147, "learning_rate": 4.462435414048318e-05, "loss": 7.0759, "step": 7700 }, { "epoch": 3.2677000418935904, "grad_norm": 1.3072093725204468, "learning_rate": 4.455453149001536e-05, "loss": 7.0784, "step": 7800 }, { "epoch": 3.3095936321742774, "grad_norm": 1.3794573545455933, "learning_rate": 4.4484708839547554e-05, "loss": 7.0726, "step": 7900 }, { "epoch": 3.3514872224549643, "grad_norm": 1.4223533868789673, "learning_rate": 4.441488618907974e-05, "loss": 7.0513, "step": 8000 }, { "epoch": 3.3933808127356513, "grad_norm": 1.695520043373108, "learning_rate": 4.434506353861192e-05, "loss": 7.0586, "step": 8100 }, { "epoch": 3.4352744030163387, "grad_norm": 2.290275812149048, "learning_rate": 4.4275240888144113e-05, "loss": 7.0429, "step": 8200 }, { "epoch": 3.4771679932970256, "grad_norm": 1.5322943925857544, "learning_rate": 4.4205418237676304e-05, "loss": 7.0196, "step": 8300 }, { "epoch": 3.5190615835777126, "grad_norm": 1.4767639636993408, "learning_rate": 4.4135595587208495e-05, "loss": 7.0643, "step": 8400 }, { "epoch": 3.5609551738583995, "grad_norm": 1.4343881607055664, "learning_rate": 4.406577293674068e-05, "loss": 7.0153, "step": 8500 }, { "epoch": 3.602848764139087, "grad_norm": 1.7641897201538086, "learning_rate": 4.399664851277755e-05, "loss": 7.013, "step": 8600 }, { "epoch": 3.644742354419774, "grad_norm": 1.9688279628753662, "learning_rate": 4.392682586230973e-05, "loss": 6.9985, "step": 8700 }, { "epoch": 3.686635944700461, "grad_norm": 1.7434871196746826, "learning_rate": 4.385700321184192e-05, "loss": 7.0142, "step": 8800 }, { "epoch": 3.728529534981148, "grad_norm": 1.550470232963562, "learning_rate": 4.378718056137411e-05, "loss": 6.9975, "step": 8900 }, { "epoch": 3.7704231252618348, "grad_norm": 1.759869933128357, "learning_rate": 4.37173579109063e-05, "loss": 6.9821, "step": 9000 }, { "epoch": 3.812316715542522, "grad_norm": 2.052905797958374, "learning_rate": 4.364753526043849e-05, "loss": 6.9917, "step": 9100 }, { "epoch": 3.854210305823209, "grad_norm": 1.8859872817993164, "learning_rate": 4.357771260997068e-05, "loss": 6.9934, "step": 9200 }, { "epoch": 3.896103896103896, "grad_norm": 1.8349354267120361, "learning_rate": 4.3507889959502863e-05, "loss": 6.9861, "step": 9300 }, { "epoch": 3.937997486384583, "grad_norm": 2.519893169403076, "learning_rate": 4.3438067309035054e-05, "loss": 6.9611, "step": 9400 }, { "epoch": 3.97989107666527, "grad_norm": 1.506072759628296, "learning_rate": 4.336824465856724e-05, "loss": 6.9688, "step": 9500 }, { "epoch": 4.021784666945957, "grad_norm": 1.5342004299163818, "learning_rate": 4.329842200809943e-05, "loss": 6.9582, "step": 9600 }, { "epoch": 4.063678257226645, "grad_norm": 1.6476861238479614, "learning_rate": 4.3228599357631614e-05, "loss": 6.9712, "step": 9700 }, { "epoch": 4.105571847507331, "grad_norm": 2.112595319747925, "learning_rate": 4.3158776707163805e-05, "loss": 6.9568, "step": 9800 }, { "epoch": 4.147465437788019, "grad_norm": 2.390101194381714, "learning_rate": 4.3088954056695996e-05, "loss": 6.9501, "step": 9900 }, { "epoch": 4.189359028068705, "grad_norm": 1.900177240371704, "learning_rate": 4.301913140622818e-05, "loss": 6.935, "step": 10000 }, { "epoch": 4.231252618349393, "grad_norm": 1.7032443284988403, "learning_rate": 4.294930875576037e-05, "loss": 6.9343, "step": 10100 }, { "epoch": 4.27314620863008, "grad_norm": 1.8393847942352295, "learning_rate": 4.287948610529256e-05, "loss": 6.8927, "step": 10200 }, { "epoch": 4.3150397989107665, "grad_norm": 2.046727180480957, "learning_rate": 4.280966345482475e-05, "loss": 6.9156, "step": 10300 }, { "epoch": 4.356933389191454, "grad_norm": 1.832216501235962, "learning_rate": 4.273984080435694e-05, "loss": 6.89, "step": 10400 }, { "epoch": 4.39882697947214, "grad_norm": 1.8682448863983154, "learning_rate": 4.267001815388912e-05, "loss": 6.8995, "step": 10500 }, { "epoch": 4.440720569752828, "grad_norm": 2.0732340812683105, "learning_rate": 4.260089372992599e-05, "loss": 6.9094, "step": 10600 }, { "epoch": 4.482614160033515, "grad_norm": 1.6016206741333008, "learning_rate": 4.253107107945818e-05, "loss": 6.913, "step": 10700 }, { "epoch": 4.524507750314202, "grad_norm": 2.063062906265259, "learning_rate": 4.246124842899037e-05, "loss": 6.8943, "step": 10800 }, { "epoch": 4.566401340594889, "grad_norm": 1.9563026428222656, "learning_rate": 4.2391425778522555e-05, "loss": 6.8986, "step": 10900 }, { "epoch": 4.6082949308755765, "grad_norm": 1.8872498273849487, "learning_rate": 4.2321603128054746e-05, "loss": 6.8883, "step": 11000 }, { "epoch": 4.650188521156263, "grad_norm": 2.1376144886016846, "learning_rate": 4.225178047758693e-05, "loss": 6.8716, "step": 11100 }, { "epoch": 4.69208211143695, "grad_norm": 1.938679575920105, "learning_rate": 4.218195782711912e-05, "loss": 6.8836, "step": 11200 }, { "epoch": 4.733975701717637, "grad_norm": 1.9372957944869995, "learning_rate": 4.2112135176651305e-05, "loss": 6.8925, "step": 11300 }, { "epoch": 4.775869291998324, "grad_norm": 2.716827630996704, "learning_rate": 4.2042312526183496e-05, "loss": 6.8284, "step": 11400 }, { "epoch": 4.817762882279011, "grad_norm": 1.942700743675232, "learning_rate": 4.197248987571568e-05, "loss": 6.8753, "step": 11500 }, { "epoch": 4.859656472559698, "grad_norm": 2.026385545730591, "learning_rate": 4.190266722524787e-05, "loss": 6.8707, "step": 11600 }, { "epoch": 4.901550062840386, "grad_norm": 1.7594517469406128, "learning_rate": 4.183284457478006e-05, "loss": 6.8427, "step": 11700 }, { "epoch": 4.943443653121072, "grad_norm": 1.8161801099777222, "learning_rate": 4.176302192431225e-05, "loss": 6.8519, "step": 11800 }, { "epoch": 4.9853372434017595, "grad_norm": 2.6034481525421143, "learning_rate": 4.169319927384444e-05, "loss": 6.8448, "step": 11900 }, { "epoch": 5.027230833682447, "grad_norm": 1.93776535987854, "learning_rate": 4.162337662337663e-05, "loss": 6.8071, "step": 12000 }, { "epoch": 5.0691244239631335, "grad_norm": 2.0754964351654053, "learning_rate": 4.155355397290881e-05, "loss": 6.8386, "step": 12100 }, { "epoch": 5.111018014243821, "grad_norm": 2.0640342235565186, "learning_rate": 4.1483731322440996e-05, "loss": 6.8402, "step": 12200 }, { "epoch": 5.152911604524507, "grad_norm": 1.8218064308166504, "learning_rate": 4.141390867197319e-05, "loss": 6.8153, "step": 12300 }, { "epoch": 5.194805194805195, "grad_norm": 2.0181634426116943, "learning_rate": 4.134408602150538e-05, "loss": 6.8104, "step": 12400 }, { "epoch": 5.236698785085882, "grad_norm": 2.5224316120147705, "learning_rate": 4.127426337103757e-05, "loss": 6.8355, "step": 12500 }, { "epoch": 5.278592375366569, "grad_norm": 3.1008002758026123, "learning_rate": 4.120513894707444e-05, "loss": 6.8384, "step": 12600 }, { "epoch": 5.320485965647256, "grad_norm": 1.8872394561767578, "learning_rate": 4.113531629660662e-05, "loss": 6.8087, "step": 12700 }, { "epoch": 5.362379555927943, "grad_norm": 2.109281063079834, "learning_rate": 4.1065493646138805e-05, "loss": 6.8161, "step": 12800 }, { "epoch": 5.40427314620863, "grad_norm": 1.7881128787994385, "learning_rate": 4.0995670995670996e-05, "loss": 6.8215, "step": 12900 }, { "epoch": 5.446166736489317, "grad_norm": 2.5179624557495117, "learning_rate": 4.092584834520319e-05, "loss": 6.7883, "step": 13000 }, { "epoch": 5.488060326770004, "grad_norm": 2.4349751472473145, "learning_rate": 4.085602569473537e-05, "loss": 6.792, "step": 13100 }, { "epoch": 5.529953917050691, "grad_norm": 2.011018991470337, "learning_rate": 4.078620304426756e-05, "loss": 6.7846, "step": 13200 }, { "epoch": 5.571847507331379, "grad_norm": 2.519958019256592, "learning_rate": 4.071638039379975e-05, "loss": 6.7887, "step": 13300 }, { "epoch": 5.613741097612065, "grad_norm": 1.9241886138916016, "learning_rate": 4.064655774333194e-05, "loss": 6.7662, "step": 13400 }, { "epoch": 5.655634687892753, "grad_norm": 1.8995391130447388, "learning_rate": 4.057673509286413e-05, "loss": 6.7672, "step": 13500 }, { "epoch": 5.697528278173439, "grad_norm": 2.1511363983154297, "learning_rate": 4.050691244239632e-05, "loss": 6.7867, "step": 13600 }, { "epoch": 5.7394218684541265, "grad_norm": 1.8995012044906616, "learning_rate": 4.04370897919285e-05, "loss": 6.7563, "step": 13700 }, { "epoch": 5.781315458734814, "grad_norm": 1.83163321018219, "learning_rate": 4.036726714146069e-05, "loss": 6.7848, "step": 13800 }, { "epoch": 5.8232090490155, "grad_norm": 2.2616159915924072, "learning_rate": 4.029744449099288e-05, "loss": 6.7896, "step": 13900 }, { "epoch": 5.865102639296188, "grad_norm": 2.0548572540283203, "learning_rate": 4.0228320067029746e-05, "loss": 6.7633, "step": 14000 }, { "epoch": 5.906996229576874, "grad_norm": 2.4749302864074707, "learning_rate": 4.015849741656194e-05, "loss": 6.7267, "step": 14100 }, { "epoch": 5.948889819857562, "grad_norm": 1.906648874282837, "learning_rate": 4.008867476609413e-05, "loss": 6.7645, "step": 14200 }, { "epoch": 5.990783410138249, "grad_norm": 2.0839619636535645, "learning_rate": 4.001885211562631e-05, "loss": 6.8082, "step": 14300 }, { "epoch": 6.032677000418936, "grad_norm": 2.1202664375305176, "learning_rate": 3.9949029465158496e-05, "loss": 6.7625, "step": 14400 }, { "epoch": 6.074570590699623, "grad_norm": 1.988951563835144, "learning_rate": 3.987920681469069e-05, "loss": 6.7413, "step": 14500 }, { "epoch": 6.11646418098031, "grad_norm": 2.4327659606933594, "learning_rate": 3.980938416422287e-05, "loss": 6.7123, "step": 14600 }, { "epoch": 6.158357771260997, "grad_norm": 2.07710599899292, "learning_rate": 3.973956151375506e-05, "loss": 6.7316, "step": 14700 }, { "epoch": 6.200251361541684, "grad_norm": 1.9640876054763794, "learning_rate": 3.966973886328725e-05, "loss": 6.752, "step": 14800 }, { "epoch": 6.242144951822371, "grad_norm": 2.3012888431549072, "learning_rate": 3.959991621281944e-05, "loss": 6.7188, "step": 14900 }, { "epoch": 6.284038542103058, "grad_norm": 2.0262773036956787, "learning_rate": 3.953009356235163e-05, "loss": 6.7255, "step": 15000 }, { "epoch": 6.325932132383746, "grad_norm": 1.8689815998077393, "learning_rate": 3.946027091188382e-05, "loss": 6.7132, "step": 15100 }, { "epoch": 6.367825722664432, "grad_norm": 2.188612937927246, "learning_rate": 3.939044826141601e-05, "loss": 6.7407, "step": 15200 }, { "epoch": 6.4097193129451195, "grad_norm": 2.0168368816375732, "learning_rate": 3.9320625610948195e-05, "loss": 6.7132, "step": 15300 }, { "epoch": 6.451612903225806, "grad_norm": 2.496889352798462, "learning_rate": 3.925080296048038e-05, "loss": 6.7003, "step": 15400 }, { "epoch": 6.4935064935064934, "grad_norm": 2.1601486206054688, "learning_rate": 3.918098031001257e-05, "loss": 6.7056, "step": 15500 }, { "epoch": 6.535400083787181, "grad_norm": 2.300112009048462, "learning_rate": 3.9111157659544754e-05, "loss": 6.7314, "step": 15600 }, { "epoch": 6.577293674067867, "grad_norm": 2.321880578994751, "learning_rate": 3.9041335009076945e-05, "loss": 6.7166, "step": 15700 }, { "epoch": 6.619187264348555, "grad_norm": 2.029465913772583, "learning_rate": 3.8971512358609136e-05, "loss": 6.6908, "step": 15800 }, { "epoch": 6.661080854629242, "grad_norm": 2.258577585220337, "learning_rate": 3.890168970814133e-05, "loss": 6.7359, "step": 15900 }, { "epoch": 6.702974444909929, "grad_norm": 2.3579437732696533, "learning_rate": 3.883186705767351e-05, "loss": 6.7021, "step": 16000 }, { "epoch": 6.744868035190616, "grad_norm": 2.236828565597534, "learning_rate": 3.87620444072057e-05, "loss": 6.6897, "step": 16100 }, { "epoch": 6.786761625471303, "grad_norm": 2.6255593299865723, "learning_rate": 3.869222175673789e-05, "loss": 6.6899, "step": 16200 }, { "epoch": 6.82865521575199, "grad_norm": 2.297067880630493, "learning_rate": 3.862239910627008e-05, "loss": 6.7058, "step": 16300 }, { "epoch": 6.870548806032677, "grad_norm": 2.440605640411377, "learning_rate": 3.8553274682306945e-05, "loss": 6.6559, "step": 16400 }, { "epoch": 6.912442396313364, "grad_norm": 2.0427000522613525, "learning_rate": 3.848345203183913e-05, "loss": 6.6799, "step": 16500 }, { "epoch": 6.954335986594051, "grad_norm": 2.0323081016540527, "learning_rate": 3.841362938137132e-05, "loss": 6.6863, "step": 16600 }, { "epoch": 6.996229576874738, "grad_norm": 3.407731533050537, "learning_rate": 3.834380673090351e-05, "loss": 6.6767, "step": 16700 }, { "epoch": 7.038123167155425, "grad_norm": 2.112870931625366, "learning_rate": 3.8273984080435695e-05, "loss": 6.682, "step": 16800 }, { "epoch": 7.080016757436113, "grad_norm": 2.710810422897339, "learning_rate": 3.8204161429967886e-05, "loss": 6.7046, "step": 16900 }, { "epoch": 7.121910347716799, "grad_norm": 2.0754942893981934, "learning_rate": 3.813433877950007e-05, "loss": 6.6511, "step": 17000 }, { "epoch": 7.1638039379974865, "grad_norm": 3.1009552478790283, "learning_rate": 3.8064516129032254e-05, "loss": 6.666, "step": 17100 }, { "epoch": 7.205697528278174, "grad_norm": 2.1582441329956055, "learning_rate": 3.7994693478564445e-05, "loss": 6.6574, "step": 17200 }, { "epoch": 7.24759111855886, "grad_norm": 2.680147647857666, "learning_rate": 3.7924870828096636e-05, "loss": 6.6814, "step": 17300 }, { "epoch": 7.289484708839548, "grad_norm": 2.0264320373535156, "learning_rate": 3.785504817762883e-05, "loss": 6.668, "step": 17400 }, { "epoch": 7.331378299120234, "grad_norm": 2.032093048095703, "learning_rate": 3.778522552716101e-05, "loss": 6.6603, "step": 17500 }, { "epoch": 7.373271889400922, "grad_norm": 2.4837894439697266, "learning_rate": 3.77154028766932e-05, "loss": 6.6817, "step": 17600 }, { "epoch": 7.415165479681609, "grad_norm": 2.70166015625, "learning_rate": 3.764558022622539e-05, "loss": 6.6657, "step": 17700 }, { "epoch": 7.457059069962296, "grad_norm": 2.3508477210998535, "learning_rate": 3.757575757575758e-05, "loss": 6.6314, "step": 17800 }, { "epoch": 7.498952660242983, "grad_norm": 2.450437307357788, "learning_rate": 3.750593492528977e-05, "loss": 6.6551, "step": 17900 }, { "epoch": 7.5408462505236695, "grad_norm": 1.9939864873886108, "learning_rate": 3.743611227482195e-05, "loss": 6.6128, "step": 18000 }, { "epoch": 7.582739840804357, "grad_norm": 2.470285177230835, "learning_rate": 3.736628962435414e-05, "loss": 6.6126, "step": 18100 }, { "epoch": 7.624633431085044, "grad_norm": 2.5651469230651855, "learning_rate": 3.729646697388633e-05, "loss": 6.6694, "step": 18200 }, { "epoch": 7.666527021365731, "grad_norm": 2.361785650253296, "learning_rate": 3.722664432341852e-05, "loss": 6.6349, "step": 18300 }, { "epoch": 7.708420611646418, "grad_norm": 2.371994972229004, "learning_rate": 3.715682167295071e-05, "loss": 6.6483, "step": 18400 }, { "epoch": 7.750314201927106, "grad_norm": 2.862107038497925, "learning_rate": 3.708769724898758e-05, "loss": 6.634, "step": 18500 }, { "epoch": 7.792207792207792, "grad_norm": 2.815486192703247, "learning_rate": 3.701787459851976e-05, "loss": 6.6324, "step": 18600 }, { "epoch": 7.8341013824884795, "grad_norm": 1.930017352104187, "learning_rate": 3.6948051948051945e-05, "loss": 6.6275, "step": 18700 }, { "epoch": 7.875994972769166, "grad_norm": 3.1758625507354736, "learning_rate": 3.6878229297584136e-05, "loss": 6.6529, "step": 18800 }, { "epoch": 7.9178885630498534, "grad_norm": 2.1219429969787598, "learning_rate": 3.680840664711633e-05, "loss": 6.6085, "step": 18900 }, { "epoch": 7.95978215333054, "grad_norm": 2.1965785026550293, "learning_rate": 3.673858399664851e-05, "loss": 6.6206, "step": 19000 }, { "epoch": 8.001675743611228, "grad_norm": 2.489473581314087, "learning_rate": 3.66687613461807e-05, "loss": 6.6089, "step": 19100 }, { "epoch": 8.043569333891915, "grad_norm": 2.3411850929260254, "learning_rate": 3.659893869571289e-05, "loss": 6.6286, "step": 19200 }, { "epoch": 8.085462924172601, "grad_norm": 2.32071590423584, "learning_rate": 3.6529116045245084e-05, "loss": 6.5984, "step": 19300 }, { "epoch": 8.12735651445329, "grad_norm": 2.402956247329712, "learning_rate": 3.645929339477727e-05, "loss": 6.5952, "step": 19400 }, { "epoch": 8.169250104733976, "grad_norm": 2.6951029300689697, "learning_rate": 3.638947074430946e-05, "loss": 6.6106, "step": 19500 }, { "epoch": 8.211143695014663, "grad_norm": 2.807187080383301, "learning_rate": 3.6319648093841643e-05, "loss": 6.6109, "step": 19600 }, { "epoch": 8.253037285295349, "grad_norm": 2.798614025115967, "learning_rate": 3.624982544337383e-05, "loss": 6.6052, "step": 19700 }, { "epoch": 8.294930875576037, "grad_norm": 4.015589237213135, "learning_rate": 3.618000279290602e-05, "loss": 6.5995, "step": 19800 }, { "epoch": 8.336824465856724, "grad_norm": 2.6923959255218506, "learning_rate": 3.611018014243821e-05, "loss": 6.5855, "step": 19900 }, { "epoch": 8.37871805613741, "grad_norm": 2.112994909286499, "learning_rate": 3.6040357491970394e-05, "loss": 6.5968, "step": 20000 }, { "epoch": 8.420611646418099, "grad_norm": 2.8196451663970947, "learning_rate": 3.5970534841502585e-05, "loss": 6.5977, "step": 20100 }, { "epoch": 8.462505236698785, "grad_norm": 2.2421326637268066, "learning_rate": 3.5900712191034776e-05, "loss": 6.5846, "step": 20200 }, { "epoch": 8.504398826979472, "grad_norm": 2.634634256362915, "learning_rate": 3.583088954056697e-05, "loss": 6.5955, "step": 20300 }, { "epoch": 8.54629241726016, "grad_norm": 2.101125955581665, "learning_rate": 3.576106689009915e-05, "loss": 6.6013, "step": 20400 }, { "epoch": 8.588186007540846, "grad_norm": 2.719330072402954, "learning_rate": 3.569194246613601e-05, "loss": 6.5668, "step": 20500 }, { "epoch": 8.630079597821533, "grad_norm": 2.283790349960327, "learning_rate": 3.56221198156682e-05, "loss": 6.6107, "step": 20600 }, { "epoch": 8.671973188102221, "grad_norm": 2.1805171966552734, "learning_rate": 3.5552297165200393e-05, "loss": 6.5875, "step": 20700 }, { "epoch": 8.713866778382908, "grad_norm": 2.6632487773895264, "learning_rate": 3.5482474514732584e-05, "loss": 6.613, "step": 20800 }, { "epoch": 8.755760368663594, "grad_norm": 2.3296337127685547, "learning_rate": 3.541265186426477e-05, "loss": 6.5628, "step": 20900 }, { "epoch": 8.79765395894428, "grad_norm": 2.8429343700408936, "learning_rate": 3.534282921379696e-05, "loss": 6.5823, "step": 21000 }, { "epoch": 8.839547549224969, "grad_norm": 2.4361233711242676, "learning_rate": 3.527300656332915e-05, "loss": 6.5853, "step": 21100 }, { "epoch": 8.881441139505656, "grad_norm": 2.5633111000061035, "learning_rate": 3.5203183912861335e-05, "loss": 6.5979, "step": 21200 }, { "epoch": 8.923334729786342, "grad_norm": 2.350463628768921, "learning_rate": 3.513336126239352e-05, "loss": 6.5744, "step": 21300 }, { "epoch": 8.96522832006703, "grad_norm": 2.456291675567627, "learning_rate": 3.506353861192571e-05, "loss": 6.57, "step": 21400 }, { "epoch": 9.007121910347717, "grad_norm": 2.401036262512207, "learning_rate": 3.49937159614579e-05, "loss": 6.5614, "step": 21500 }, { "epoch": 9.049015500628403, "grad_norm": 2.5537233352661133, "learning_rate": 3.4923893310990085e-05, "loss": 6.5836, "step": 21600 }, { "epoch": 9.090909090909092, "grad_norm": 2.6386375427246094, "learning_rate": 3.4854070660522276e-05, "loss": 6.6178, "step": 21700 }, { "epoch": 9.132802681189778, "grad_norm": 2.508533477783203, "learning_rate": 3.478424801005447e-05, "loss": 6.5761, "step": 21800 }, { "epoch": 9.174696271470465, "grad_norm": 3.1510419845581055, "learning_rate": 3.471442535958665e-05, "loss": 6.558, "step": 21900 }, { "epoch": 9.216589861751151, "grad_norm": 2.6325526237487793, "learning_rate": 3.464460270911884e-05, "loss": 6.5661, "step": 22000 }, { "epoch": 9.25848345203184, "grad_norm": 2.9870827198028564, "learning_rate": 3.457478005865103e-05, "loss": 6.5392, "step": 22100 }, { "epoch": 9.300377042312526, "grad_norm": 2.4924209117889404, "learning_rate": 3.450495740818322e-05, "loss": 6.547, "step": 22200 }, { "epoch": 9.342270632593213, "grad_norm": 2.3227298259735107, "learning_rate": 3.44351347577154e-05, "loss": 6.5306, "step": 22300 }, { "epoch": 9.3841642228739, "grad_norm": 2.867182731628418, "learning_rate": 3.436531210724759e-05, "loss": 6.5628, "step": 22400 }, { "epoch": 9.426057813154587, "grad_norm": 2.2619149684906006, "learning_rate": 3.429548945677978e-05, "loss": 6.5192, "step": 22500 }, { "epoch": 9.467951403435274, "grad_norm": 2.232321262359619, "learning_rate": 3.422636503281665e-05, "loss": 6.56, "step": 22600 }, { "epoch": 9.509844993715962, "grad_norm": 2.4485862255096436, "learning_rate": 3.4156542382348835e-05, "loss": 6.557, "step": 22700 }, { "epoch": 9.551738583996649, "grad_norm": 2.4476943016052246, "learning_rate": 3.4086719731881026e-05, "loss": 6.5314, "step": 22800 }, { "epoch": 9.593632174277335, "grad_norm": 2.491731643676758, "learning_rate": 3.401689708141321e-05, "loss": 6.4952, "step": 22900 }, { "epoch": 9.635525764558023, "grad_norm": 2.6474783420562744, "learning_rate": 3.39470744309454e-05, "loss": 6.5499, "step": 23000 }, { "epoch": 9.67741935483871, "grad_norm": 2.5691514015197754, "learning_rate": 3.3877251780477585e-05, "loss": 6.5417, "step": 23100 }, { "epoch": 9.719312945119396, "grad_norm": 2.601832151412964, "learning_rate": 3.3807429130009776e-05, "loss": 6.5584, "step": 23200 }, { "epoch": 9.761206535400083, "grad_norm": 3.481239080429077, "learning_rate": 3.373760647954197e-05, "loss": 6.5403, "step": 23300 }, { "epoch": 9.803100125680771, "grad_norm": 3.0747485160827637, "learning_rate": 3.366778382907415e-05, "loss": 6.5751, "step": 23400 }, { "epoch": 9.844993715961458, "grad_norm": 2.2310988903045654, "learning_rate": 3.359796117860634e-05, "loss": 6.5046, "step": 23500 }, { "epoch": 9.886887306242144, "grad_norm": 2.4555273056030273, "learning_rate": 3.352813852813853e-05, "loss": 6.5544, "step": 23600 }, { "epoch": 9.928780896522833, "grad_norm": 3.1235666275024414, "learning_rate": 3.345831587767072e-05, "loss": 6.5396, "step": 23700 }, { "epoch": 9.970674486803519, "grad_norm": 2.2766611576080322, "learning_rate": 3.338849322720291e-05, "loss": 6.5542, "step": 23800 }, { "epoch": 10.012568077084206, "grad_norm": 3.0408995151519775, "learning_rate": 3.331867057673509e-05, "loss": 6.4978, "step": 23900 }, { "epoch": 10.054461667364894, "grad_norm": 2.8702831268310547, "learning_rate": 3.3248847926267283e-05, "loss": 6.5264, "step": 24000 }, { "epoch": 10.09635525764558, "grad_norm": 2.9117937088012695, "learning_rate": 3.317902527579947e-05, "loss": 6.5028, "step": 24100 }, { "epoch": 10.138248847926267, "grad_norm": 2.925631046295166, "learning_rate": 3.310920262533166e-05, "loss": 6.5143, "step": 24200 }, { "epoch": 10.180142438206955, "grad_norm": 2.6605536937713623, "learning_rate": 3.303937997486385e-05, "loss": 6.5394, "step": 24300 }, { "epoch": 10.222036028487642, "grad_norm": 2.31357479095459, "learning_rate": 3.2969557324396034e-05, "loss": 6.5224, "step": 24400 }, { "epoch": 10.263929618768328, "grad_norm": 2.6544747352600098, "learning_rate": 3.2899734673928225e-05, "loss": 6.5035, "step": 24500 }, { "epoch": 10.305823209049015, "grad_norm": 2.5945372581481934, "learning_rate": 3.2830610249965085e-05, "loss": 6.4977, "step": 24600 }, { "epoch": 10.347716799329703, "grad_norm": 3.120873212814331, "learning_rate": 3.2760787599497276e-05, "loss": 6.5399, "step": 24700 }, { "epoch": 10.38961038961039, "grad_norm": 2.705008029937744, "learning_rate": 3.269096494902947e-05, "loss": 6.4938, "step": 24800 }, { "epoch": 10.431503979891076, "grad_norm": 2.2395503520965576, "learning_rate": 3.262114229856166e-05, "loss": 6.4854, "step": 24900 }, { "epoch": 10.473397570171764, "grad_norm": 2.5891764163970947, "learning_rate": 3.255131964809384e-05, "loss": 6.5107, "step": 25000 }, { "epoch": 10.51529116045245, "grad_norm": 3.115931749343872, "learning_rate": 3.248149699762603e-05, "loss": 6.5389, "step": 25100 }, { "epoch": 10.557184750733137, "grad_norm": 2.264437675476074, "learning_rate": 3.2411674347158224e-05, "loss": 6.51, "step": 25200 }, { "epoch": 10.599078341013826, "grad_norm": 3.449631690979004, "learning_rate": 3.234185169669041e-05, "loss": 6.5161, "step": 25300 }, { "epoch": 10.640971931294512, "grad_norm": 2.478337526321411, "learning_rate": 3.227202904622259e-05, "loss": 6.5019, "step": 25400 }, { "epoch": 10.682865521575199, "grad_norm": 3.2756478786468506, "learning_rate": 3.2202206395754784e-05, "loss": 6.4869, "step": 25500 }, { "epoch": 10.724759111855885, "grad_norm": 2.7576985359191895, "learning_rate": 3.213238374528697e-05, "loss": 6.5206, "step": 25600 }, { "epoch": 10.766652702136573, "grad_norm": 2.200963020324707, "learning_rate": 3.206256109481916e-05, "loss": 6.48, "step": 25700 }, { "epoch": 10.80854629241726, "grad_norm": 2.7358744144439697, "learning_rate": 3.199273844435135e-05, "loss": 6.5126, "step": 25800 }, { "epoch": 10.850439882697946, "grad_norm": 2.7179319858551025, "learning_rate": 3.192291579388354e-05, "loss": 6.4699, "step": 25900 }, { "epoch": 10.892333472978635, "grad_norm": 2.811340808868408, "learning_rate": 3.1853093143415725e-05, "loss": 6.5056, "step": 26000 }, { "epoch": 10.934227063259321, "grad_norm": 3.010690450668335, "learning_rate": 3.1783270492947916e-05, "loss": 6.5103, "step": 26100 }, { "epoch": 10.976120653540008, "grad_norm": 3.213487148284912, "learning_rate": 3.171344784248011e-05, "loss": 6.4874, "step": 26200 }, { "epoch": 11.018014243820696, "grad_norm": 2.5710039138793945, "learning_rate": 3.164362519201229e-05, "loss": 6.4919, "step": 26300 }, { "epoch": 11.059907834101383, "grad_norm": 2.6933746337890625, "learning_rate": 3.1573802541544475e-05, "loss": 6.5284, "step": 26400 }, { "epoch": 11.101801424382069, "grad_norm": 3.775012254714966, "learning_rate": 3.1503979891076666e-05, "loss": 6.4894, "step": 26500 }, { "epoch": 11.143695014662757, "grad_norm": 3.2401301860809326, "learning_rate": 3.1434855467113534e-05, "loss": 6.4721, "step": 26600 }, { "epoch": 11.185588604943444, "grad_norm": 2.642794132232666, "learning_rate": 3.1365032816645725e-05, "loss": 6.4797, "step": 26700 }, { "epoch": 11.22748219522413, "grad_norm": 3.191567897796631, "learning_rate": 3.129521016617791e-05, "loss": 6.5022, "step": 26800 }, { "epoch": 11.269375785504817, "grad_norm": 2.816554307937622, "learning_rate": 3.12253875157101e-05, "loss": 6.4853, "step": 26900 }, { "epoch": 11.311269375785505, "grad_norm": 2.8666136264801025, "learning_rate": 3.1155564865242284e-05, "loss": 6.4839, "step": 27000 }, { "epoch": 11.353162966066192, "grad_norm": 2.9831254482269287, "learning_rate": 3.1085742214774475e-05, "loss": 6.5082, "step": 27100 }, { "epoch": 11.395056556346878, "grad_norm": 2.7065083980560303, "learning_rate": 3.101591956430666e-05, "loss": 6.4412, "step": 27200 }, { "epoch": 11.436950146627566, "grad_norm": 2.5580694675445557, "learning_rate": 3.094609691383885e-05, "loss": 6.4849, "step": 27300 }, { "epoch": 11.478843736908253, "grad_norm": 2.571390390396118, "learning_rate": 3.087627426337104e-05, "loss": 6.4689, "step": 27400 }, { "epoch": 11.52073732718894, "grad_norm": 2.835906982421875, "learning_rate": 3.0806451612903225e-05, "loss": 6.4887, "step": 27500 }, { "epoch": 11.562630917469628, "grad_norm": 3.1355161666870117, "learning_rate": 3.0736628962435416e-05, "loss": 6.4568, "step": 27600 }, { "epoch": 11.604524507750314, "grad_norm": 3.0155599117279053, "learning_rate": 3.066680631196761e-05, "loss": 6.4607, "step": 27700 }, { "epoch": 11.646418098031, "grad_norm": 2.6346957683563232, "learning_rate": 3.059698366149979e-05, "loss": 6.4706, "step": 27800 }, { "epoch": 11.688311688311689, "grad_norm": 2.4353625774383545, "learning_rate": 3.052716101103198e-05, "loss": 6.482, "step": 27900 }, { "epoch": 11.730205278592376, "grad_norm": 3.29835844039917, "learning_rate": 3.045733836056417e-05, "loss": 6.4625, "step": 28000 }, { "epoch": 11.772098868873062, "grad_norm": 2.233579158782959, "learning_rate": 3.038751571009636e-05, "loss": 6.4727, "step": 28100 }, { "epoch": 11.813992459153749, "grad_norm": 2.5708439350128174, "learning_rate": 3.0317693059628545e-05, "loss": 6.4751, "step": 28200 }, { "epoch": 11.855886049434437, "grad_norm": 2.29488205909729, "learning_rate": 3.0247870409160732e-05, "loss": 6.4599, "step": 28300 }, { "epoch": 11.897779639715123, "grad_norm": 2.858208179473877, "learning_rate": 3.0178047758692923e-05, "loss": 6.469, "step": 28400 }, { "epoch": 11.93967322999581, "grad_norm": 2.854923725128174, "learning_rate": 3.0108225108225107e-05, "loss": 6.4995, "step": 28500 }, { "epoch": 11.981566820276498, "grad_norm": 2.590484857559204, "learning_rate": 3.003910068426198e-05, "loss": 6.4508, "step": 28600 }, { "epoch": 12.023460410557185, "grad_norm": 3.3479676246643066, "learning_rate": 2.9969278033794163e-05, "loss": 6.4581, "step": 28700 }, { "epoch": 12.065354000837871, "grad_norm": 2.7855923175811768, "learning_rate": 2.9899455383326354e-05, "loss": 6.4744, "step": 28800 }, { "epoch": 12.10724759111856, "grad_norm": 3.2668962478637695, "learning_rate": 2.982963273285854e-05, "loss": 6.4731, "step": 28900 }, { "epoch": 12.149141181399246, "grad_norm": 2.850735664367676, "learning_rate": 2.9759810082390725e-05, "loss": 6.4788, "step": 29000 }, { "epoch": 12.191034771679933, "grad_norm": 2.9676952362060547, "learning_rate": 2.9689987431922916e-05, "loss": 6.4525, "step": 29100 }, { "epoch": 12.23292836196062, "grad_norm": 2.604408025741577, "learning_rate": 2.9620164781455107e-05, "loss": 6.4564, "step": 29200 }, { "epoch": 12.274821952241307, "grad_norm": 2.974653482437134, "learning_rate": 2.9550342130987295e-05, "loss": 6.463, "step": 29300 }, { "epoch": 12.316715542521994, "grad_norm": 3.372664213180542, "learning_rate": 2.9480519480519482e-05, "loss": 6.464, "step": 29400 }, { "epoch": 12.35860913280268, "grad_norm": 2.6891355514526367, "learning_rate": 2.941069683005167e-05, "loss": 6.4674, "step": 29500 }, { "epoch": 12.400502723083369, "grad_norm": 2.964113473892212, "learning_rate": 2.934087417958386e-05, "loss": 6.4539, "step": 29600 }, { "epoch": 12.442396313364055, "grad_norm": 2.7328097820281982, "learning_rate": 2.9271051529116045e-05, "loss": 6.4224, "step": 29700 }, { "epoch": 12.484289903644742, "grad_norm": 2.6205203533172607, "learning_rate": 2.9201228878648236e-05, "loss": 6.4266, "step": 29800 }, { "epoch": 12.52618349392543, "grad_norm": 3.681053400039673, "learning_rate": 2.9131406228180424e-05, "loss": 6.4549, "step": 29900 }, { "epoch": 12.568077084206116, "grad_norm": 2.9732627868652344, "learning_rate": 2.9061583577712608e-05, "loss": 6.4466, "step": 30000 }, { "epoch": 12.609970674486803, "grad_norm": 3.47816801071167, "learning_rate": 2.89917609272448e-05, "loss": 6.4408, "step": 30100 }, { "epoch": 12.651864264767491, "grad_norm": 2.70326566696167, "learning_rate": 2.892193827677699e-05, "loss": 6.4444, "step": 30200 }, { "epoch": 12.693757855048178, "grad_norm": 2.9219532012939453, "learning_rate": 2.8852115626309177e-05, "loss": 6.4183, "step": 30300 }, { "epoch": 12.735651445328864, "grad_norm": 2.8546571731567383, "learning_rate": 2.878229297584136e-05, "loss": 6.4399, "step": 30400 }, { "epoch": 12.777545035609553, "grad_norm": 2.95047926902771, "learning_rate": 2.8712470325373552e-05, "loss": 6.4396, "step": 30500 }, { "epoch": 12.819438625890239, "grad_norm": 3.397934675216675, "learning_rate": 2.8643345901410416e-05, "loss": 6.438, "step": 30600 }, { "epoch": 12.861332216170926, "grad_norm": 2.625852346420288, "learning_rate": 2.8573523250942607e-05, "loss": 6.4363, "step": 30700 }, { "epoch": 12.903225806451612, "grad_norm": 2.5299527645111084, "learning_rate": 2.85037006004748e-05, "loss": 6.3952, "step": 30800 }, { "epoch": 12.9451193967323, "grad_norm": 2.6445415019989014, "learning_rate": 2.8433877950006983e-05, "loss": 6.4559, "step": 30900 }, { "epoch": 12.987012987012987, "grad_norm": 2.9675769805908203, "learning_rate": 2.836405529953917e-05, "loss": 6.4447, "step": 31000 }, { "epoch": 13.028906577293673, "grad_norm": 2.607391119003296, "learning_rate": 2.829423264907136e-05, "loss": 6.446, "step": 31100 }, { "epoch": 13.070800167574362, "grad_norm": 3.196765661239624, "learning_rate": 2.8224409998603545e-05, "loss": 6.4336, "step": 31200 }, { "epoch": 13.112693757855048, "grad_norm": 5.778535842895508, "learning_rate": 2.8154587348135736e-05, "loss": 6.4339, "step": 31300 }, { "epoch": 13.154587348135735, "grad_norm": 3.0479419231414795, "learning_rate": 2.8084764697667927e-05, "loss": 6.4147, "step": 31400 }, { "epoch": 13.196480938416423, "grad_norm": 2.6787302494049072, "learning_rate": 2.8014942047200115e-05, "loss": 6.4312, "step": 31500 }, { "epoch": 13.23837452869711, "grad_norm": 2.7929670810699463, "learning_rate": 2.79451193967323e-05, "loss": 6.4224, "step": 31600 }, { "epoch": 13.280268118977796, "grad_norm": 2.722101926803589, "learning_rate": 2.787529674626449e-05, "loss": 6.4247, "step": 31700 }, { "epoch": 13.322161709258484, "grad_norm": 3.295348644256592, "learning_rate": 2.780547409579668e-05, "loss": 6.4435, "step": 31800 }, { "epoch": 13.36405529953917, "grad_norm": 2.5780696868896484, "learning_rate": 2.7735651445328865e-05, "loss": 6.406, "step": 31900 }, { "epoch": 13.405948889819857, "grad_norm": 2.955299139022827, "learning_rate": 2.7665828794861053e-05, "loss": 6.4633, "step": 32000 }, { "epoch": 13.447842480100544, "grad_norm": 3.8027708530426025, "learning_rate": 2.7596006144393244e-05, "loss": 6.4445, "step": 32100 }, { "epoch": 13.489736070381232, "grad_norm": 2.6895995140075684, "learning_rate": 2.7526183493925428e-05, "loss": 6.4015, "step": 32200 }, { "epoch": 13.531629660661919, "grad_norm": 2.6936516761779785, "learning_rate": 2.745636084345762e-05, "loss": 6.4211, "step": 32300 }, { "epoch": 13.573523250942605, "grad_norm": 2.948420763015747, "learning_rate": 2.738653819298981e-05, "loss": 6.4042, "step": 32400 }, { "epoch": 13.615416841223293, "grad_norm": 2.763885974884033, "learning_rate": 2.7316715542521997e-05, "loss": 6.393, "step": 32500 }, { "epoch": 13.65731043150398, "grad_norm": 3.1601672172546387, "learning_rate": 2.724759111855886e-05, "loss": 6.4398, "step": 32600 }, { "epoch": 13.699204021784666, "grad_norm": 2.4161715507507324, "learning_rate": 2.7177768468091052e-05, "loss": 6.401, "step": 32700 }, { "epoch": 13.741097612065355, "grad_norm": 3.0796055793762207, "learning_rate": 2.7107945817623236e-05, "loss": 6.4265, "step": 32800 }, { "epoch": 13.782991202346041, "grad_norm": 3.6223697662353516, "learning_rate": 2.7038123167155427e-05, "loss": 6.4075, "step": 32900 }, { "epoch": 13.824884792626728, "grad_norm": 2.6991615295410156, "learning_rate": 2.696830051668762e-05, "loss": 6.3912, "step": 33000 }, { "epoch": 13.866778382907416, "grad_norm": 3.1701860427856445, "learning_rate": 2.6898477866219803e-05, "loss": 6.4173, "step": 33100 }, { "epoch": 13.908671973188103, "grad_norm": 2.915432929992676, "learning_rate": 2.682865521575199e-05, "loss": 6.4179, "step": 33200 }, { "epoch": 13.950565563468789, "grad_norm": 3.155080795288086, "learning_rate": 2.675883256528418e-05, "loss": 6.3895, "step": 33300 }, { "epoch": 13.992459153749476, "grad_norm": 3.3861114978790283, "learning_rate": 2.6689009914816365e-05, "loss": 6.4279, "step": 33400 }, { "epoch": 14.034352744030164, "grad_norm": 3.301805019378662, "learning_rate": 2.6619187264348556e-05, "loss": 6.4072, "step": 33500 }, { "epoch": 14.07624633431085, "grad_norm": 3.305147171020508, "learning_rate": 2.6549364613880744e-05, "loss": 6.3949, "step": 33600 }, { "epoch": 14.118139924591537, "grad_norm": 2.7602477073669434, "learning_rate": 2.6479541963412935e-05, "loss": 6.4048, "step": 33700 }, { "epoch": 14.160033514872225, "grad_norm": 2.5257952213287354, "learning_rate": 2.640971931294512e-05, "loss": 6.4033, "step": 33800 }, { "epoch": 14.201927105152912, "grad_norm": 2.4649853706359863, "learning_rate": 2.633989666247731e-05, "loss": 6.374, "step": 33900 }, { "epoch": 14.243820695433598, "grad_norm": 2.7136335372924805, "learning_rate": 2.6270074012009497e-05, "loss": 6.3993, "step": 34000 }, { "epoch": 14.285714285714286, "grad_norm": 2.801712989807129, "learning_rate": 2.6200251361541685e-05, "loss": 6.4059, "step": 34100 }, { "epoch": 14.327607875994973, "grad_norm": 2.7054030895233154, "learning_rate": 2.6130428711073873e-05, "loss": 6.431, "step": 34200 }, { "epoch": 14.36950146627566, "grad_norm": 2.653932809829712, "learning_rate": 2.6060606060606063e-05, "loss": 6.4035, "step": 34300 }, { "epoch": 14.411395056556348, "grad_norm": 2.5450570583343506, "learning_rate": 2.5990783410138248e-05, "loss": 6.417, "step": 34400 }, { "epoch": 14.453288646837034, "grad_norm": 2.9578003883361816, "learning_rate": 2.592096075967044e-05, "loss": 6.4087, "step": 34500 }, { "epoch": 14.49518223711772, "grad_norm": 2.9408493041992188, "learning_rate": 2.5851836335707303e-05, "loss": 6.3936, "step": 34600 }, { "epoch": 14.537075827398407, "grad_norm": 2.756441116333008, "learning_rate": 2.5782013685239494e-05, "loss": 6.404, "step": 34700 }, { "epoch": 14.578969417679096, "grad_norm": 3.685004711151123, "learning_rate": 2.571219103477168e-05, "loss": 6.3932, "step": 34800 }, { "epoch": 14.620863007959782, "grad_norm": 2.670825719833374, "learning_rate": 2.5642368384303872e-05, "loss": 6.3839, "step": 34900 }, { "epoch": 14.662756598240469, "grad_norm": 3.0986082553863525, "learning_rate": 2.5572545733836056e-05, "loss": 6.3782, "step": 35000 }, { "epoch": 14.704650188521157, "grad_norm": 3.003432273864746, "learning_rate": 2.5502723083368247e-05, "loss": 6.3775, "step": 35100 }, { "epoch": 14.746543778801843, "grad_norm": 2.752516269683838, "learning_rate": 2.5432900432900435e-05, "loss": 6.3731, "step": 35200 }, { "epoch": 14.78843736908253, "grad_norm": 2.7697649002075195, "learning_rate": 2.536307778243262e-05, "loss": 6.3701, "step": 35300 }, { "epoch": 14.830330959363218, "grad_norm": 3.0245521068573, "learning_rate": 2.529325513196481e-05, "loss": 6.3916, "step": 35400 }, { "epoch": 14.872224549643905, "grad_norm": 3.1849350929260254, "learning_rate": 2.5223432481497e-05, "loss": 6.3993, "step": 35500 }, { "epoch": 14.914118139924591, "grad_norm": 3.6655123233795166, "learning_rate": 2.5153609831029185e-05, "loss": 6.3791, "step": 35600 }, { "epoch": 14.95601173020528, "grad_norm": 3.2252790927886963, "learning_rate": 2.5083787180561376e-05, "loss": 6.3865, "step": 35700 }, { "epoch": 14.997905320485966, "grad_norm": 2.8366169929504395, "learning_rate": 2.5013964530093564e-05, "loss": 6.3897, "step": 35800 }, { "epoch": 15.039798910766653, "grad_norm": 2.757725715637207, "learning_rate": 2.494414187962575e-05, "loss": 6.376, "step": 35900 }, { "epoch": 15.081692501047339, "grad_norm": 3.1640422344207764, "learning_rate": 2.4874319229157942e-05, "loss": 6.3796, "step": 36000 }, { "epoch": 15.123586091328027, "grad_norm": 2.849719285964966, "learning_rate": 2.480449657869013e-05, "loss": 6.3765, "step": 36100 }, { "epoch": 15.165479681608714, "grad_norm": 2.7223923206329346, "learning_rate": 2.4734673928222314e-05, "loss": 6.3953, "step": 36200 }, { "epoch": 15.2073732718894, "grad_norm": 3.173750162124634, "learning_rate": 2.4664851277754505e-05, "loss": 6.3724, "step": 36300 }, { "epoch": 15.249266862170089, "grad_norm": 3.054779529571533, "learning_rate": 2.4595028627286692e-05, "loss": 6.3764, "step": 36400 }, { "epoch": 15.291160452450775, "grad_norm": 3.277862071990967, "learning_rate": 2.4525205976818883e-05, "loss": 6.3583, "step": 36500 }, { "epoch": 15.333054042731462, "grad_norm": 2.9208297729492188, "learning_rate": 2.4456081552855748e-05, "loss": 6.3878, "step": 36600 }, { "epoch": 15.37494763301215, "grad_norm": 2.5356411933898926, "learning_rate": 2.4386258902387935e-05, "loss": 6.3705, "step": 36700 }, { "epoch": 15.416841223292836, "grad_norm": 2.8953468799591064, "learning_rate": 2.4316436251920126e-05, "loss": 6.3947, "step": 36800 }, { "epoch": 15.458734813573523, "grad_norm": 2.9166266918182373, "learning_rate": 2.424661360145231e-05, "loss": 6.3809, "step": 36900 }, { "epoch": 15.50062840385421, "grad_norm": 3.4554710388183594, "learning_rate": 2.41767909509845e-05, "loss": 6.3746, "step": 37000 }, { "epoch": 15.542521994134898, "grad_norm": 3.7208077907562256, "learning_rate": 2.410696830051669e-05, "loss": 6.3758, "step": 37100 }, { "epoch": 15.584415584415584, "grad_norm": 3.3161842823028564, "learning_rate": 2.4037145650048876e-05, "loss": 6.3744, "step": 37200 }, { "epoch": 15.62630917469627, "grad_norm": 2.4062047004699707, "learning_rate": 2.3967322999581064e-05, "loss": 6.381, "step": 37300 }, { "epoch": 15.668202764976959, "grad_norm": 3.1894476413726807, "learning_rate": 2.389750034911325e-05, "loss": 6.3895, "step": 37400 }, { "epoch": 15.710096355257646, "grad_norm": 2.9203104972839355, "learning_rate": 2.3827677698645442e-05, "loss": 6.363, "step": 37500 }, { "epoch": 15.751989945538332, "grad_norm": 3.000694513320923, "learning_rate": 2.375785504817763e-05, "loss": 6.3837, "step": 37600 }, { "epoch": 15.79388353581902, "grad_norm": 2.838684558868408, "learning_rate": 2.368803239770982e-05, "loss": 6.3859, "step": 37700 }, { "epoch": 15.835777126099707, "grad_norm": 2.648862361907959, "learning_rate": 2.3618209747242005e-05, "loss": 6.3411, "step": 37800 }, { "epoch": 15.877670716380393, "grad_norm": 3.5438232421875, "learning_rate": 2.3548387096774193e-05, "loss": 6.3627, "step": 37900 }, { "epoch": 15.91956430666108, "grad_norm": 2.8182501792907715, "learning_rate": 2.3478564446306384e-05, "loss": 6.3731, "step": 38000 }, { "epoch": 15.961457896941768, "grad_norm": 3.3253772258758545, "learning_rate": 2.340874179583857e-05, "loss": 6.396, "step": 38100 }, { "epoch": 16.003351487222456, "grad_norm": 3.668926954269409, "learning_rate": 2.3338919145370762e-05, "loss": 6.3437, "step": 38200 }, { "epoch": 16.045245077503143, "grad_norm": 3.028989315032959, "learning_rate": 2.3269096494902946e-05, "loss": 6.3837, "step": 38300 }, { "epoch": 16.08713866778383, "grad_norm": 3.220702648162842, "learning_rate": 2.3199273844435134e-05, "loss": 6.3609, "step": 38400 }, { "epoch": 16.129032258064516, "grad_norm": 3.1788036823272705, "learning_rate": 2.3129451193967325e-05, "loss": 6.3723, "step": 38500 }, { "epoch": 16.170925848345203, "grad_norm": 3.351151466369629, "learning_rate": 2.306032677000419e-05, "loss": 6.3731, "step": 38600 }, { "epoch": 16.21281943862589, "grad_norm": 2.933992862701416, "learning_rate": 2.299050411953638e-05, "loss": 6.3654, "step": 38700 }, { "epoch": 16.25471302890658, "grad_norm": 4.2123589515686035, "learning_rate": 2.2920681469068568e-05, "loss": 6.3364, "step": 38800 }, { "epoch": 16.296606619187266, "grad_norm": 2.9287397861480713, "learning_rate": 2.2850858818600755e-05, "loss": 6.3643, "step": 38900 }, { "epoch": 16.338500209467952, "grad_norm": 2.6518173217773438, "learning_rate": 2.2781036168132943e-05, "loss": 6.3538, "step": 39000 }, { "epoch": 16.38039379974864, "grad_norm": 3.490497589111328, "learning_rate": 2.271121351766513e-05, "loss": 6.365, "step": 39100 }, { "epoch": 16.422287390029325, "grad_norm": 3.090874195098877, "learning_rate": 2.264139086719732e-05, "loss": 6.3513, "step": 39200 }, { "epoch": 16.46418098031001, "grad_norm": 2.793083429336548, "learning_rate": 2.257156821672951e-05, "loss": 6.3815, "step": 39300 }, { "epoch": 16.506074570590698, "grad_norm": 2.656334638595581, "learning_rate": 2.2501745566261696e-05, "loss": 6.3677, "step": 39400 }, { "epoch": 16.547968160871388, "grad_norm": 2.950857162475586, "learning_rate": 2.2431922915793884e-05, "loss": 6.3601, "step": 39500 }, { "epoch": 16.589861751152075, "grad_norm": 2.948397636413574, "learning_rate": 2.236210026532607e-05, "loss": 6.3633, "step": 39600 }, { "epoch": 16.63175534143276, "grad_norm": 3.759934902191162, "learning_rate": 2.2292277614858262e-05, "loss": 6.3664, "step": 39700 }, { "epoch": 16.673648931713448, "grad_norm": 2.6607794761657715, "learning_rate": 2.222245496439045e-05, "loss": 6.3659, "step": 39800 }, { "epoch": 16.715542521994134, "grad_norm": 3.2569267749786377, "learning_rate": 2.2152632313922638e-05, "loss": 6.3477, "step": 39900 }, { "epoch": 16.75743611227482, "grad_norm": 3.1701977252960205, "learning_rate": 2.2082809663454825e-05, "loss": 6.3466, "step": 40000 }, { "epoch": 16.79932970255551, "grad_norm": 2.8855369091033936, "learning_rate": 2.2012987012987013e-05, "loss": 6.3774, "step": 40100 }, { "epoch": 16.841223292836197, "grad_norm": 2.8468215465545654, "learning_rate": 2.1943164362519204e-05, "loss": 6.3388, "step": 40200 }, { "epoch": 16.883116883116884, "grad_norm": 3.3314404487609863, "learning_rate": 2.187334171205139e-05, "loss": 6.3658, "step": 40300 }, { "epoch": 16.92501047339757, "grad_norm": 3.023106336593628, "learning_rate": 2.180351906158358e-05, "loss": 6.3443, "step": 40400 }, { "epoch": 16.966904063678257, "grad_norm": 3.2845230102539062, "learning_rate": 2.1733696411115766e-05, "loss": 6.3785, "step": 40500 }, { "epoch": 17.008797653958943, "grad_norm": 2.805790424346924, "learning_rate": 2.166457198715263e-05, "loss": 6.3792, "step": 40600 }, { "epoch": 17.05069124423963, "grad_norm": 2.893737554550171, "learning_rate": 2.159474933668482e-05, "loss": 6.3138, "step": 40700 }, { "epoch": 17.09258483452032, "grad_norm": 3.238863945007324, "learning_rate": 2.1525624912721686e-05, "loss": 6.3686, "step": 40800 }, { "epoch": 17.134478424801006, "grad_norm": 3.403582811355591, "learning_rate": 2.1455802262253877e-05, "loss": 6.3312, "step": 40900 }, { "epoch": 17.176372015081693, "grad_norm": 2.963287353515625, "learning_rate": 2.1385979611786064e-05, "loss": 6.3515, "step": 41000 }, { "epoch": 17.21826560536238, "grad_norm": 3.867340087890625, "learning_rate": 2.1316156961318255e-05, "loss": 6.3566, "step": 41100 }, { "epoch": 17.260159195643066, "grad_norm": 2.841190814971924, "learning_rate": 2.124633431085044e-05, "loss": 6.3308, "step": 41200 }, { "epoch": 17.302052785923753, "grad_norm": 2.872523307800293, "learning_rate": 2.1176511660382627e-05, "loss": 6.3433, "step": 41300 }, { "epoch": 17.34394637620444, "grad_norm": 3.156465530395508, "learning_rate": 2.1106689009914818e-05, "loss": 6.3779, "step": 41400 }, { "epoch": 17.38583996648513, "grad_norm": 3.5904667377471924, "learning_rate": 2.1036866359447005e-05, "loss": 6.3402, "step": 41500 }, { "epoch": 17.427733556765816, "grad_norm": 3.5753939151763916, "learning_rate": 2.0967043708979196e-05, "loss": 6.3572, "step": 41600 }, { "epoch": 17.469627147046502, "grad_norm": 3.129514217376709, "learning_rate": 2.089722105851138e-05, "loss": 6.3302, "step": 41700 }, { "epoch": 17.51152073732719, "grad_norm": 2.988732099533081, "learning_rate": 2.0827398408043568e-05, "loss": 6.3807, "step": 41800 }, { "epoch": 17.553414327607875, "grad_norm": 2.857875108718872, "learning_rate": 2.075757575757576e-05, "loss": 6.3519, "step": 41900 }, { "epoch": 17.59530791788856, "grad_norm": 4.023842811584473, "learning_rate": 2.0687753107107947e-05, "loss": 6.3467, "step": 42000 }, { "epoch": 17.63720150816925, "grad_norm": 3.049686908721924, "learning_rate": 2.0617930456640137e-05, "loss": 6.3306, "step": 42100 }, { "epoch": 17.679095098449938, "grad_norm": 3.3211073875427246, "learning_rate": 2.054810780617232e-05, "loss": 6.3611, "step": 42200 }, { "epoch": 17.720988688730625, "grad_norm": 3.064138174057007, "learning_rate": 2.047828515570451e-05, "loss": 6.3217, "step": 42300 }, { "epoch": 17.76288227901131, "grad_norm": 2.7812724113464355, "learning_rate": 2.04084625052367e-05, "loss": 6.3131, "step": 42400 }, { "epoch": 17.804775869291998, "grad_norm": 2.5516164302825928, "learning_rate": 2.0338639854768888e-05, "loss": 6.3428, "step": 42500 }, { "epoch": 17.846669459572684, "grad_norm": 2.9599711894989014, "learning_rate": 2.026881720430108e-05, "loss": 6.3545, "step": 42600 }, { "epoch": 17.88856304985337, "grad_norm": 2.8674137592315674, "learning_rate": 2.0198994553833263e-05, "loss": 6.3302, "step": 42700 }, { "epoch": 17.93045664013406, "grad_norm": 3.3227078914642334, "learning_rate": 2.012917190336545e-05, "loss": 6.3278, "step": 42800 }, { "epoch": 17.972350230414747, "grad_norm": 3.080399751663208, "learning_rate": 2.005934925289764e-05, "loss": 6.3206, "step": 42900 }, { "epoch": 18.014243820695434, "grad_norm": 4.004719257354736, "learning_rate": 1.998952660242983e-05, "loss": 6.3407, "step": 43000 }, { "epoch": 18.05613741097612, "grad_norm": 2.8186423778533936, "learning_rate": 1.991970395196202e-05, "loss": 6.3136, "step": 43100 }, { "epoch": 18.098031001256807, "grad_norm": 2.81748104095459, "learning_rate": 1.9849881301494204e-05, "loss": 6.3353, "step": 43200 }, { "epoch": 18.139924591537493, "grad_norm": 2.9991416931152344, "learning_rate": 1.9780058651026395e-05, "loss": 6.3194, "step": 43300 }, { "epoch": 18.181818181818183, "grad_norm": 3.4876794815063477, "learning_rate": 1.9710236000558583e-05, "loss": 6.3293, "step": 43400 }, { "epoch": 18.22371177209887, "grad_norm": 3.0756711959838867, "learning_rate": 1.964041335009077e-05, "loss": 6.341, "step": 43500 }, { "epoch": 18.265605362379556, "grad_norm": 3.171670436859131, "learning_rate": 1.9570590699622958e-05, "loss": 6.3075, "step": 43600 }, { "epoch": 18.307498952660243, "grad_norm": 3.3317439556121826, "learning_rate": 1.9500768049155145e-05, "loss": 6.3436, "step": 43700 }, { "epoch": 18.34939254294093, "grad_norm": 2.924349308013916, "learning_rate": 1.9430945398687336e-05, "loss": 6.3217, "step": 43800 }, { "epoch": 18.391286133221616, "grad_norm": 3.247955560684204, "learning_rate": 1.9361122748219524e-05, "loss": 6.3324, "step": 43900 }, { "epoch": 18.433179723502302, "grad_norm": 3.340263843536377, "learning_rate": 1.929130009775171e-05, "loss": 6.2993, "step": 44000 }, { "epoch": 18.475073313782993, "grad_norm": 2.973019599914551, "learning_rate": 1.92214774472839e-05, "loss": 6.3292, "step": 44100 }, { "epoch": 18.51696690406368, "grad_norm": 3.5055582523345947, "learning_rate": 1.9151654796816086e-05, "loss": 6.3175, "step": 44200 }, { "epoch": 18.558860494344366, "grad_norm": 2.9543776512145996, "learning_rate": 1.9081832146348277e-05, "loss": 6.3206, "step": 44300 }, { "epoch": 18.600754084625052, "grad_norm": 2.790940284729004, "learning_rate": 1.9012009495880465e-05, "loss": 6.3383, "step": 44400 }, { "epoch": 18.64264767490574, "grad_norm": 3.419908285140991, "learning_rate": 1.8942186845412653e-05, "loss": 6.3329, "step": 44500 }, { "epoch": 18.684541265186425, "grad_norm": 3.3396215438842773, "learning_rate": 1.887236419494484e-05, "loss": 6.312, "step": 44600 }, { "epoch": 18.726434855467115, "grad_norm": 2.6713643074035645, "learning_rate": 1.8802541544477028e-05, "loss": 6.315, "step": 44700 }, { "epoch": 18.7683284457478, "grad_norm": 3.2764880657196045, "learning_rate": 1.8733417120513895e-05, "loss": 6.3311, "step": 44800 }, { "epoch": 18.810222036028488, "grad_norm": 3.602581739425659, "learning_rate": 1.8663594470046083e-05, "loss": 6.327, "step": 44900 }, { "epoch": 18.852115626309175, "grad_norm": 3.052971124649048, "learning_rate": 1.8593771819578274e-05, "loss": 6.2911, "step": 45000 }, { "epoch": 18.89400921658986, "grad_norm": 3.0912699699401855, "learning_rate": 1.852394916911046e-05, "loss": 6.3057, "step": 45100 }, { "epoch": 18.935902806870548, "grad_norm": 2.631545305252075, "learning_rate": 1.845412651864265e-05, "loss": 6.3381, "step": 45200 }, { "epoch": 18.977796397151234, "grad_norm": 3.8213324546813965, "learning_rate": 1.8384303868174836e-05, "loss": 6.3123, "step": 45300 }, { "epoch": 19.019689987431924, "grad_norm": 3.3717353343963623, "learning_rate": 1.8314481217707024e-05, "loss": 6.3194, "step": 45400 }, { "epoch": 19.06158357771261, "grad_norm": 2.831409215927124, "learning_rate": 1.8244658567239215e-05, "loss": 6.3383, "step": 45500 }, { "epoch": 19.103477167993297, "grad_norm": 2.915093183517456, "learning_rate": 1.8174835916771403e-05, "loss": 6.3208, "step": 45600 }, { "epoch": 19.145370758273984, "grad_norm": 3.1236917972564697, "learning_rate": 1.810501326630359e-05, "loss": 6.3089, "step": 45700 }, { "epoch": 19.18726434855467, "grad_norm": 3.2876298427581787, "learning_rate": 1.8035190615835778e-05, "loss": 6.2975, "step": 45800 }, { "epoch": 19.229157938835357, "grad_norm": 2.6437103748321533, "learning_rate": 1.7965367965367965e-05, "loss": 6.3341, "step": 45900 }, { "epoch": 19.271051529116047, "grad_norm": 2.9252028465270996, "learning_rate": 1.7895545314900156e-05, "loss": 6.3404, "step": 46000 }, { "epoch": 19.312945119396733, "grad_norm": 3.4250340461730957, "learning_rate": 1.7825722664432344e-05, "loss": 6.3072, "step": 46100 }, { "epoch": 19.35483870967742, "grad_norm": 3.1287946701049805, "learning_rate": 1.775590001396453e-05, "loss": 6.3022, "step": 46200 }, { "epoch": 19.396732299958106, "grad_norm": 3.4577419757843018, "learning_rate": 1.76867755900014e-05, "loss": 6.2938, "step": 46300 }, { "epoch": 19.438625890238793, "grad_norm": 3.7131240367889404, "learning_rate": 1.7616952939533586e-05, "loss": 6.3088, "step": 46400 }, { "epoch": 19.48051948051948, "grad_norm": 3.6799802780151367, "learning_rate": 1.7547130289065774e-05, "loss": 6.3326, "step": 46500 }, { "epoch": 19.522413070800166, "grad_norm": 2.834351062774658, "learning_rate": 1.747730763859796e-05, "loss": 6.2952, "step": 46600 }, { "epoch": 19.564306661080856, "grad_norm": 3.0629451274871826, "learning_rate": 1.7407484988130152e-05, "loss": 6.3185, "step": 46700 }, { "epoch": 19.606200251361543, "grad_norm": 3.4801712036132812, "learning_rate": 1.733766233766234e-05, "loss": 6.3003, "step": 46800 }, { "epoch": 19.64809384164223, "grad_norm": 2.8250389099121094, "learning_rate": 1.7267839687194524e-05, "loss": 6.3033, "step": 46900 }, { "epoch": 19.689987431922916, "grad_norm": 3.5964672565460205, "learning_rate": 1.7198017036726715e-05, "loss": 6.293, "step": 47000 }, { "epoch": 19.731881022203602, "grad_norm": 2.7947146892547607, "learning_rate": 1.7128194386258903e-05, "loss": 6.2884, "step": 47100 }, { "epoch": 19.77377461248429, "grad_norm": 3.0473551750183105, "learning_rate": 1.7058371735791094e-05, "loss": 6.312, "step": 47200 }, { "epoch": 19.81566820276498, "grad_norm": 3.1810736656188965, "learning_rate": 1.698854908532328e-05, "loss": 6.3102, "step": 47300 }, { "epoch": 19.857561793045665, "grad_norm": 3.0046746730804443, "learning_rate": 1.6918726434855465e-05, "loss": 6.3115, "step": 47400 }, { "epoch": 19.89945538332635, "grad_norm": 2.6985220909118652, "learning_rate": 1.6848903784387656e-05, "loss": 6.3132, "step": 47500 }, { "epoch": 19.941348973607038, "grad_norm": 2.958906650543213, "learning_rate": 1.6779081133919844e-05, "loss": 6.3024, "step": 47600 }, { "epoch": 19.983242563887725, "grad_norm": 3.5484089851379395, "learning_rate": 1.6709258483452035e-05, "loss": 6.2989, "step": 47700 }, { "epoch": 20.02513615416841, "grad_norm": 4.328272342681885, "learning_rate": 1.6639435832984222e-05, "loss": 6.3162, "step": 47800 }, { "epoch": 20.067029744449098, "grad_norm": 3.0396926403045654, "learning_rate": 1.6569613182516407e-05, "loss": 6.3004, "step": 47900 }, { "epoch": 20.108923334729788, "grad_norm": 3.328972339630127, "learning_rate": 1.6499790532048598e-05, "loss": 6.2855, "step": 48000 }, { "epoch": 20.150816925010474, "grad_norm": 3.301114320755005, "learning_rate": 1.6429967881580785e-05, "loss": 6.2874, "step": 48100 }, { "epoch": 20.19271051529116, "grad_norm": 3.297041177749634, "learning_rate": 1.6360145231112976e-05, "loss": 6.3089, "step": 48200 }, { "epoch": 20.234604105571847, "grad_norm": 2.9122605323791504, "learning_rate": 1.6290322580645164e-05, "loss": 6.3157, "step": 48300 }, { "epoch": 20.276497695852534, "grad_norm": 2.8182084560394287, "learning_rate": 1.6220499930177348e-05, "loss": 6.3118, "step": 48400 }, { "epoch": 20.31839128613322, "grad_norm": 3.8560192584991455, "learning_rate": 1.615067727970954e-05, "loss": 6.2858, "step": 48500 }, { "epoch": 20.36028487641391, "grad_norm": 2.457240581512451, "learning_rate": 1.6080854629241726e-05, "loss": 6.3077, "step": 48600 }, { "epoch": 20.402178466694597, "grad_norm": 3.5376362800598145, "learning_rate": 1.6011031978773917e-05, "loss": 6.2892, "step": 48700 }, { "epoch": 20.444072056975283, "grad_norm": 3.3489222526550293, "learning_rate": 1.59412093283061e-05, "loss": 6.2973, "step": 48800 }, { "epoch": 20.48596564725597, "grad_norm": 3.600166082382202, "learning_rate": 1.587138667783829e-05, "loss": 6.31, "step": 48900 }, { "epoch": 20.527859237536656, "grad_norm": 3.255598783493042, "learning_rate": 1.580156402737048e-05, "loss": 6.2389, "step": 49000 }, { "epoch": 20.569752827817343, "grad_norm": 3.166994094848633, "learning_rate": 1.5731741376902668e-05, "loss": 6.303, "step": 49100 }, { "epoch": 20.61164641809803, "grad_norm": 3.615269184112549, "learning_rate": 1.566191872643486e-05, "loss": 6.281, "step": 49200 }, { "epoch": 20.65354000837872, "grad_norm": 3.1495063304901123, "learning_rate": 1.5592096075967043e-05, "loss": 6.2666, "step": 49300 }, { "epoch": 20.695433598659406, "grad_norm": 2.9170730113983154, "learning_rate": 1.552227342549923e-05, "loss": 6.2738, "step": 49400 }, { "epoch": 20.737327188940093, "grad_norm": 3.0922224521636963, "learning_rate": 1.545245077503142e-05, "loss": 6.2805, "step": 49500 }, { "epoch": 20.77922077922078, "grad_norm": 3.088012933731079, "learning_rate": 1.538262812456361e-05, "loss": 6.2906, "step": 49600 }, { "epoch": 20.821114369501466, "grad_norm": 2.939486503601074, "learning_rate": 1.53128054740958e-05, "loss": 6.2636, "step": 49700 }, { "epoch": 20.863007959782152, "grad_norm": 3.597949743270874, "learning_rate": 1.5242982823627986e-05, "loss": 6.2745, "step": 49800 }, { "epoch": 20.90490155006284, "grad_norm": 3.4760777950286865, "learning_rate": 1.5173160173160175e-05, "loss": 6.2702, "step": 49900 }, { "epoch": 20.94679514034353, "grad_norm": 3.04856014251709, "learning_rate": 1.5103337522692362e-05, "loss": 6.2841, "step": 50000 }, { "epoch": 20.988688730624215, "grad_norm": 2.849895477294922, "learning_rate": 1.503351487222455e-05, "loss": 6.2814, "step": 50100 }, { "epoch": 21.0305823209049, "grad_norm": 3.1246280670166016, "learning_rate": 1.496369222175674e-05, "loss": 6.2754, "step": 50200 }, { "epoch": 21.072475911185588, "grad_norm": 3.303846836090088, "learning_rate": 1.4894567797793605e-05, "loss": 6.2661, "step": 50300 }, { "epoch": 21.114369501466275, "grad_norm": 3.5818755626678467, "learning_rate": 1.4824745147325794e-05, "loss": 6.2804, "step": 50400 }, { "epoch": 21.15626309174696, "grad_norm": 3.0695786476135254, "learning_rate": 1.4754922496857982e-05, "loss": 6.284, "step": 50500 }, { "epoch": 21.19815668202765, "grad_norm": 3.6067614555358887, "learning_rate": 1.4685099846390168e-05, "loss": 6.2863, "step": 50600 }, { "epoch": 21.240050272308338, "grad_norm": 3.2230417728424072, "learning_rate": 1.4615277195922359e-05, "loss": 6.287, "step": 50700 }, { "epoch": 21.281943862589024, "grad_norm": 3.059466600418091, "learning_rate": 1.4545454545454545e-05, "loss": 6.2442, "step": 50800 }, { "epoch": 21.32383745286971, "grad_norm": 3.7770040035247803, "learning_rate": 1.4475631894986736e-05, "loss": 6.2612, "step": 50900 }, { "epoch": 21.365731043150397, "grad_norm": 3.3269879817962646, "learning_rate": 1.4405809244518923e-05, "loss": 6.2985, "step": 51000 }, { "epoch": 21.407624633431084, "grad_norm": 2.649940252304077, "learning_rate": 1.4335986594051109e-05, "loss": 6.2343, "step": 51100 }, { "epoch": 21.44951822371177, "grad_norm": 3.4042983055114746, "learning_rate": 1.42661639435833e-05, "loss": 6.2701, "step": 51200 }, { "epoch": 21.49141181399246, "grad_norm": 3.1958000659942627, "learning_rate": 1.4196341293115486e-05, "loss": 6.2866, "step": 51300 }, { "epoch": 21.533305404273147, "grad_norm": 3.6010313034057617, "learning_rate": 1.4126518642647677e-05, "loss": 6.2683, "step": 51400 }, { "epoch": 21.575198994553833, "grad_norm": 3.429414749145508, "learning_rate": 1.4056695992179864e-05, "loss": 6.2408, "step": 51500 }, { "epoch": 21.61709258483452, "grad_norm": 3.069561004638672, "learning_rate": 1.3986873341712054e-05, "loss": 6.2641, "step": 51600 }, { "epoch": 21.658986175115206, "grad_norm": 3.575247287750244, "learning_rate": 1.3917050691244241e-05, "loss": 6.2722, "step": 51700 }, { "epoch": 21.700879765395893, "grad_norm": 3.033505439758301, "learning_rate": 1.3847228040776427e-05, "loss": 6.2424, "step": 51800 }, { "epoch": 21.742773355676583, "grad_norm": 3.287740707397461, "learning_rate": 1.3777405390308618e-05, "loss": 6.2516, "step": 51900 }, { "epoch": 21.78466694595727, "grad_norm": 3.0363028049468994, "learning_rate": 1.3707582739840804e-05, "loss": 6.2641, "step": 52000 }, { "epoch": 21.826560536237956, "grad_norm": 3.1549689769744873, "learning_rate": 1.3637760089372995e-05, "loss": 6.2335, "step": 52100 }, { "epoch": 21.868454126518643, "grad_norm": 3.8512282371520996, "learning_rate": 1.3567937438905182e-05, "loss": 6.2729, "step": 52200 }, { "epoch": 21.91034771679933, "grad_norm": 4.0751824378967285, "learning_rate": 1.3498813014942047e-05, "loss": 6.2397, "step": 52300 }, { "epoch": 21.952241307080016, "grad_norm": 3.375235080718994, "learning_rate": 1.3428990364474236e-05, "loss": 6.2316, "step": 52400 }, { "epoch": 21.994134897360702, "grad_norm": 3.093156337738037, "learning_rate": 1.3359167714006423e-05, "loss": 6.2468, "step": 52500 }, { "epoch": 22.036028487641392, "grad_norm": 3.729182243347168, "learning_rate": 1.3289345063538614e-05, "loss": 6.2366, "step": 52600 }, { "epoch": 22.07792207792208, "grad_norm": 3.4075732231140137, "learning_rate": 1.32195224130708e-05, "loss": 6.2693, "step": 52700 }, { "epoch": 22.119815668202765, "grad_norm": 2.9553005695343018, "learning_rate": 1.3149699762602988e-05, "loss": 6.2592, "step": 52800 }, { "epoch": 22.16170925848345, "grad_norm": 3.094538688659668, "learning_rate": 1.3079877112135177e-05, "loss": 6.26, "step": 52900 }, { "epoch": 22.203602848764138, "grad_norm": 3.907914161682129, "learning_rate": 1.3010054461667365e-05, "loss": 6.2711, "step": 53000 }, { "epoch": 22.245496439044825, "grad_norm": 3.7182159423828125, "learning_rate": 1.2940231811199554e-05, "loss": 6.2713, "step": 53100 }, { "epoch": 22.287390029325515, "grad_norm": 2.8652303218841553, "learning_rate": 1.2870409160731741e-05, "loss": 6.2325, "step": 53200 }, { "epoch": 22.3292836196062, "grad_norm": 3.190359592437744, "learning_rate": 1.2800586510263929e-05, "loss": 6.2563, "step": 53300 }, { "epoch": 22.371177209886888, "grad_norm": 3.372394561767578, "learning_rate": 1.2730763859796118e-05, "loss": 6.2489, "step": 53400 }, { "epoch": 22.413070800167574, "grad_norm": 3.340397596359253, "learning_rate": 1.2660941209328306e-05, "loss": 6.2147, "step": 53500 }, { "epoch": 22.45496439044826, "grad_norm": 3.1127400398254395, "learning_rate": 1.2591118558860495e-05, "loss": 6.2588, "step": 53600 }, { "epoch": 22.496857980728947, "grad_norm": 4.315746307373047, "learning_rate": 1.2521295908392683e-05, "loss": 6.2641, "step": 53700 }, { "epoch": 22.538751571009634, "grad_norm": 3.204827070236206, "learning_rate": 1.2451473257924872e-05, "loss": 6.2506, "step": 53800 }, { "epoch": 22.580645161290324, "grad_norm": 3.653074026107788, "learning_rate": 1.238165060745706e-05, "loss": 6.2512, "step": 53900 }, { "epoch": 22.62253875157101, "grad_norm": 3.8693697452545166, "learning_rate": 1.2311827956989249e-05, "loss": 6.2515, "step": 54000 }, { "epoch": 22.664432341851697, "grad_norm": 3.9418985843658447, "learning_rate": 1.2242005306521436e-05, "loss": 6.2522, "step": 54100 }, { "epoch": 22.706325932132383, "grad_norm": 3.328951358795166, "learning_rate": 1.2172182656053624e-05, "loss": 6.2244, "step": 54200 }, { "epoch": 22.74821952241307, "grad_norm": 3.251552104949951, "learning_rate": 1.210305823209049e-05, "loss": 6.2413, "step": 54300 }, { "epoch": 22.790113112693756, "grad_norm": 3.0756313800811768, "learning_rate": 1.2033235581622679e-05, "loss": 6.2343, "step": 54400 }, { "epoch": 22.832006702974446, "grad_norm": 3.174830913543701, "learning_rate": 1.1963412931154867e-05, "loss": 6.2445, "step": 54500 }, { "epoch": 22.873900293255133, "grad_norm": 2.831454038619995, "learning_rate": 1.1893590280687056e-05, "loss": 6.2457, "step": 54600 }, { "epoch": 22.91579388353582, "grad_norm": 3.3783247470855713, "learning_rate": 1.1823767630219245e-05, "loss": 6.2202, "step": 54700 }, { "epoch": 22.957687473816506, "grad_norm": 3.4505226612091064, "learning_rate": 1.1753944979751431e-05, "loss": 6.2329, "step": 54800 }, { "epoch": 22.999581064097192, "grad_norm": 4.203530311584473, "learning_rate": 1.168412232928362e-05, "loss": 6.2464, "step": 54900 }, { "epoch": 23.04147465437788, "grad_norm": 3.295198678970337, "learning_rate": 1.1614299678815808e-05, "loss": 6.2163, "step": 55000 }, { "epoch": 23.083368244658566, "grad_norm": 3.6795082092285156, "learning_rate": 1.1544477028347997e-05, "loss": 6.2108, "step": 55100 }, { "epoch": 23.125261834939256, "grad_norm": 3.7577404975891113, "learning_rate": 1.1474654377880186e-05, "loss": 6.2406, "step": 55200 }, { "epoch": 23.167155425219942, "grad_norm": 4.524641036987305, "learning_rate": 1.1404831727412372e-05, "loss": 6.2449, "step": 55300 }, { "epoch": 23.20904901550063, "grad_norm": 3.3049490451812744, "learning_rate": 1.1335009076944561e-05, "loss": 6.202, "step": 55400 }, { "epoch": 23.250942605781315, "grad_norm": 3.6244115829467773, "learning_rate": 1.1265186426476749e-05, "loss": 6.2214, "step": 55500 }, { "epoch": 23.292836196062, "grad_norm": 3.1158556938171387, "learning_rate": 1.1195363776008938e-05, "loss": 6.2247, "step": 55600 }, { "epoch": 23.334729786342688, "grad_norm": 3.208771228790283, "learning_rate": 1.1125541125541126e-05, "loss": 6.2416, "step": 55700 }, { "epoch": 23.376623376623378, "grad_norm": 4.181106090545654, "learning_rate": 1.1055718475073313e-05, "loss": 6.2343, "step": 55800 }, { "epoch": 23.418516966904065, "grad_norm": 2.8972866535186768, "learning_rate": 1.0985895824605503e-05, "loss": 6.2186, "step": 55900 }, { "epoch": 23.46041055718475, "grad_norm": 3.1691384315490723, "learning_rate": 1.091607317413769e-05, "loss": 6.2328, "step": 56000 }, { "epoch": 23.502304147465438, "grad_norm": 3.214346408843994, "learning_rate": 1.084625052366988e-05, "loss": 6.2356, "step": 56100 }, { "epoch": 23.544197737746124, "grad_norm": 3.0547690391540527, "learning_rate": 1.0776427873202067e-05, "loss": 6.2245, "step": 56200 }, { "epoch": 23.58609132802681, "grad_norm": 3.6090760231018066, "learning_rate": 1.0707303449238935e-05, "loss": 6.2634, "step": 56300 }, { "epoch": 23.627984918307497, "grad_norm": 3.210068702697754, "learning_rate": 1.0637480798771122e-05, "loss": 6.2126, "step": 56400 }, { "epoch": 23.669878508588187, "grad_norm": 3.872507095336914, "learning_rate": 1.056765814830331e-05, "loss": 6.2286, "step": 56500 }, { "epoch": 23.711772098868874, "grad_norm": 4.503695011138916, "learning_rate": 1.0497835497835499e-05, "loss": 6.2156, "step": 56600 }, { "epoch": 23.75366568914956, "grad_norm": 3.963315486907959, "learning_rate": 1.0428012847367686e-05, "loss": 6.2247, "step": 56700 }, { "epoch": 23.795559279430247, "grad_norm": 3.4394917488098145, "learning_rate": 1.0358190196899876e-05, "loss": 6.234, "step": 56800 }, { "epoch": 23.837452869710933, "grad_norm": 3.403167724609375, "learning_rate": 1.0288367546432063e-05, "loss": 6.2045, "step": 56900 }, { "epoch": 23.87934645999162, "grad_norm": 2.8274378776550293, "learning_rate": 1.0218544895964251e-05, "loss": 6.2121, "step": 57000 }, { "epoch": 23.92124005027231, "grad_norm": 3.277188301086426, "learning_rate": 1.0148722245496438e-05, "loss": 6.222, "step": 57100 }, { "epoch": 23.963133640552996, "grad_norm": 3.0735063552856445, "learning_rate": 1.0078899595028628e-05, "loss": 6.2257, "step": 57200 }, { "epoch": 24.005027230833683, "grad_norm": 3.6680026054382324, "learning_rate": 1.0009076944560817e-05, "loss": 6.2131, "step": 57300 }, { "epoch": 24.04692082111437, "grad_norm": 3.134713888168335, "learning_rate": 9.939254294093005e-06, "loss": 6.2241, "step": 57400 }, { "epoch": 24.088814411395056, "grad_norm": 2.9466712474823, "learning_rate": 9.869431643625192e-06, "loss": 6.2158, "step": 57500 }, { "epoch": 24.130708001675742, "grad_norm": 3.468949794769287, "learning_rate": 9.79960899315738e-06, "loss": 6.1793, "step": 57600 }, { "epoch": 24.17260159195643, "grad_norm": 3.5487060546875, "learning_rate": 9.729786342689569e-06, "loss": 6.2218, "step": 57700 }, { "epoch": 24.21449518223712, "grad_norm": 4.345893383026123, "learning_rate": 9.659963692221758e-06, "loss": 6.2023, "step": 57800 }, { "epoch": 24.256388772517806, "grad_norm": 2.9016401767730713, "learning_rate": 9.590141041753946e-06, "loss": 6.23, "step": 57900 }, { "epoch": 24.298282362798492, "grad_norm": 4.17023229598999, "learning_rate": 9.520318391286135e-06, "loss": 6.2114, "step": 58000 }, { "epoch": 24.34017595307918, "grad_norm": 3.322115421295166, "learning_rate": 9.45049574081832e-06, "loss": 6.204, "step": 58100 }, { "epoch": 24.382069543359865, "grad_norm": 3.709805488586426, "learning_rate": 9.38067309035051e-06, "loss": 6.2087, "step": 58200 }, { "epoch": 24.42396313364055, "grad_norm": 3.225588798522949, "learning_rate": 9.311548666387376e-06, "loss": 6.2436, "step": 58300 }, { "epoch": 24.46585672392124, "grad_norm": 3.1229472160339355, "learning_rate": 9.241726015919565e-06, "loss": 6.2253, "step": 58400 }, { "epoch": 24.507750314201928, "grad_norm": 3.4445230960845947, "learning_rate": 9.171903365451753e-06, "loss": 6.2254, "step": 58500 }, { "epoch": 24.549643904482615, "grad_norm": 4.2796807289123535, "learning_rate": 9.102080714983942e-06, "loss": 6.221, "step": 58600 }, { "epoch": 24.5915374947633, "grad_norm": 3.2323966026306152, "learning_rate": 9.03225806451613e-06, "loss": 6.228, "step": 58700 }, { "epoch": 24.633431085043988, "grad_norm": 4.064596652984619, "learning_rate": 8.962435414048317e-06, "loss": 6.2363, "step": 58800 }, { "epoch": 24.675324675324674, "grad_norm": 3.068544864654541, "learning_rate": 8.893310990085183e-06, "loss": 6.2508, "step": 58900 }, { "epoch": 24.71721826560536, "grad_norm": 2.6201155185699463, "learning_rate": 8.823488339617372e-06, "loss": 6.2193, "step": 59000 }, { "epoch": 24.75911185588605, "grad_norm": 4.960629463195801, "learning_rate": 8.753665689149562e-06, "loss": 6.1999, "step": 59100 }, { "epoch": 24.801005446166737, "grad_norm": 3.191586971282959, "learning_rate": 8.683843038681749e-06, "loss": 6.2203, "step": 59200 }, { "epoch": 24.842899036447424, "grad_norm": 3.224745512008667, "learning_rate": 8.614020388213937e-06, "loss": 6.212, "step": 59300 }, { "epoch": 24.88479262672811, "grad_norm": 3.450741767883301, "learning_rate": 8.544197737746124e-06, "loss": 6.2386, "step": 59400 }, { "epoch": 24.926686217008797, "grad_norm": 4.297729969024658, "learning_rate": 8.474375087278313e-06, "loss": 6.2088, "step": 59500 }, { "epoch": 24.968579807289483, "grad_norm": 3.376110553741455, "learning_rate": 8.404552436810503e-06, "loss": 6.2176, "step": 59600 }, { "epoch": 25.010473397570173, "grad_norm": 3.0211358070373535, "learning_rate": 8.33472978634269e-06, "loss": 6.1906, "step": 59700 }, { "epoch": 25.05236698785086, "grad_norm": 2.8490803241729736, "learning_rate": 8.264907135874878e-06, "loss": 6.2, "step": 59800 }, { "epoch": 25.094260578131546, "grad_norm": 3.0233705043792725, "learning_rate": 8.195084485407065e-06, "loss": 6.1886, "step": 59900 }, { "epoch": 25.136154168412233, "grad_norm": 3.7582995891571045, "learning_rate": 8.125261834939255e-06, "loss": 6.2064, "step": 60000 }, { "epoch": 25.17804775869292, "grad_norm": 3.128079891204834, "learning_rate": 8.055439184471442e-06, "loss": 6.2264, "step": 60100 }, { "epoch": 25.219941348973606, "grad_norm": 3.1808972358703613, "learning_rate": 7.985616534003632e-06, "loss": 6.2149, "step": 60200 }, { "epoch": 25.261834939254292, "grad_norm": 3.2326996326446533, "learning_rate": 7.91579388353582e-06, "loss": 6.2142, "step": 60300 }, { "epoch": 25.303728529534983, "grad_norm": 3.267465114593506, "learning_rate": 7.845971233068007e-06, "loss": 6.2439, "step": 60400 }, { "epoch": 25.34562211981567, "grad_norm": 3.691075563430786, "learning_rate": 7.776148582600196e-06, "loss": 6.2178, "step": 60500 }, { "epoch": 25.387515710096356, "grad_norm": 3.290562152862549, "learning_rate": 7.706325932132383e-06, "loss": 6.2165, "step": 60600 }, { "epoch": 25.429409300377042, "grad_norm": 4.553886413574219, "learning_rate": 7.636503281664573e-06, "loss": 6.2165, "step": 60700 }, { "epoch": 25.47130289065773, "grad_norm": 4.013444423675537, "learning_rate": 7.566680631196761e-06, "loss": 6.2122, "step": 60800 }, { "epoch": 25.513196480938415, "grad_norm": 4.044810771942139, "learning_rate": 7.496857980728948e-06, "loss": 6.2533, "step": 60900 }, { "epoch": 25.555090071219105, "grad_norm": 3.788613796234131, "learning_rate": 7.427035330261137e-06, "loss": 6.2039, "step": 61000 }, { "epoch": 25.59698366149979, "grad_norm": 3.317281484603882, "learning_rate": 7.3572126797933255e-06, "loss": 6.2228, "step": 61100 }, { "epoch": 25.638877251780478, "grad_norm": 3.4238085746765137, "learning_rate": 7.287390029325514e-06, "loss": 6.1979, "step": 61200 }, { "epoch": 25.680770842061165, "grad_norm": 3.1558725833892822, "learning_rate": 7.217567378857702e-06, "loss": 6.2044, "step": 61300 }, { "epoch": 25.72266443234185, "grad_norm": 2.939328670501709, "learning_rate": 7.147744728389889e-06, "loss": 6.2312, "step": 61400 }, { "epoch": 25.764558022622538, "grad_norm": 4.0037455558776855, "learning_rate": 7.0779220779220775e-06, "loss": 6.228, "step": 61500 }, { "epoch": 25.806451612903224, "grad_norm": 4.4582343101501465, "learning_rate": 7.008099427454267e-06, "loss": 6.2132, "step": 61600 }, { "epoch": 25.848345203183914, "grad_norm": 3.006201982498169, "learning_rate": 6.938276776986455e-06, "loss": 6.2242, "step": 61700 }, { "epoch": 25.8902387934646, "grad_norm": 3.6898059844970703, "learning_rate": 6.8684541265186436e-06, "loss": 6.2134, "step": 61800 }, { "epoch": 25.932132383745287, "grad_norm": 3.3489785194396973, "learning_rate": 6.798631476050832e-06, "loss": 6.2042, "step": 61900 }, { "epoch": 25.974025974025974, "grad_norm": 3.2489922046661377, "learning_rate": 6.729507052087698e-06, "loss": 6.2212, "step": 62000 }, { "epoch": 26.01591956430666, "grad_norm": 4.022356033325195, "learning_rate": 6.659684401619885e-06, "loss": 6.2423, "step": 62100 }, { "epoch": 26.057813154587347, "grad_norm": 4.803937911987305, "learning_rate": 6.589861751152074e-06, "loss": 6.2319, "step": 62200 }, { "epoch": 26.099706744868037, "grad_norm": 3.7283337116241455, "learning_rate": 6.520039100684262e-06, "loss": 6.1924, "step": 62300 }, { "epoch": 26.141600335148723, "grad_norm": 3.817946672439575, "learning_rate": 6.450216450216451e-06, "loss": 6.2039, "step": 62400 }, { "epoch": 26.18349392542941, "grad_norm": 3.4621963500976562, "learning_rate": 6.380393799748639e-06, "loss": 6.214, "step": 62500 }, { "epoch": 26.225387515710096, "grad_norm": 4.458475112915039, "learning_rate": 6.310571149280827e-06, "loss": 6.2327, "step": 62600 }, { "epoch": 26.267281105990783, "grad_norm": 3.1324493885040283, "learning_rate": 6.240748498813015e-06, "loss": 6.2518, "step": 62700 }, { "epoch": 26.30917469627147, "grad_norm": 3.410626173019409, "learning_rate": 6.1709258483452034e-06, "loss": 6.2054, "step": 62800 }, { "epoch": 26.351068286552156, "grad_norm": 3.221602201461792, "learning_rate": 6.101103197877392e-06, "loss": 6.2297, "step": 62900 }, { "epoch": 26.392961876832846, "grad_norm": 3.1413893699645996, "learning_rate": 6.031280547409579e-06, "loss": 6.2134, "step": 63000 }, { "epoch": 26.434855467113533, "grad_norm": 3.3834433555603027, "learning_rate": 5.961457896941768e-06, "loss": 6.167, "step": 63100 }, { "epoch": 26.47674905739422, "grad_norm": 3.016921281814575, "learning_rate": 5.891635246473957e-06, "loss": 6.2181, "step": 63200 }, { "epoch": 26.518642647674906, "grad_norm": 3.4190244674682617, "learning_rate": 5.821812596006145e-06, "loss": 6.172, "step": 63300 }, { "epoch": 26.560536237955592, "grad_norm": 3.519742488861084, "learning_rate": 5.751989945538333e-06, "loss": 6.2144, "step": 63400 }, { "epoch": 26.60242982823628, "grad_norm": 3.083923101425171, "learning_rate": 5.682167295070521e-06, "loss": 6.204, "step": 63500 }, { "epoch": 26.64432341851697, "grad_norm": 3.8977878093719482, "learning_rate": 5.612344644602709e-06, "loss": 6.1759, "step": 63600 }, { "epoch": 26.686217008797655, "grad_norm": 3.5598249435424805, "learning_rate": 5.5425219941348974e-06, "loss": 6.2233, "step": 63700 }, { "epoch": 26.72811059907834, "grad_norm": 3.6333513259887695, "learning_rate": 5.472699343667086e-06, "loss": 6.2133, "step": 63800 }, { "epoch": 26.770004189359028, "grad_norm": 3.2468085289001465, "learning_rate": 5.402876693199274e-06, "loss": 6.2081, "step": 63900 }, { "epoch": 26.811897779639715, "grad_norm": 3.6896772384643555, "learning_rate": 5.333054042731463e-06, "loss": 6.1935, "step": 64000 }, { "epoch": 26.8537913699204, "grad_norm": 3.263144016265869, "learning_rate": 5.26323139226365e-06, "loss": 6.2127, "step": 64100 }, { "epoch": 26.895684960201088, "grad_norm": 3.2848362922668457, "learning_rate": 5.193408741795839e-06, "loss": 6.2074, "step": 64200 }, { "epoch": 26.937578550481778, "grad_norm": 3.675541639328003, "learning_rate": 5.123586091328027e-06, "loss": 6.2015, "step": 64300 }, { "epoch": 26.979472140762464, "grad_norm": 3.413780689239502, "learning_rate": 5.0537634408602155e-06, "loss": 6.2218, "step": 64400 }, { "epoch": 27.02136573104315, "grad_norm": 4.108157634735107, "learning_rate": 4.983940790392404e-06, "loss": 6.212, "step": 64500 }, { "epoch": 27.063259321323837, "grad_norm": 3.7690155506134033, "learning_rate": 4.9141181399245915e-06, "loss": 6.22, "step": 64600 }, { "epoch": 27.105152911604524, "grad_norm": 3.379786491394043, "learning_rate": 4.84429548945678e-06, "loss": 6.2334, "step": 64700 }, { "epoch": 27.14704650188521, "grad_norm": 3.5175390243530273, "learning_rate": 4.774472838988968e-06, "loss": 6.1932, "step": 64800 }, { "epoch": 27.1889400921659, "grad_norm": 2.8454129695892334, "learning_rate": 4.704650188521157e-06, "loss": 6.2087, "step": 64900 }, { "epoch": 27.230833682446587, "grad_norm": 3.4630961418151855, "learning_rate": 4.634827538053345e-06, "loss": 6.2142, "step": 65000 }, { "epoch": 27.272727272727273, "grad_norm": 3.339860677719116, "learning_rate": 4.565004887585533e-06, "loss": 6.1772, "step": 65100 }, { "epoch": 27.31462086300796, "grad_norm": 3.0743260383605957, "learning_rate": 4.495182237117721e-06, "loss": 6.2044, "step": 65200 }, { "epoch": 27.356514453288646, "grad_norm": 3.2576496601104736, "learning_rate": 4.4253595866499095e-06, "loss": 6.2191, "step": 65300 }, { "epoch": 27.398408043569333, "grad_norm": 3.326819896697998, "learning_rate": 4.355536936182097e-06, "loss": 6.1762, "step": 65400 }, { "epoch": 27.44030163385002, "grad_norm": 3.4447667598724365, "learning_rate": 4.285714285714286e-06, "loss": 6.1823, "step": 65500 }, { "epoch": 27.48219522413071, "grad_norm": 3.4771687984466553, "learning_rate": 4.215891635246475e-06, "loss": 6.228, "step": 65600 }, { "epoch": 27.524088814411396, "grad_norm": 3.3457424640655518, "learning_rate": 4.146068984778662e-06, "loss": 6.1651, "step": 65700 }, { "epoch": 27.565982404692082, "grad_norm": 3.006155490875244, "learning_rate": 4.076246334310851e-06, "loss": 6.2026, "step": 65800 }, { "epoch": 27.60787599497277, "grad_norm": 4.228708744049072, "learning_rate": 4.006423683843038e-06, "loss": 6.1923, "step": 65900 }, { "epoch": 27.649769585253456, "grad_norm": 3.4744226932525635, "learning_rate": 3.937299259879905e-06, "loss": 6.1891, "step": 66000 }, { "epoch": 27.691663175534142, "grad_norm": 3.8300633430480957, "learning_rate": 3.867476609412093e-06, "loss": 6.2237, "step": 66100 }, { "epoch": 27.733556765814832, "grad_norm": 2.9689528942108154, "learning_rate": 3.7976539589442818e-06, "loss": 6.217, "step": 66200 }, { "epoch": 27.77545035609552, "grad_norm": 3.1309947967529297, "learning_rate": 3.7278313084764698e-06, "loss": 6.2061, "step": 66300 }, { "epoch": 27.817343946376205, "grad_norm": 3.4571166038513184, "learning_rate": 3.658008658008658e-06, "loss": 6.1863, "step": 66400 }, { "epoch": 27.85923753665689, "grad_norm": 3.354229211807251, "learning_rate": 3.5881860075408466e-06, "loss": 6.1996, "step": 66500 }, { "epoch": 27.901131126937578, "grad_norm": 3.745568037033081, "learning_rate": 3.5183633570730346e-06, "loss": 6.1839, "step": 66600 }, { "epoch": 27.943024717218265, "grad_norm": 3.356715440750122, "learning_rate": 3.448540706605223e-06, "loss": 6.2048, "step": 66700 }, { "epoch": 27.98491830749895, "grad_norm": 2.964492082595825, "learning_rate": 3.378718056137411e-06, "loss": 6.228, "step": 66800 }, { "epoch": 28.02681189777964, "grad_norm": 3.336606502532959, "learning_rate": 3.3088954056695994e-06, "loss": 6.1953, "step": 66900 }, { "epoch": 28.068705488060328, "grad_norm": 3.264971971511841, "learning_rate": 3.239072755201788e-06, "loss": 6.1783, "step": 67000 }, { "epoch": 28.110599078341014, "grad_norm": 3.4968082904815674, "learning_rate": 3.169250104733976e-06, "loss": 6.2117, "step": 67100 }, { "epoch": 28.1524926686217, "grad_norm": 3.4082252979278564, "learning_rate": 3.099427454266164e-06, "loss": 6.2278, "step": 67200 }, { "epoch": 28.194386258902387, "grad_norm": 3.52056884765625, "learning_rate": 3.029604803798352e-06, "loss": 6.2037, "step": 67300 }, { "epoch": 28.236279849183074, "grad_norm": 3.6062779426574707, "learning_rate": 2.9597821533305406e-06, "loss": 6.1952, "step": 67400 }, { "epoch": 28.278173439463764, "grad_norm": 3.158705472946167, "learning_rate": 2.889959502862729e-06, "loss": 6.2067, "step": 67500 }, { "epoch": 28.32006702974445, "grad_norm": 3.6732075214385986, "learning_rate": 2.820136852394917e-06, "loss": 6.1752, "step": 67600 }, { "epoch": 28.361960620025137, "grad_norm": 2.842560291290283, "learning_rate": 2.7503142019271054e-06, "loss": 6.1823, "step": 67700 }, { "epoch": 28.403854210305823, "grad_norm": 3.412233591079712, "learning_rate": 2.6804915514592934e-06, "loss": 6.1997, "step": 67800 }, { "epoch": 28.44574780058651, "grad_norm": 2.8313143253326416, "learning_rate": 2.610668900991482e-06, "loss": 6.187, "step": 67900 }, { "epoch": 28.487641390867196, "grad_norm": 3.122307300567627, "learning_rate": 2.541544477028348e-06, "loss": 6.1977, "step": 68000 }, { "epoch": 28.529534981147883, "grad_norm": 3.4732697010040283, "learning_rate": 2.4717218265605365e-06, "loss": 6.1913, "step": 68100 }, { "epoch": 28.571428571428573, "grad_norm": 3.3936917781829834, "learning_rate": 2.4018991760927245e-06, "loss": 6.2216, "step": 68200 }, { "epoch": 28.61332216170926, "grad_norm": 3.2980170249938965, "learning_rate": 2.332076525624913e-06, "loss": 6.1989, "step": 68300 }, { "epoch": 28.655215751989946, "grad_norm": 4.099823951721191, "learning_rate": 2.2622538751571013e-06, "loss": 6.1755, "step": 68400 }, { "epoch": 28.697109342270632, "grad_norm": 3.7930960655212402, "learning_rate": 2.1924312246892893e-06, "loss": 6.2155, "step": 68500 }, { "epoch": 28.73900293255132, "grad_norm": 3.620065212249756, "learning_rate": 2.1226085742214777e-06, "loss": 6.2041, "step": 68600 }, { "epoch": 28.780896522832006, "grad_norm": 3.3451735973358154, "learning_rate": 2.0527859237536657e-06, "loss": 6.2095, "step": 68700 }, { "epoch": 28.822790113112696, "grad_norm": 4.007857799530029, "learning_rate": 1.982963273285854e-06, "loss": 6.2283, "step": 68800 }, { "epoch": 28.864683703393382, "grad_norm": 4.236888885498047, "learning_rate": 1.9131406228180425e-06, "loss": 6.1799, "step": 68900 }, { "epoch": 28.90657729367407, "grad_norm": 3.222273111343384, "learning_rate": 1.8433179723502305e-06, "loss": 6.2008, "step": 69000 }, { "epoch": 28.948470883954755, "grad_norm": 3.8649580478668213, "learning_rate": 1.7734953218824187e-06, "loss": 6.194, "step": 69100 }, { "epoch": 28.99036447423544, "grad_norm": 3.9630191326141357, "learning_rate": 1.7036726714146071e-06, "loss": 6.2083, "step": 69200 }, { "epoch": 29.032258064516128, "grad_norm": 3.9617035388946533, "learning_rate": 1.6338500209467953e-06, "loss": 6.2197, "step": 69300 }, { "epoch": 29.074151654796815, "grad_norm": 3.4647514820098877, "learning_rate": 1.5640273704789835e-06, "loss": 6.1919, "step": 69400 }, { "epoch": 29.116045245077505, "grad_norm": 3.7548468112945557, "learning_rate": 1.4942047200111717e-06, "loss": 6.171, "step": 69500 }, { "epoch": 29.15793883535819, "grad_norm": 3.4267735481262207, "learning_rate": 1.42438206954336e-06, "loss": 6.2031, "step": 69600 }, { "epoch": 29.199832425638878, "grad_norm": 3.166888952255249, "learning_rate": 1.3545594190755483e-06, "loss": 6.1937, "step": 69700 }, { "epoch": 29.241726015919564, "grad_norm": 2.9794344902038574, "learning_rate": 1.2847367686077363e-06, "loss": 6.2068, "step": 69800 }, { "epoch": 29.28361960620025, "grad_norm": 3.056293249130249, "learning_rate": 1.2149141181399247e-06, "loss": 6.1652, "step": 69900 }, { "epoch": 29.325513196480937, "grad_norm": 3.851149320602417, "learning_rate": 1.145789694176791e-06, "loss": 6.2129, "step": 70000 }, { "epoch": 29.367406786761624, "grad_norm": 3.670929193496704, "learning_rate": 1.0759670437089792e-06, "loss": 6.2068, "step": 70100 }, { "epoch": 29.409300377042314, "grad_norm": 3.3581252098083496, "learning_rate": 1.0061443932411674e-06, "loss": 6.2208, "step": 70200 }, { "epoch": 29.451193967323, "grad_norm": 3.7551257610321045, "learning_rate": 9.363217427733557e-07, "loss": 6.179, "step": 70300 }, { "epoch": 29.493087557603687, "grad_norm": 2.9767682552337646, "learning_rate": 8.664990923055439e-07, "loss": 6.2157, "step": 70400 }, { "epoch": 29.534981147884373, "grad_norm": 3.3218774795532227, "learning_rate": 7.966764418377322e-07, "loss": 6.1773, "step": 70500 }, { "epoch": 29.57687473816506, "grad_norm": 4.360437870025635, "learning_rate": 7.268537913699204e-07, "loss": 6.1985, "step": 70600 }, { "epoch": 29.618768328445746, "grad_norm": 3.544264078140259, "learning_rate": 6.570311409021087e-07, "loss": 6.1973, "step": 70700 }, { "epoch": 29.660661918726436, "grad_norm": 3.7416069507598877, "learning_rate": 5.872084904342969e-07, "loss": 6.2031, "step": 70800 }, { "epoch": 29.702555509007123, "grad_norm": 3.0346035957336426, "learning_rate": 5.173858399664851e-07, "loss": 6.2123, "step": 70900 }, { "epoch": 29.74444909928781, "grad_norm": 3.2308425903320312, "learning_rate": 4.4756318949867344e-07, "loss": 6.2106, "step": 71000 }, { "epoch": 29.786342689568496, "grad_norm": 3.0109570026397705, "learning_rate": 3.7774053903086163e-07, "loss": 6.213, "step": 71100 }, { "epoch": 29.828236279849182, "grad_norm": 3.733609199523926, "learning_rate": 3.079178885630499e-07, "loss": 6.1984, "step": 71200 }, { "epoch": 29.87012987012987, "grad_norm": 3.5430541038513184, "learning_rate": 2.3809523809523814e-07, "loss": 6.183, "step": 71300 }, { "epoch": 29.912023460410555, "grad_norm": 3.1964950561523438, "learning_rate": 1.6827258762742634e-07, "loss": 6.1817, "step": 71400 }, { "epoch": 29.953917050691246, "grad_norm": 3.6197755336761475, "learning_rate": 9.844993715961458e-08, "loss": 6.1907, "step": 71500 }, { "epoch": 29.995810640971932, "grad_norm": 3.035473346710205, "learning_rate": 2.862728669180282e-08, "loss": 6.1677, "step": 71600 }, { "epoch": 30.0, "step": 71610, "total_flos": 781486986700800.0, "train_loss": 6.528281962779993, "train_runtime": 3948.1737, "train_samples_per_second": 580.354, "train_steps_per_second": 18.137 } ], "logging_steps": 100, "max_steps": 71610, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 781486986700800.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }