|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9993853718500307, |
|
"eval_steps": 204, |
|
"global_step": 813, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001229256299938537, |
|
"grad_norm": 0.19411148130893707, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1612, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001229256299938537, |
|
"eval_loss": 2.1468453407287598, |
|
"eval_runtime": 66.4837, |
|
"eval_samples_per_second": 10.303, |
|
"eval_steps_per_second": 5.159, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002458512599877074, |
|
"grad_norm": 0.2264145463705063, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.4401, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0036877688998156115, |
|
"grad_norm": 0.2364473193883896, |
|
"learning_rate": 6e-06, |
|
"loss": 1.4676, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.004917025199754148, |
|
"grad_norm": 0.24018821120262146, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.3851, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.006146281499692686, |
|
"grad_norm": 0.23238497972488403, |
|
"learning_rate": 1e-05, |
|
"loss": 1.213, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.007375537799631223, |
|
"grad_norm": 0.24634625017642975, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.2627, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.008604794099569761, |
|
"grad_norm": 0.26495596766471863, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.3908, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.009834050399508297, |
|
"grad_norm": 0.2719455361366272, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.3814, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.011063306699446834, |
|
"grad_norm": 0.26454323530197144, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.2438, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.012292562999385371, |
|
"grad_norm": 0.3004608750343323, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3694, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013521819299323909, |
|
"grad_norm": 0.3035408854484558, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.4792, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.014751075599262446, |
|
"grad_norm": 0.4270775020122528, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.1673, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.015980331899200985, |
|
"grad_norm": 0.4388391971588135, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.5171, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.017209588199139522, |
|
"grad_norm": 0.7133700847625732, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.0732, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01843884449907806, |
|
"grad_norm": 1.026840329170227, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1705, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.019668100799016593, |
|
"grad_norm": 0.7934454679489136, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.3509, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02089735709895513, |
|
"grad_norm": 0.8138520121574402, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.181, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.022126613398893668, |
|
"grad_norm": 1.7830528020858765, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.1836, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.023355869698832205, |
|
"grad_norm": 10.527496337890625, |
|
"learning_rate": 3.8e-05, |
|
"loss": 3.3749, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.024585125998770743, |
|
"grad_norm": 6.364173889160156, |
|
"learning_rate": 4e-05, |
|
"loss": 2.9191, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02581438229870928, |
|
"grad_norm": 7.087876796722412, |
|
"learning_rate": 4.2e-05, |
|
"loss": 3.0788, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.027043638598647817, |
|
"grad_norm": 5.370169639587402, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 2.7809, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.028272894898586354, |
|
"grad_norm": 4.118806838989258, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 2.7475, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02950215119852489, |
|
"grad_norm": 4.46057653427124, |
|
"learning_rate": 4.8e-05, |
|
"loss": 2.6906, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03073140749846343, |
|
"grad_norm": 3.8601913452148438, |
|
"learning_rate": 5e-05, |
|
"loss": 2.652, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03196066379840197, |
|
"grad_norm": 0.19972144067287445, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.1845, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03318992009834051, |
|
"grad_norm": 0.21230019629001617, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.2875, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.034419176398279044, |
|
"grad_norm": 0.22694356739521027, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.1829, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03564843269821758, |
|
"grad_norm": 0.2587474584579468, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.3909, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03687768899815612, |
|
"grad_norm": 0.24409259855747223, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1671, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03810694529809465, |
|
"grad_norm": 0.26323097944259644, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.2055, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.03933620159803319, |
|
"grad_norm": 0.2842409908771515, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.3051, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.040565457897971724, |
|
"grad_norm": 0.32476744055747986, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.0828, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04179471419791026, |
|
"grad_norm": 0.32893380522727966, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.3896, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0430239704978488, |
|
"grad_norm": 0.3359004855155945, |
|
"learning_rate": 7e-05, |
|
"loss": 1.3713, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.044253226797787336, |
|
"grad_norm": 0.5471646189689636, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.2572, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04548248309772587, |
|
"grad_norm": 0.5404387712478638, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.2321, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04671173939766441, |
|
"grad_norm": 1.0199828147888184, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8434, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04794099569760295, |
|
"grad_norm": 1.5890088081359863, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.7781, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.049170251997541485, |
|
"grad_norm": 0.7897126078605652, |
|
"learning_rate": 8e-05, |
|
"loss": 1.1982, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05039950829748002, |
|
"grad_norm": 0.7874982953071594, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.3432, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05162876459741856, |
|
"grad_norm": 1.3902230262756348, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.7817, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0528580208973571, |
|
"grad_norm": 4.006369590759277, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.8832, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.054087277197295634, |
|
"grad_norm": 7.2500996589660645, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.3256, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05531653349723417, |
|
"grad_norm": 5.088122844696045, |
|
"learning_rate": 9e-05, |
|
"loss": 1.4272, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05654578979717271, |
|
"grad_norm": 2.9680252075195312, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.8063, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.057775046097111246, |
|
"grad_norm": 3.4886820316314697, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.7842, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05900430239704978, |
|
"grad_norm": 2.635120153427124, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.7775, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06023355869698832, |
|
"grad_norm": 2.7715940475463867, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.6795, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.06146281499692686, |
|
"grad_norm": 4.598182678222656, |
|
"learning_rate": 0.0001, |
|
"loss": 2.0141, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0626920712968654, |
|
"grad_norm": 0.4595154821872711, |
|
"learning_rate": 9.999957617159031e-05, |
|
"loss": 1.1302, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06392132759680394, |
|
"grad_norm": 0.3996050953865051, |
|
"learning_rate": 9.999830469354645e-05, |
|
"loss": 1.3499, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06515058389674247, |
|
"grad_norm": 0.4248620867729187, |
|
"learning_rate": 9.999618558742398e-05, |
|
"loss": 1.4393, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06637984019668101, |
|
"grad_norm": 0.37063130736351013, |
|
"learning_rate": 9.999321888914836e-05, |
|
"loss": 1.4761, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06760909649661954, |
|
"grad_norm": 0.3327302038669586, |
|
"learning_rate": 9.998940464901447e-05, |
|
"loss": 1.1365, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06883835279655809, |
|
"grad_norm": 0.3424387276172638, |
|
"learning_rate": 9.998474293168562e-05, |
|
"loss": 1.2037, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07006760909649662, |
|
"grad_norm": 0.34453633427619934, |
|
"learning_rate": 9.997923381619256e-05, |
|
"loss": 0.9586, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07129686539643516, |
|
"grad_norm": 0.3327544033527374, |
|
"learning_rate": 9.997287739593206e-05, |
|
"loss": 1.3026, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0725261216963737, |
|
"grad_norm": 0.33542299270629883, |
|
"learning_rate": 9.996567377866537e-05, |
|
"loss": 1.1601, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.07375537799631224, |
|
"grad_norm": 0.5743572115898132, |
|
"learning_rate": 9.99576230865164e-05, |
|
"loss": 1.3892, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07498463429625077, |
|
"grad_norm": 0.4624180495738983, |
|
"learning_rate": 9.994872545596966e-05, |
|
"loss": 1.2519, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0762138905961893, |
|
"grad_norm": 0.6259918808937073, |
|
"learning_rate": 9.993898103786786e-05, |
|
"loss": 1.315, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07744314689612784, |
|
"grad_norm": 0.6186118125915527, |
|
"learning_rate": 9.992838999740947e-05, |
|
"loss": 0.877, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.07867240319606637, |
|
"grad_norm": 0.6788893342018127, |
|
"learning_rate": 9.991695251414583e-05, |
|
"loss": 0.886, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.07990165949600492, |
|
"grad_norm": 0.7688488960266113, |
|
"learning_rate": 9.990466878197817e-05, |
|
"loss": 0.7427, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08113091579594345, |
|
"grad_norm": 0.6739158630371094, |
|
"learning_rate": 9.989153900915427e-05, |
|
"loss": 1.091, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08236017209588199, |
|
"grad_norm": 1.0515763759613037, |
|
"learning_rate": 9.987756341826493e-05, |
|
"loss": 1.4195, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.08358942839582052, |
|
"grad_norm": 2.324380397796631, |
|
"learning_rate": 9.98627422462403e-05, |
|
"loss": 1.8108, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08481868469575907, |
|
"grad_norm": 4.131134510040283, |
|
"learning_rate": 9.98470757443457e-05, |
|
"loss": 1.2769, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0860479409956976, |
|
"grad_norm": 6.158152103424072, |
|
"learning_rate": 9.983056417817747e-05, |
|
"loss": 1.609, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08727719729563614, |
|
"grad_norm": 2.710057020187378, |
|
"learning_rate": 9.981320782765846e-05, |
|
"loss": 1.6382, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08850645359557467, |
|
"grad_norm": 2.729590654373169, |
|
"learning_rate": 9.979500698703323e-05, |
|
"loss": 1.8179, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.08973570989551322, |
|
"grad_norm": 2.1861114501953125, |
|
"learning_rate": 9.977596196486314e-05, |
|
"loss": 1.7416, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.09096496619545175, |
|
"grad_norm": 2.614532947540283, |
|
"learning_rate": 9.975607308402101e-05, |
|
"loss": 1.8413, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.09219422249539029, |
|
"grad_norm": 3.3295183181762695, |
|
"learning_rate": 9.973534068168579e-05, |
|
"loss": 2.1946, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09342347879532882, |
|
"grad_norm": 0.3009834885597229, |
|
"learning_rate": 9.97137651093367e-05, |
|
"loss": 1.1058, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.09465273509526737, |
|
"grad_norm": 0.2889084815979004, |
|
"learning_rate": 9.969134673274738e-05, |
|
"loss": 1.0812, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0958819913952059, |
|
"grad_norm": 0.26639047265052795, |
|
"learning_rate": 9.966808593197959e-05, |
|
"loss": 1.2787, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.09711124769514444, |
|
"grad_norm": 0.2839871048927307, |
|
"learning_rate": 9.964398310137688e-05, |
|
"loss": 1.2314, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.09834050399508297, |
|
"grad_norm": 0.29856863617897034, |
|
"learning_rate": 9.961903864955783e-05, |
|
"loss": 1.1781, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09956976029502151, |
|
"grad_norm": 0.3113296329975128, |
|
"learning_rate": 9.959325299940914e-05, |
|
"loss": 1.1297, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.10079901659496004, |
|
"grad_norm": 0.3259466290473938, |
|
"learning_rate": 9.956662658807842e-05, |
|
"loss": 1.3892, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.10202827289489859, |
|
"grad_norm": 0.3366626501083374, |
|
"learning_rate": 9.95391598669669e-05, |
|
"loss": 1.1833, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.10325752919483712, |
|
"grad_norm": 0.3032483458518982, |
|
"learning_rate": 9.95108533017216e-05, |
|
"loss": 1.1729, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.10448678549477566, |
|
"grad_norm": 0.4028280973434448, |
|
"learning_rate": 9.948170737222762e-05, |
|
"loss": 1.1019, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1057160417947142, |
|
"grad_norm": 0.3796052932739258, |
|
"learning_rate": 9.945172257259986e-05, |
|
"loss": 1.3822, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.10694529809465274, |
|
"grad_norm": 0.3956368565559387, |
|
"learning_rate": 9.942089941117472e-05, |
|
"loss": 1.2101, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.10817455439459127, |
|
"grad_norm": 0.5040555596351624, |
|
"learning_rate": 9.938923841050147e-05, |
|
"loss": 1.059, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.10940381069452981, |
|
"grad_norm": 0.7209507822990417, |
|
"learning_rate": 9.935674010733336e-05, |
|
"loss": 0.9387, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.11063306699446834, |
|
"grad_norm": 0.6711410284042358, |
|
"learning_rate": 9.932340505261855e-05, |
|
"loss": 0.9325, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11186232329440689, |
|
"grad_norm": 0.670559823513031, |
|
"learning_rate": 9.928923381149078e-05, |
|
"loss": 1.1188, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.11309157959434542, |
|
"grad_norm": 1.4009896516799927, |
|
"learning_rate": 9.925422696325975e-05, |
|
"loss": 1.4021, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.11432083589428396, |
|
"grad_norm": 2.7449545860290527, |
|
"learning_rate": 9.921838510140135e-05, |
|
"loss": 1.7181, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.11555009219422249, |
|
"grad_norm": 3.5462844371795654, |
|
"learning_rate": 9.918170883354755e-05, |
|
"loss": 1.4934, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.11677934849416104, |
|
"grad_norm": 3.204674005508423, |
|
"learning_rate": 9.914419878147611e-05, |
|
"loss": 1.2952, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11800860479409957, |
|
"grad_norm": 2.583436965942383, |
|
"learning_rate": 9.910585558110006e-05, |
|
"loss": 1.418, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.11923786109403811, |
|
"grad_norm": 3.0214803218841553, |
|
"learning_rate": 9.906667988245692e-05, |
|
"loss": 1.8579, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.12046711739397664, |
|
"grad_norm": 2.359790325164795, |
|
"learning_rate": 9.902667234969764e-05, |
|
"loss": 1.2705, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.12169637369391519, |
|
"grad_norm": 2.093607187271118, |
|
"learning_rate": 9.898583366107538e-05, |
|
"loss": 1.4241, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.12292562999385372, |
|
"grad_norm": 2.613720655441284, |
|
"learning_rate": 9.8944164508934e-05, |
|
"loss": 1.7558, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12415488629379226, |
|
"grad_norm": 0.29464319348335266, |
|
"learning_rate": 9.890166559969631e-05, |
|
"loss": 1.1966, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.1253841425937308, |
|
"grad_norm": 0.27224430441856384, |
|
"learning_rate": 9.885833765385212e-05, |
|
"loss": 1.3172, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.12661339889366932, |
|
"grad_norm": 0.2738960385322571, |
|
"learning_rate": 9.881418140594603e-05, |
|
"loss": 1.2875, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.12784265519360788, |
|
"grad_norm": 0.274746298789978, |
|
"learning_rate": 9.876919760456492e-05, |
|
"loss": 1.3156, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1290719114935464, |
|
"grad_norm": 0.3050672113895416, |
|
"learning_rate": 9.872338701232526e-05, |
|
"loss": 1.2426, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.13030116779348494, |
|
"grad_norm": 0.2726648449897766, |
|
"learning_rate": 9.867675040586034e-05, |
|
"loss": 1.1997, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.13153042409342347, |
|
"grad_norm": 0.2615199685096741, |
|
"learning_rate": 9.862928857580687e-05, |
|
"loss": 1.1518, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.13275968039336203, |
|
"grad_norm": 0.27568066120147705, |
|
"learning_rate": 9.858100232679175e-05, |
|
"loss": 0.9874, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.13398893669330056, |
|
"grad_norm": 0.29168951511383057, |
|
"learning_rate": 9.853189247741833e-05, |
|
"loss": 1.2147, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1352181929932391, |
|
"grad_norm": 0.30630671977996826, |
|
"learning_rate": 9.848195986025257e-05, |
|
"loss": 1.2474, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13644744929317762, |
|
"grad_norm": 0.3246194124221802, |
|
"learning_rate": 9.843120532180896e-05, |
|
"loss": 1.1839, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.13767670559311618, |
|
"grad_norm": 0.34899017214775085, |
|
"learning_rate": 9.837962972253612e-05, |
|
"loss": 1.2389, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.1389059618930547, |
|
"grad_norm": 0.3848627805709839, |
|
"learning_rate": 9.83272339368022e-05, |
|
"loss": 1.1833, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.14013521819299324, |
|
"grad_norm": 0.4109489917755127, |
|
"learning_rate": 9.827401885288013e-05, |
|
"loss": 1.1026, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.14136447449293177, |
|
"grad_norm": 0.6600728034973145, |
|
"learning_rate": 9.821998537293245e-05, |
|
"loss": 1.4073, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14259373079287033, |
|
"grad_norm": 0.5556017756462097, |
|
"learning_rate": 9.816513441299613e-05, |
|
"loss": 0.6878, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.14382298709280886, |
|
"grad_norm": 0.5937761068344116, |
|
"learning_rate": 9.810946690296698e-05, |
|
"loss": 0.7988, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1450522433927474, |
|
"grad_norm": 0.6892157196998596, |
|
"learning_rate": 9.80529837865839e-05, |
|
"loss": 1.2152, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.14628149969268592, |
|
"grad_norm": 1.1046031713485718, |
|
"learning_rate": 9.799568602141283e-05, |
|
"loss": 1.4396, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.14751075599262448, |
|
"grad_norm": 3.366898536682129, |
|
"learning_rate": 9.793757457883062e-05, |
|
"loss": 1.6062, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.148740012292563, |
|
"grad_norm": 4.46527624130249, |
|
"learning_rate": 9.787865044400848e-05, |
|
"loss": 1.041, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.14996926859250154, |
|
"grad_norm": 3.8992013931274414, |
|
"learning_rate": 9.781891461589531e-05, |
|
"loss": 1.6166, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.15119852489244007, |
|
"grad_norm": 2.6794042587280273, |
|
"learning_rate": 9.775836810720074e-05, |
|
"loss": 1.5444, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.1524277811923786, |
|
"grad_norm": 2.1487152576446533, |
|
"learning_rate": 9.769701194437799e-05, |
|
"loss": 1.4051, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.15365703749231716, |
|
"grad_norm": 2.6264848709106445, |
|
"learning_rate": 9.763484716760649e-05, |
|
"loss": 1.7286, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.15488629379225569, |
|
"grad_norm": 0.2960408329963684, |
|
"learning_rate": 9.757187483077413e-05, |
|
"loss": 1.1932, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.15611555009219422, |
|
"grad_norm": 0.2633897364139557, |
|
"learning_rate": 9.750809600145954e-05, |
|
"loss": 1.2997, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.15734480639213275, |
|
"grad_norm": 0.2459549605846405, |
|
"learning_rate": 9.744351176091393e-05, |
|
"loss": 1.0985, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.1585740626920713, |
|
"grad_norm": 0.30462849140167236, |
|
"learning_rate": 9.737812320404271e-05, |
|
"loss": 1.4303, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.15980331899200984, |
|
"grad_norm": 0.27317526936531067, |
|
"learning_rate": 9.731193143938704e-05, |
|
"loss": 1.224, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16103257529194837, |
|
"grad_norm": 0.26538556814193726, |
|
"learning_rate": 9.724493758910491e-05, |
|
"loss": 1.2667, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.1622618315918869, |
|
"grad_norm": 0.28112831711769104, |
|
"learning_rate": 9.71771427889522e-05, |
|
"loss": 1.1212, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.16349108789182545, |
|
"grad_norm": 0.2989320755004883, |
|
"learning_rate": 9.71085481882634e-05, |
|
"loss": 1.0484, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.16472034419176398, |
|
"grad_norm": 0.2814895212650299, |
|
"learning_rate": 9.703915494993215e-05, |
|
"loss": 0.7544, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.16594960049170251, |
|
"grad_norm": 0.3104398846626282, |
|
"learning_rate": 9.696896425039146e-05, |
|
"loss": 1.0323, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.16717885679164105, |
|
"grad_norm": 0.4948181211948395, |
|
"learning_rate": 9.689797727959387e-05, |
|
"loss": 1.2073, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.1684081130915796, |
|
"grad_norm": 0.4018343985080719, |
|
"learning_rate": 9.682619524099112e-05, |
|
"loss": 1.2409, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.16963736939151813, |
|
"grad_norm": 0.5637558102607727, |
|
"learning_rate": 9.675361935151395e-05, |
|
"loss": 1.3184, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.17086662569145666, |
|
"grad_norm": 0.7405252456665039, |
|
"learning_rate": 9.66802508415513e-05, |
|
"loss": 1.0983, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1720958819913952, |
|
"grad_norm": 0.6686736345291138, |
|
"learning_rate": 9.660609095492952e-05, |
|
"loss": 1.0025, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17332513829133375, |
|
"grad_norm": 0.7121345400810242, |
|
"learning_rate": 9.653114094889127e-05, |
|
"loss": 0.9337, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.17455439459127228, |
|
"grad_norm": 1.06205153465271, |
|
"learning_rate": 9.645540209407425e-05, |
|
"loss": 1.2931, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.1757836508912108, |
|
"grad_norm": 2.3874034881591797, |
|
"learning_rate": 9.637887567448959e-05, |
|
"loss": 1.5124, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.17701290719114934, |
|
"grad_norm": 2.6609811782836914, |
|
"learning_rate": 9.630156298750011e-05, |
|
"loss": 1.4161, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.1782421634910879, |
|
"grad_norm": 2.413705587387085, |
|
"learning_rate": 9.622346534379833e-05, |
|
"loss": 1.2768, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.17947141979102643, |
|
"grad_norm": 2.920910120010376, |
|
"learning_rate": 9.614458406738427e-05, |
|
"loss": 1.0866, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.18070067609096496, |
|
"grad_norm": 2.389439582824707, |
|
"learning_rate": 9.606492049554297e-05, |
|
"loss": 1.4862, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.1819299323909035, |
|
"grad_norm": 2.03515887260437, |
|
"learning_rate": 9.598447597882181e-05, |
|
"loss": 1.3503, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.18315918869084205, |
|
"grad_norm": 2.016889810562134, |
|
"learning_rate": 9.590325188100768e-05, |
|
"loss": 1.2565, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.18438844499078058, |
|
"grad_norm": 2.1591711044311523, |
|
"learning_rate": 9.582124957910375e-05, |
|
"loss": 1.1261, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1856177012907191, |
|
"grad_norm": 0.2707172632217407, |
|
"learning_rate": 9.573847046330628e-05, |
|
"loss": 1.1045, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.18684695759065764, |
|
"grad_norm": 0.25980842113494873, |
|
"learning_rate": 9.565491593698086e-05, |
|
"loss": 1.274, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.1880762138905962, |
|
"grad_norm": 0.25503602623939514, |
|
"learning_rate": 9.55705874166388e-05, |
|
"loss": 1.0971, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.18930547019053473, |
|
"grad_norm": 0.27756351232528687, |
|
"learning_rate": 9.548548633191299e-05, |
|
"loss": 1.215, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.19053472649047326, |
|
"grad_norm": 0.2732703387737274, |
|
"learning_rate": 9.539961412553375e-05, |
|
"loss": 1.1326, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1917639827904118, |
|
"grad_norm": 0.28855475783348083, |
|
"learning_rate": 9.531297225330429e-05, |
|
"loss": 1.2862, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.19299323909035035, |
|
"grad_norm": 0.3158769905567169, |
|
"learning_rate": 9.522556218407608e-05, |
|
"loss": 1.2254, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.19422249539028888, |
|
"grad_norm": 0.30355289578437805, |
|
"learning_rate": 9.513738539972394e-05, |
|
"loss": 1.062, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.1954517516902274, |
|
"grad_norm": 0.3448358178138733, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 0.9856, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.19668100799016594, |
|
"grad_norm": 0.3306958079338074, |
|
"learning_rate": 9.495873767811305e-05, |
|
"loss": 1.2696, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1979102642901045, |
|
"grad_norm": 0.4231187105178833, |
|
"learning_rate": 9.486826976949345e-05, |
|
"loss": 1.1711, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.19913952059004303, |
|
"grad_norm": 0.5289990901947021, |
|
"learning_rate": 9.477704120297697e-05, |
|
"loss": 1.4088, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.20036877688998156, |
|
"grad_norm": 0.5111967921257019, |
|
"learning_rate": 9.468505352517394e-05, |
|
"loss": 1.1683, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.2015980331899201, |
|
"grad_norm": 0.7477207779884338, |
|
"learning_rate": 9.459230829556401e-05, |
|
"loss": 0.995, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.20282728948985865, |
|
"grad_norm": 0.7836649417877197, |
|
"learning_rate": 9.449880708646971e-05, |
|
"loss": 0.8027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.20405654578979718, |
|
"grad_norm": 0.6803653240203857, |
|
"learning_rate": 9.440455148302977e-05, |
|
"loss": 0.9725, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2052858020897357, |
|
"grad_norm": 0.8779723048210144, |
|
"learning_rate": 9.430954308317233e-05, |
|
"loss": 1.1995, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.20651505838967424, |
|
"grad_norm": 1.3584879636764526, |
|
"learning_rate": 9.421378349758769e-05, |
|
"loss": 1.4558, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.2077443146896128, |
|
"grad_norm": 2.1976521015167236, |
|
"learning_rate": 9.411727434970121e-05, |
|
"loss": 1.0717, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.20897357098955133, |
|
"grad_norm": 3.9302353858947754, |
|
"learning_rate": 9.402001727564565e-05, |
|
"loss": 1.5138, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21020282728948986, |
|
"grad_norm": 3.9594686031341553, |
|
"learning_rate": 9.392201392423342e-05, |
|
"loss": 1.4295, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.2114320835894284, |
|
"grad_norm": 3.2994837760925293, |
|
"learning_rate": 9.382326595692868e-05, |
|
"loss": 1.8676, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.21266133988936695, |
|
"grad_norm": 2.219341993331909, |
|
"learning_rate": 9.372377504781924e-05, |
|
"loss": 1.3185, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.21389059618930548, |
|
"grad_norm": 2.3389649391174316, |
|
"learning_rate": 9.362354288358803e-05, |
|
"loss": 0.9969, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.215119852489244, |
|
"grad_norm": 3.8493995666503906, |
|
"learning_rate": 9.35225711634846e-05, |
|
"loss": 1.2903, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.21634910878918254, |
|
"grad_norm": 0.24931700527668, |
|
"learning_rate": 9.34208615992963e-05, |
|
"loss": 1.051, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.2175783650891211, |
|
"grad_norm": 0.2944095730781555, |
|
"learning_rate": 9.331841591531922e-05, |
|
"loss": 1.3364, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.21880762138905963, |
|
"grad_norm": 0.26118403673171997, |
|
"learning_rate": 9.321523584832905e-05, |
|
"loss": 1.1487, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.22003687768899816, |
|
"grad_norm": 0.29458168148994446, |
|
"learning_rate": 9.311132314755149e-05, |
|
"loss": 1.365, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.2212661339889367, |
|
"grad_norm": 0.2739919424057007, |
|
"learning_rate": 9.300667957463278e-05, |
|
"loss": 1.2595, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.22249539028887522, |
|
"grad_norm": 0.25647538900375366, |
|
"learning_rate": 9.290130690360965e-05, |
|
"loss": 0.9865, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.22372464658881377, |
|
"grad_norm": 0.27343517541885376, |
|
"learning_rate": 9.279520692087938e-05, |
|
"loss": 1.1263, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.2249539028887523, |
|
"grad_norm": 0.3220975697040558, |
|
"learning_rate": 9.268838142516943e-05, |
|
"loss": 1.3404, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.22618315918869084, |
|
"grad_norm": 0.3012546896934509, |
|
"learning_rate": 9.258083222750703e-05, |
|
"loss": 0.934, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.22741241548862937, |
|
"grad_norm": 0.3433031439781189, |
|
"learning_rate": 9.247256115118835e-05, |
|
"loss": 1.1895, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.22864167178856792, |
|
"grad_norm": 0.3515290915966034, |
|
"learning_rate": 9.236357003174775e-05, |
|
"loss": 1.3236, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.22987092808850645, |
|
"grad_norm": 0.4033795893192291, |
|
"learning_rate": 9.225386071692654e-05, |
|
"loss": 1.2089, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.23110018438844498, |
|
"grad_norm": 0.42729562520980835, |
|
"learning_rate": 9.214343506664168e-05, |
|
"loss": 1.1346, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.23232944068838352, |
|
"grad_norm": 0.6692906618118286, |
|
"learning_rate": 9.203229495295429e-05, |
|
"loss": 1.0211, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.23355869698832207, |
|
"grad_norm": 0.6882857084274292, |
|
"learning_rate": 9.192044226003789e-05, |
|
"loss": 0.8235, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2347879532882606, |
|
"grad_norm": 0.6821665167808533, |
|
"learning_rate": 9.18078788841464e-05, |
|
"loss": 0.8171, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.23601720958819913, |
|
"grad_norm": 0.7368921041488647, |
|
"learning_rate": 9.169460673358212e-05, |
|
"loss": 0.9993, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.23724646588813766, |
|
"grad_norm": 0.9759008884429932, |
|
"learning_rate": 9.158062772866325e-05, |
|
"loss": 1.2029, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.23847572218807622, |
|
"grad_norm": 2.167100667953491, |
|
"learning_rate": 9.146594380169143e-05, |
|
"loss": 1.1393, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.23970497848801475, |
|
"grad_norm": 2.76292085647583, |
|
"learning_rate": 9.135055689691888e-05, |
|
"loss": 0.946, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.24093423478795328, |
|
"grad_norm": 3.504427671432495, |
|
"learning_rate": 9.123446897051555e-05, |
|
"loss": 1.7001, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.2421634910878918, |
|
"grad_norm": 2.606448173522949, |
|
"learning_rate": 9.111768199053588e-05, |
|
"loss": 1.6293, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.24339274738783037, |
|
"grad_norm": 2.1803855895996094, |
|
"learning_rate": 9.100019793688549e-05, |
|
"loss": 1.2392, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.2446220036877689, |
|
"grad_norm": 2.3470633029937744, |
|
"learning_rate": 9.088201880128755e-05, |
|
"loss": 1.0844, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.24585125998770743, |
|
"grad_norm": 2.47255802154541, |
|
"learning_rate": 9.076314658724906e-05, |
|
"loss": 1.19, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24708051628764596, |
|
"grad_norm": 0.2115241140127182, |
|
"learning_rate": 9.064358331002691e-05, |
|
"loss": 0.9038, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.24830977258758452, |
|
"grad_norm": 0.2693980038166046, |
|
"learning_rate": 9.05233309965936e-05, |
|
"loss": 1.0014, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.24953902888752305, |
|
"grad_norm": 0.28890225291252136, |
|
"learning_rate": 9.040239168560303e-05, |
|
"loss": 1.1698, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.2507682851874616, |
|
"grad_norm": 0.27143335342407227, |
|
"learning_rate": 9.028076742735583e-05, |
|
"loss": 1.1856, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2507682851874616, |
|
"eval_loss": 1.0315037965774536, |
|
"eval_runtime": 65.4064, |
|
"eval_samples_per_second": 10.473, |
|
"eval_steps_per_second": 5.244, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2519975414874001, |
|
"grad_norm": 0.3105545938014984, |
|
"learning_rate": 9.015846028376462e-05, |
|
"loss": 1.2827, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.25322679778733864, |
|
"grad_norm": 0.2826372981071472, |
|
"learning_rate": 9.00354723283191e-05, |
|
"loss": 1.1159, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.2544560540872772, |
|
"grad_norm": 0.2823708951473236, |
|
"learning_rate": 8.991180564605086e-05, |
|
"loss": 1.0368, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.25568531038721576, |
|
"grad_norm": 0.28265297412872314, |
|
"learning_rate": 8.978746233349802e-05, |
|
"loss": 1.1583, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.2569145666871543, |
|
"grad_norm": 0.3202212452888489, |
|
"learning_rate": 8.966244449866973e-05, |
|
"loss": 1.2069, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.2581438229870928, |
|
"grad_norm": 0.30576291680336, |
|
"learning_rate": 8.953675426101038e-05, |
|
"loss": 1.1588, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.25937307928703135, |
|
"grad_norm": 0.3853960633277893, |
|
"learning_rate": 8.941039375136371e-05, |
|
"loss": 1.1947, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.2606023355869699, |
|
"grad_norm": 0.4404067099094391, |
|
"learning_rate": 8.928336511193669e-05, |
|
"loss": 1.0786, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.2618315918869084, |
|
"grad_norm": 0.422333300113678, |
|
"learning_rate": 8.915567049626315e-05, |
|
"loss": 1.1454, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.26306084818684694, |
|
"grad_norm": 0.5277565121650696, |
|
"learning_rate": 8.902731206916734e-05, |
|
"loss": 0.7775, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.26429010448678547, |
|
"grad_norm": 0.7032243609428406, |
|
"learning_rate": 8.889829200672719e-05, |
|
"loss": 0.5771, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.26551936078672406, |
|
"grad_norm": 0.6663339734077454, |
|
"learning_rate": 8.876861249623739e-05, |
|
"loss": 0.616, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.2667486170866626, |
|
"grad_norm": 0.8129518628120422, |
|
"learning_rate": 8.863827573617238e-05, |
|
"loss": 1.1483, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.2679778733866011, |
|
"grad_norm": 1.0273211002349854, |
|
"learning_rate": 8.850728393614902e-05, |
|
"loss": 1.2066, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.26920712968653965, |
|
"grad_norm": 1.5424954891204834, |
|
"learning_rate": 8.837563931688919e-05, |
|
"loss": 1.247, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.2704363859864782, |
|
"grad_norm": 2.9167752265930176, |
|
"learning_rate": 8.824334411018204e-05, |
|
"loss": 1.3413, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2716656422864167, |
|
"grad_norm": 5.498292446136475, |
|
"learning_rate": 8.811040055884629e-05, |
|
"loss": 1.0072, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.27289489858635524, |
|
"grad_norm": 3.1687686443328857, |
|
"learning_rate": 8.797681091669206e-05, |
|
"loss": 1.3309, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.27412415488629377, |
|
"grad_norm": 2.760160446166992, |
|
"learning_rate": 8.784257744848279e-05, |
|
"loss": 1.5268, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.27535341118623236, |
|
"grad_norm": 2.3323326110839844, |
|
"learning_rate": 8.770770242989679e-05, |
|
"loss": 1.27, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.2765826674861709, |
|
"grad_norm": 2.150510549545288, |
|
"learning_rate": 8.75721881474886e-05, |
|
"loss": 1.0602, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2778119237861094, |
|
"grad_norm": 0.23049846291542053, |
|
"learning_rate": 8.743603689865039e-05, |
|
"loss": 1.0067, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.27904118008604795, |
|
"grad_norm": 0.2650708556175232, |
|
"learning_rate": 8.729925099157281e-05, |
|
"loss": 1.1932, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.2802704363859865, |
|
"grad_norm": 0.2723963260650635, |
|
"learning_rate": 8.7161832745206e-05, |
|
"loss": 1.2495, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.281499692685925, |
|
"grad_norm": 0.26627010107040405, |
|
"learning_rate": 8.702378448922026e-05, |
|
"loss": 1.2837, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.28272894898586354, |
|
"grad_norm": 0.2728361189365387, |
|
"learning_rate": 8.688510856396648e-05, |
|
"loss": 1.2969, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28395820528580207, |
|
"grad_norm": 0.26788559556007385, |
|
"learning_rate": 8.674580732043656e-05, |
|
"loss": 1.0944, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.28518746158574065, |
|
"grad_norm": 0.3129604160785675, |
|
"learning_rate": 8.660588312022344e-05, |
|
"loss": 1.3591, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2864167178856792, |
|
"grad_norm": 0.32250627875328064, |
|
"learning_rate": 8.646533833548119e-05, |
|
"loss": 1.1469, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.2876459741856177, |
|
"grad_norm": 0.32614386081695557, |
|
"learning_rate": 8.632417534888473e-05, |
|
"loss": 1.3551, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.28887523048555624, |
|
"grad_norm": 0.3620636463165283, |
|
"learning_rate": 8.61823965535894e-05, |
|
"loss": 1.1427, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2901044867854948, |
|
"grad_norm": 0.39082473516464233, |
|
"learning_rate": 8.604000435319047e-05, |
|
"loss": 1.0041, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2913337430854333, |
|
"grad_norm": 0.3823097050189972, |
|
"learning_rate": 8.589700116168232e-05, |
|
"loss": 1.1756, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.29256299938537184, |
|
"grad_norm": 0.5359341502189636, |
|
"learning_rate": 8.575338940341757e-05, |
|
"loss": 1.1814, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.29379225568531037, |
|
"grad_norm": 0.6902546286582947, |
|
"learning_rate": 8.560917151306593e-05, |
|
"loss": 0.9253, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.29502151198524895, |
|
"grad_norm": 0.7236252427101135, |
|
"learning_rate": 8.5464349935573e-05, |
|
"loss": 0.6398, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2962507682851875, |
|
"grad_norm": 0.7172759175300598, |
|
"learning_rate": 8.53189271261187e-05, |
|
"loss": 0.9061, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.297480024585126, |
|
"grad_norm": 0.7999723553657532, |
|
"learning_rate": 8.517290555007578e-05, |
|
"loss": 1.0691, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.29870928088506454, |
|
"grad_norm": 1.235872745513916, |
|
"learning_rate": 8.502628768296788e-05, |
|
"loss": 1.5235, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.2999385371850031, |
|
"grad_norm": 1.9676207304000854, |
|
"learning_rate": 8.487907601042777e-05, |
|
"loss": 1.5859, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.3011677934849416, |
|
"grad_norm": 3.5035860538482666, |
|
"learning_rate": 8.473127302815496e-05, |
|
"loss": 1.1743, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.30239704978488013, |
|
"grad_norm": 4.519472599029541, |
|
"learning_rate": 8.458288124187359e-05, |
|
"loss": 0.7165, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.30362630608481866, |
|
"grad_norm": 2.3718838691711426, |
|
"learning_rate": 8.443390316728987e-05, |
|
"loss": 1.1449, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.3048555623847572, |
|
"grad_norm": 2.1668829917907715, |
|
"learning_rate": 8.428434133004937e-05, |
|
"loss": 1.0383, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.3060848186846958, |
|
"grad_norm": 3.2350733280181885, |
|
"learning_rate": 8.413419826569435e-05, |
|
"loss": 1.2341, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.3073140749846343, |
|
"grad_norm": 2.3541886806488037, |
|
"learning_rate": 8.398347651962064e-05, |
|
"loss": 1.0355, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30854333128457284, |
|
"grad_norm": 0.2730487883090973, |
|
"learning_rate": 8.383217864703456e-05, |
|
"loss": 1.2813, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.30977258758451137, |
|
"grad_norm": 0.2517383098602295, |
|
"learning_rate": 8.36803072129096e-05, |
|
"loss": 1.1793, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.3110018438844499, |
|
"grad_norm": 0.28486472368240356, |
|
"learning_rate": 8.352786479194288e-05, |
|
"loss": 1.4065, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.31223110018438843, |
|
"grad_norm": 0.3247184455394745, |
|
"learning_rate": 8.337485396851155e-05, |
|
"loss": 1.4863, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.31346035648432696, |
|
"grad_norm": 0.26896461844444275, |
|
"learning_rate": 8.322127733662897e-05, |
|
"loss": 1.1373, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3146896127842655, |
|
"grad_norm": 0.29333245754241943, |
|
"learning_rate": 8.306713749990072e-05, |
|
"loss": 1.0615, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.3159188690842041, |
|
"grad_norm": 0.2958793640136719, |
|
"learning_rate": 8.291243707148048e-05, |
|
"loss": 0.9392, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.3171481253841426, |
|
"grad_norm": 0.3320540487766266, |
|
"learning_rate": 8.275717867402575e-05, |
|
"loss": 1.2935, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.31837738168408114, |
|
"grad_norm": 0.3567339777946472, |
|
"learning_rate": 8.260136493965326e-05, |
|
"loss": 1.0954, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.31960663798401967, |
|
"grad_norm": 0.38393881916999817, |
|
"learning_rate": 8.244499850989452e-05, |
|
"loss": 1.045, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3208358942839582, |
|
"grad_norm": 0.41993001103401184, |
|
"learning_rate": 8.228808203565095e-05, |
|
"loss": 1.2225, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.32206515058389673, |
|
"grad_norm": 0.6547941565513611, |
|
"learning_rate": 8.213061817714893e-05, |
|
"loss": 0.9286, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.32329440688383526, |
|
"grad_norm": 0.7117279767990112, |
|
"learning_rate": 8.197260960389474e-05, |
|
"loss": 0.5088, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.3245236631837738, |
|
"grad_norm": 0.7041743993759155, |
|
"learning_rate": 8.181405899462926e-05, |
|
"loss": 0.8899, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.3257529194837124, |
|
"grad_norm": 0.7142787575721741, |
|
"learning_rate": 8.16549690372826e-05, |
|
"loss": 0.7447, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3269821757836509, |
|
"grad_norm": 0.8879908323287964, |
|
"learning_rate": 8.14953424289285e-05, |
|
"loss": 1.2607, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.32821143208358944, |
|
"grad_norm": 0.9387282133102417, |
|
"learning_rate": 8.133518187573862e-05, |
|
"loss": 1.1611, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.32944068838352797, |
|
"grad_norm": 1.4039078950881958, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 0.9947, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.3306699446834665, |
|
"grad_norm": 3.3686740398406982, |
|
"learning_rate": 8.101326980475237e-05, |
|
"loss": 1.0783, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.33189920098340503, |
|
"grad_norm": 2.8384785652160645, |
|
"learning_rate": 8.085152374437525e-05, |
|
"loss": 0.9008, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.33312845728334356, |
|
"grad_norm": 2.453441619873047, |
|
"learning_rate": 8.06892546539083e-05, |
|
"loss": 0.5504, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.3343577135832821, |
|
"grad_norm": 2.592667579650879, |
|
"learning_rate": 8.052646528432158e-05, |
|
"loss": 0.7489, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.3355869698832207, |
|
"grad_norm": 1.9753395318984985, |
|
"learning_rate": 8.036315839540545e-05, |
|
"loss": 0.9747, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.3368162261831592, |
|
"grad_norm": 3.042698860168457, |
|
"learning_rate": 8.019933675572389e-05, |
|
"loss": 1.6841, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.33804548248309774, |
|
"grad_norm": 2.4343316555023193, |
|
"learning_rate": 8.00350031425675e-05, |
|
"loss": 0.869, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.33927473878303627, |
|
"grad_norm": 0.2026144415140152, |
|
"learning_rate": 7.98701603419064e-05, |
|
"loss": 0.8867, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.3405039950829748, |
|
"grad_norm": 0.24370141327381134, |
|
"learning_rate": 7.970481114834312e-05, |
|
"loss": 1.3135, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.34173325138291333, |
|
"grad_norm": 0.22894087433815002, |
|
"learning_rate": 7.953895836506508e-05, |
|
"loss": 1.0986, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.34296250768285186, |
|
"grad_norm": 0.2533970773220062, |
|
"learning_rate": 7.937260480379712e-05, |
|
"loss": 1.1821, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.3441917639827904, |
|
"grad_norm": 0.25789350271224976, |
|
"learning_rate": 7.920575328475385e-05, |
|
"loss": 1.1414, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.345421020282729, |
|
"grad_norm": 0.28820541501045227, |
|
"learning_rate": 7.903840663659186e-05, |
|
"loss": 1.3332, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.3466502765826675, |
|
"grad_norm": 0.28611505031585693, |
|
"learning_rate": 7.887056769636165e-05, |
|
"loss": 1.0901, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.34787953288260604, |
|
"grad_norm": 0.28022873401641846, |
|
"learning_rate": 7.870223930945972e-05, |
|
"loss": 0.8461, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.34910878918254457, |
|
"grad_norm": 0.3246136009693146, |
|
"learning_rate": 7.853342432958013e-05, |
|
"loss": 0.9325, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.3503380454824831, |
|
"grad_norm": 0.3149406611919403, |
|
"learning_rate": 7.836412561866629e-05, |
|
"loss": 1.013, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3515673017824216, |
|
"grad_norm": 0.3745490610599518, |
|
"learning_rate": 7.819434604686228e-05, |
|
"loss": 1.2624, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.35279655808236016, |
|
"grad_norm": 0.4822925329208374, |
|
"learning_rate": 7.802408849246442e-05, |
|
"loss": 1.2424, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.3540258143822987, |
|
"grad_norm": 0.6210641264915466, |
|
"learning_rate": 7.785335584187219e-05, |
|
"loss": 1.2527, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.3552550706822373, |
|
"grad_norm": 0.6488444805145264, |
|
"learning_rate": 7.768215098953952e-05, |
|
"loss": 0.7986, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.3564843269821758, |
|
"grad_norm": 0.760388195514679, |
|
"learning_rate": 7.751047683792561e-05, |
|
"loss": 1.0136, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35771358328211433, |
|
"grad_norm": 0.7666548490524292, |
|
"learning_rate": 7.73383362974458e-05, |
|
"loss": 0.8205, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.35894283958205286, |
|
"grad_norm": 0.7492078542709351, |
|
"learning_rate": 7.71657322864221e-05, |
|
"loss": 0.9254, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.3601720958819914, |
|
"grad_norm": 0.9061193466186523, |
|
"learning_rate": 7.699266773103389e-05, |
|
"loss": 1.3013, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.3614013521819299, |
|
"grad_norm": 2.1404013633728027, |
|
"learning_rate": 7.681914556526817e-05, |
|
"loss": 1.5957, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.36263060848186845, |
|
"grad_norm": 2.647864580154419, |
|
"learning_rate": 7.664516873086987e-05, |
|
"loss": 1.1658, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.363859864781807, |
|
"grad_norm": 3.0906460285186768, |
|
"learning_rate": 7.647074017729202e-05, |
|
"loss": 1.1344, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.36508912108174557, |
|
"grad_norm": 2.2348814010620117, |
|
"learning_rate": 7.629586286164565e-05, |
|
"loss": 0.8813, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.3663183773816841, |
|
"grad_norm": 2.937446117401123, |
|
"learning_rate": 7.612053974864976e-05, |
|
"loss": 1.0414, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.36754763368162263, |
|
"grad_norm": 2.5343546867370605, |
|
"learning_rate": 7.594477381058098e-05, |
|
"loss": 1.1847, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.36877688998156116, |
|
"grad_norm": 2.8971638679504395, |
|
"learning_rate": 7.576856802722325e-05, |
|
"loss": 0.9029, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3700061462814997, |
|
"grad_norm": 0.1982557773590088, |
|
"learning_rate": 7.559192538581722e-05, |
|
"loss": 0.9314, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.3712354025814382, |
|
"grad_norm": 0.24721381068229675, |
|
"learning_rate": 7.541484888100974e-05, |
|
"loss": 1.2432, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.37246465888137675, |
|
"grad_norm": 0.24999506771564484, |
|
"learning_rate": 7.523734151480289e-05, |
|
"loss": 1.285, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.3736939151813153, |
|
"grad_norm": 0.267764151096344, |
|
"learning_rate": 7.505940629650326e-05, |
|
"loss": 1.198, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.3749231714812538, |
|
"grad_norm": 0.26003679633140564, |
|
"learning_rate": 7.488104624267091e-05, |
|
"loss": 1.2001, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3761524277811924, |
|
"grad_norm": 0.28197526931762695, |
|
"learning_rate": 7.470226437706813e-05, |
|
"loss": 1.1687, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.37738168408113093, |
|
"grad_norm": 0.29367661476135254, |
|
"learning_rate": 7.452306373060829e-05, |
|
"loss": 1.211, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.37861094038106946, |
|
"grad_norm": 0.2982727885246277, |
|
"learning_rate": 7.434344734130437e-05, |
|
"loss": 1.151, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.379840196681008, |
|
"grad_norm": 0.3283758759498596, |
|
"learning_rate": 7.416341825421754e-05, |
|
"loss": 0.9875, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.3810694529809465, |
|
"grad_norm": 0.32420334219932556, |
|
"learning_rate": 7.398297952140544e-05, |
|
"loss": 1.0796, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.38229870928088505, |
|
"grad_norm": 0.4046980142593384, |
|
"learning_rate": 7.380213420187055e-05, |
|
"loss": 1.1158, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.3835279655808236, |
|
"grad_norm": 0.391736775636673, |
|
"learning_rate": 7.36208853615082e-05, |
|
"loss": 1.1682, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.3847572218807621, |
|
"grad_norm": 0.6027556657791138, |
|
"learning_rate": 7.343923607305471e-05, |
|
"loss": 1.0696, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.3859864781807007, |
|
"grad_norm": 0.6483603119850159, |
|
"learning_rate": 7.325718941603527e-05, |
|
"loss": 0.7843, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.38721573448063923, |
|
"grad_norm": 0.6711483001708984, |
|
"learning_rate": 7.307474847671168e-05, |
|
"loss": 0.7247, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.38844499078057776, |
|
"grad_norm": 0.7372632026672363, |
|
"learning_rate": 7.289191634803003e-05, |
|
"loss": 1.0535, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.3896742470805163, |
|
"grad_norm": 0.7427420020103455, |
|
"learning_rate": 7.270869612956835e-05, |
|
"loss": 1.0563, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.3909035033804548, |
|
"grad_norm": 2.6449501514434814, |
|
"learning_rate": 7.252509092748401e-05, |
|
"loss": 1.3099, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.39213275968039335, |
|
"grad_norm": 3.1938464641571045, |
|
"learning_rate": 7.234110385446103e-05, |
|
"loss": 1.1728, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.3933620159803319, |
|
"grad_norm": 2.584103584289551, |
|
"learning_rate": 7.215673802965734e-05, |
|
"loss": 0.792, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3945912722802704, |
|
"grad_norm": 2.358025074005127, |
|
"learning_rate": 7.197199657865195e-05, |
|
"loss": 1.0462, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.395820528580209, |
|
"grad_norm": 2.9621617794036865, |
|
"learning_rate": 7.178688263339184e-05, |
|
"loss": 1.4222, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.3970497848801475, |
|
"grad_norm": 2.5362660884857178, |
|
"learning_rate": 7.160139933213898e-05, |
|
"loss": 1.1527, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.39827904118008606, |
|
"grad_norm": 2.4901375770568848, |
|
"learning_rate": 7.141554981941709e-05, |
|
"loss": 1.1712, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.3995082974800246, |
|
"grad_norm": 2.9214236736297607, |
|
"learning_rate": 7.12293372459583e-05, |
|
"loss": 1.1977, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.4007375537799631, |
|
"grad_norm": 0.24753543734550476, |
|
"learning_rate": 7.104276476864974e-05, |
|
"loss": 1.2176, |
|
"step": 326 |
|
}, |
|
{ "epoch": 0.40196681007990165, "grad_norm": 0.25986090302467346, "learning_rate": 7.085583555048008e-05, "loss": 1.2854, "step": 327 },
{ "epoch": 0.4031960663798402, "grad_norm": 0.2640175521373749, "learning_rate": 7.066855276048587e-05, "loss": 1.2204, "step": 328 },
{ "epoch": 0.4044253226797787, "grad_norm": 0.2603614330291748, "learning_rate": 7.048091957369776e-05, "loss": 1.2621, "step": 329 },
{ "epoch": 0.4056545789797173, "grad_norm": 0.2921195924282074, "learning_rate": 7.029293917108678e-05, "loss": 1.281, "step": 330 },
{ "epoch": 0.4068838352796558, "grad_norm": 0.2984941899776459, "learning_rate": 7.010461473951033e-05, "loss": 1.071, "step": 331 },
{ "epoch": 0.40811309157959436, "grad_norm": 0.31219175457954407, "learning_rate": 6.991594947165818e-05, "loss": 1.3161, "step": 332 },
{ "epoch": 0.4093423478795329, "grad_norm": 0.31329602003097534, "learning_rate": 6.972694656599834e-05, "loss": 0.9854, "step": 333 },
{ "epoch": 0.4105716041794714, "grad_norm": 0.3356671929359436, "learning_rate": 6.953760922672286e-05, "loss": 1.02, "step": 334 },
{ "epoch": 0.41180086047940995, "grad_norm": 0.3843994438648224, "learning_rate": 6.934794066369348e-05, "loss": 1.2173, "step": 335 },
{ "epoch": 0.4130301167793485, "grad_norm": 0.45338544249534607, "learning_rate": 6.915794409238718e-05, "loss": 1.3614, "step": 336 },
{ "epoch": 0.414259373079287, "grad_norm": 0.4857298731803894, "learning_rate": 6.896762273384178e-05, "loss": 1.0175, "step": 337 },
{ "epoch": 0.4154886293792256, "grad_norm": 0.6512896418571472, "learning_rate": 6.877697981460125e-05, "loss": 0.6555, "step": 338 },
{ "epoch": 0.4167178856791641, "grad_norm": 0.6744720935821533, "learning_rate": 6.858601856666094e-05, "loss": 0.6057, "step": 339 },
{ "epoch": 0.41794714197910265, "grad_norm": 0.6527014374732971, "learning_rate": 6.839474222741299e-05, "loss": 0.9116, "step": 340 },
{ "epoch": 0.4191763982790412, "grad_norm": 0.6935631036758423, "learning_rate": 6.820315403959123e-05, "loss": 0.9876, "step": 341 },
{ "epoch": 0.4204056545789797, "grad_norm": 0.6856899261474609, "learning_rate": 6.801125725121636e-05, "loss": 0.9591, "step": 342 },
{ "epoch": 0.42163491087891825, "grad_norm": 1.2577812671661377, "learning_rate": 6.781905511554079e-05, "loss": 1.3174, "step": 343 },
{ "epoch": 0.4228641671788568, "grad_norm": 2.421950578689575, "learning_rate": 6.762655089099353e-05, "loss": 1.6442, "step": 344 },
{ "epoch": 0.4240934234787953, "grad_norm": 2.6432454586029053, "learning_rate": 6.743374784112501e-05, "loss": 1.0468, "step": 345 },
{ "epoch": 0.4253226797787339, "grad_norm": 2.7061827182769775, "learning_rate": 6.724064923455155e-05, "loss": 1.1526, "step": 346 },
{ "epoch": 0.4265519360786724, "grad_norm": 2.466057777404785, "learning_rate": 6.704725834490024e-05, "loss": 1.1463, "step": 347 },
{ "epoch": 0.42778119237861095, "grad_norm": 2.753512144088745, "learning_rate": 6.685357845075315e-05, "loss": 0.9492, "step": 348 },
{ "epoch": 0.4290104486785495, "grad_norm": 2.76118803024292, "learning_rate": 6.665961283559197e-05, "loss": 0.8543, "step": 349 },
{ "epoch": 0.430239704978488, "grad_norm": 2.295574426651001, "learning_rate": 6.646536478774222e-05, "loss": 0.9564, "step": 350 },
{ "epoch": 0.43146896127842654, "grad_norm": 0.22731368243694305, "learning_rate": 6.627083760031754e-05, "loss": 0.9719, "step": 351 },
{ "epoch": 0.4326982175783651, "grad_norm": 0.20097078382968903, "learning_rate": 6.60760345711639e-05, "loss": 1.0094, "step": 352 },
{ "epoch": 0.4339274738783036, "grad_norm": 0.23321934044361115, "learning_rate": 6.58809590028036e-05, "loss": 1.101, "step": 353 },
{ "epoch": 0.4351567301782422, "grad_norm": 0.27995625138282776, "learning_rate": 6.568561420237935e-05, "loss": 1.3545, "step": 354 },
{ "epoch": 0.4363859864781807, "grad_norm": 0.259082168340683, "learning_rate": 6.54900034815982e-05, "loss": 1.1598, "step": 355 },
{ "epoch": 0.43761524277811925, "grad_norm": 0.2688703238964081, "learning_rate": 6.52941301566754e-05, "loss": 1.1141, "step": 356 },
{ "epoch": 0.4388444990780578, "grad_norm": 0.34018442034721375, "learning_rate": 6.50979975482781e-05, "loss": 1.2811, "step": 357 },
{ "epoch": 0.4400737553779963, "grad_norm": 0.2925175130367279, "learning_rate": 6.490160898146918e-05, "loss": 0.9025, "step": 358 },
{ "epoch": 0.44130301167793484, "grad_norm": 0.30208972096443176, "learning_rate": 6.470496778565082e-05, "loss": 1.0301, "step": 359 },
{ "epoch": 0.4425322679778734, "grad_norm": 0.3110770285129547, "learning_rate": 6.4508077294508e-05, "loss": 1.0911, "step": 360 },
{ "epoch": 0.4437615242778119, "grad_norm": 0.426252543926239, "learning_rate": 6.431094084595209e-05, "loss": 1.1214, "step": 361 },
{ "epoch": 0.44499078057775043, "grad_norm": 0.4019356966018677, "learning_rate": 6.411356178206419e-05, "loss": 1.3063, "step": 362 },
{ "epoch": 0.446220036877689, "grad_norm": 0.4622703194618225, "learning_rate": 6.391594344903848e-05, "loss": 1.1208, "step": 363 },
{ "epoch": 0.44744929317762755, "grad_norm": 0.5752270817756653, "learning_rate": 6.371808919712549e-05, "loss": 0.9653, "step": 364 },
{ "epoch": 0.4486785494775661, "grad_norm": 0.6309720277786255, "learning_rate": 6.35200023805754e-05, "loss": 0.5664, "step": 365 },
{ "epoch": 0.4499078057775046, "grad_norm": 0.612684965133667, "learning_rate": 6.332168635758097e-05, "loss": 1.0443, "step": 366 },
{ "epoch": 0.45113706207744314, "grad_norm": 0.6797056794166565, "learning_rate": 6.31231444902208e-05, "loss": 0.8389, "step": 367 },
{ "epoch": 0.45236631837738167, "grad_norm": 1.222960352897644, "learning_rate": 6.292438014440227e-05, "loss": 1.4688, "step": 368 },
{ "epoch": 0.4535955746773202, "grad_norm": 2.9443516731262207, "learning_rate": 6.272539668980441e-05, "loss": 1.0079, "step": 369 },
{ "epoch": 0.45482483097725873, "grad_norm": 3.0168612003326416, "learning_rate": 6.252619749982089e-05, "loss": 0.9232, "step": 370 },
{ "epoch": 0.4560540872771973, "grad_norm": 1.9470983743667603, "learning_rate": 6.232678595150275e-05, "loss": 0.8126, "step": 371 },
{ "epoch": 0.45728334357713585, "grad_norm": 2.4769911766052246, "learning_rate": 6.212716542550112e-05, "loss": 0.7786, "step": 372 },
{ "epoch": 0.4585125998770744, "grad_norm": 2.849158525466919, "learning_rate": 6.192733930601005e-05, "loss": 1.1914, "step": 373 },
{ "epoch": 0.4597418561770129, "grad_norm": 2.6154119968414307, "learning_rate": 6.172731098070899e-05, "loss": 0.9171, "step": 374 },
{ "epoch": 0.46097111247695144, "grad_norm": 3.5901479721069336, "learning_rate": 6.152708384070541e-05, "loss": 1.1269, "step": 375 },
{ "epoch": 0.46220036877688997, "grad_norm": 0.23536159098148346, "learning_rate": 6.132666128047732e-05, "loss": 0.8768, "step": 376 },
{ "epoch": 0.4634296250768285, "grad_norm": 0.24834086000919342, "learning_rate": 6.112604669781572e-05, "loss": 1.0644, "step": 377 },
{ "epoch": 0.46465888137676703, "grad_norm": 0.3041445016860962, "learning_rate": 6.0925243493767016e-05, "loss": 1.2779, "step": 378 },
{ "epoch": 0.4658881376767056, "grad_norm": 0.3158765137195587, "learning_rate": 6.0724255072575275e-05, "loss": 1.352, "step": 379 },
{ "epoch": 0.46711739397664415, "grad_norm": 0.2845201790332794, "learning_rate": 6.0523084841624635e-05, "loss": 1.2567, "step": 380 },
{ "epoch": 0.4683466502765827, "grad_norm": 0.2909673750400543, "learning_rate": 6.0321736211381464e-05, "loss": 1.1735, "step": 381 },
{ "epoch": 0.4695759065765212, "grad_norm": 0.2946690022945404, "learning_rate": 6.0120212595336545e-05, "loss": 1.1514, "step": 382 },
{ "epoch": 0.47080516287645974, "grad_norm": 0.302846223115921, "learning_rate": 5.9918517409947215e-05, "loss": 1.0621, "step": 383 },
{ "epoch": 0.47203441917639827, "grad_norm": 0.3197241425514221, "learning_rate": 5.971665407457948e-05, "loss": 1.0299, "step": 384 },
{ "epoch": 0.4732636754763368, "grad_norm": 0.342777281999588, "learning_rate": 5.951462601144998e-05, "loss": 1.0858, "step": 385 },
{ "epoch": 0.47449293177627533, "grad_norm": 0.3554008901119232, "learning_rate": 5.931243664556803e-05, "loss": 1.1441, "step": 386 },
{ "epoch": 0.4757221880762139, "grad_norm": 0.36057665944099426, "learning_rate": 5.9110089404677524e-05, "loss": 1.1836, "step": 387 },
{ "epoch": 0.47695144437615244, "grad_norm": 0.5004509091377258, "learning_rate": 5.890758771919884e-05, "loss": 1.4109, "step": 388 },
{ "epoch": 0.478180700676091, "grad_norm": 0.5211744904518127, "learning_rate": 5.8704935022170684e-05, "loss": 1.0097, "step": 389 },
{ "epoch": 0.4794099569760295, "grad_norm": 0.7474620938301086, "learning_rate": 5.8502134749191816e-05, "loss": 0.8777, "step": 390 },
{ "epoch": 0.48063921327596804, "grad_norm": 0.7044636011123657, "learning_rate": 5.8299190338362996e-05, "loss": 0.9007, "step": 391 },
{ "epoch": 0.48186846957590657, "grad_norm": 0.6484948992729187, "learning_rate": 5.8096105230228435e-05, "loss": 0.8261, "step": 392 },
{ "epoch": 0.4830977258758451, "grad_norm": 0.672816812992096, "learning_rate": 5.78928828677177e-05, "loss": 1.0531, "step": 393 },
{ "epoch": 0.4843269821757836, "grad_norm": 1.1637803316116333, "learning_rate": 5.768952669608724e-05, "loss": 1.1586, "step": 394 },
{ "epoch": 0.4855562384757222, "grad_norm": 3.1862003803253174, "learning_rate": 5.748604016286192e-05, "loss": 1.6232, "step": 395 },
{ "epoch": 0.48678549477566074, "grad_norm": 3.3833253383636475, "learning_rate": 5.728242671777672e-05, "loss": 1.0918, "step": 396 },
{ "epoch": 0.4880147510755993, "grad_norm": 3.116319417953491, "learning_rate": 5.707868981271815e-05, "loss": 0.8615, "step": 397 },
{ "epoch": 0.4892440073755378, "grad_norm": 2.5967965126037598, "learning_rate": 5.687483290166573e-05, "loss": 0.8579, "step": 398 },
{ "epoch": 0.49047326367547633, "grad_norm": 3.7683048248291016, "learning_rate": 5.6670859440633486e-05, "loss": 1.0777, "step": 399 },
{ "epoch": 0.49170251997541486, "grad_norm": 3.182317018508911, "learning_rate": 5.646677288761132e-05, "loss": 0.8932, "step": 400 },
{ "epoch": 0.4929317762753534, "grad_norm": 0.19372360408306122, "learning_rate": 5.6262576702506406e-05, "loss": 0.8516, "step": 401 },
{ "epoch": 0.4941610325752919, "grad_norm": 0.223338320851326, "learning_rate": 5.6058274347084504e-05, "loss": 1.1287, "step": 402 },
{ "epoch": 0.4953902888752305, "grad_norm": 0.23367400467395782, "learning_rate": 5.585386928491134e-05, "loss": 1.1128, "step": 403 },
{ "epoch": 0.49661954517516904, "grad_norm": 0.2717371881008148, "learning_rate": 5.5649364981293786e-05, "loss": 1.2813, "step": 404 },
{ "epoch": 0.49784880147510757, "grad_norm": 0.25909724831581116, "learning_rate": 5.54447649032212e-05, "loss": 1.2149, "step": 405 },
{ "epoch": 0.4990780577750461, "grad_norm": 0.25411197543144226, "learning_rate": 5.5240072519306606e-05, "loss": 1.0679, "step": 406 },
{ "epoch": 0.5003073140749846, "grad_norm": 0.2962128520011902, "learning_rate": 5.503529129972792e-05, "loss": 1.1156, "step": 407 },
{ "epoch": 0.5015365703749232, "grad_norm": 0.29252949357032776, "learning_rate": 5.483042471616908e-05, "loss": 1.125, "step": 408 },
{ "epoch": 0.5015365703749232, "eval_loss": 0.9695894122123718, "eval_runtime": 65.3254, "eval_samples_per_second": 10.486, "eval_steps_per_second": 5.251, "step": 408 },
{ "epoch": 0.5027658266748617, "grad_norm": 0.31012120842933655, "learning_rate": 5.4625476241761196e-05, "loss": 1.1491, "step": 409 },
{ "epoch": 0.5039950829748002, "grad_norm": 0.3182609975337982, "learning_rate": 5.442044935102375e-05, "loss": 1.0786, "step": 410 },
{ "epoch": 0.5052243392747388, "grad_norm": 0.35413217544555664, "learning_rate": 5.421534751980556e-05, "loss": 1.2194, "step": 411 },
{ "epoch": 0.5064535955746773, "grad_norm": 0.39505264163017273, "learning_rate": 5.401017422522594e-05, "loss": 1.0296, "step": 412 },
{ "epoch": 0.5076828518746158, "grad_norm": 0.4267808496952057, "learning_rate": 5.380493294561573e-05, "loss": 1.0906, "step": 413 },
{ "epoch": 0.5089121081745543, "grad_norm": 0.5495923161506653, "learning_rate": 5.359962716045835e-05, "loss": 1.2171, "step": 414 },
{ "epoch": 0.510141364474493, "grad_norm": 0.6385165452957153, "learning_rate": 5.3394260350330796e-05, "loss": 0.8179, "step": 415 },
{ "epoch": 0.5113706207744315, "grad_norm": 0.7211401462554932, "learning_rate": 5.318883599684456e-05, "loss": 0.7624, "step": 416 },
{ "epoch": 0.51259987707437, "grad_norm": 0.800375759601593, "learning_rate": 5.298335758258678e-05, "loss": 0.9597, "step": 417 },
{ "epoch": 0.5138291333743086, "grad_norm": 0.9896557927131653, "learning_rate": 5.2777828591060984e-05, "loss": 1.12, "step": 418 },
{ "epoch": 0.5150583896742471, "grad_norm": 2.869112968444824, "learning_rate": 5.257225250662823e-05, "loss": 1.2992, "step": 419 },
{ "epoch": 0.5162876459741856, "grad_norm": 3.394543409347534, "learning_rate": 5.236663281444791e-05, "loss": 1.298, "step": 420 },
{ "epoch": 0.5175169022741242, "grad_norm": 3.3043065071105957, "learning_rate": 5.21609730004187e-05, "loss": 1.1924, "step": 421 },
{ "epoch": 0.5187461585740627, "grad_norm": 2.603189706802368, "learning_rate": 5.1955276551119495e-05, "loss": 1.2155, "step": 422 },
{ "epoch": 0.5199754148740012, "grad_norm": 3.018608808517456, "learning_rate": 5.174954695375023e-05, "loss": 1.2001, "step": 423 },
{ "epoch": 0.5212046711739398, "grad_norm": 1.8929078578948975, "learning_rate": 5.154378769607286e-05, "loss": 0.7124, "step": 424 },
{ "epoch": 0.5224339274738783, "grad_norm": 2.4811208248138428, "learning_rate": 5.1338002266352106e-05, "loss": 0.8491, "step": 425 },
{ "epoch": 0.5236631837738168, "grad_norm": 0.22651559114456177, "learning_rate": 5.113219415329645e-05, "loss": 0.9983, "step": 426 },
{ "epoch": 0.5248924400737554, "grad_norm": 0.2817627787590027, "learning_rate": 5.0926366845998904e-05, "loss": 1.1314, "step": 427 },
{ "epoch": 0.5261216963736939, "grad_norm": 0.27301520109176636, "learning_rate": 5.072052383387786e-05, "loss": 1.2722, "step": 428 },
{ "epoch": 0.5273509526736324, "grad_norm": 0.2770383059978485, "learning_rate": 5.0514668606618e-05, "loss": 1.1654, "step": 429 },
{ "epoch": 0.5285802089735709, "grad_norm": 0.3147628903388977, "learning_rate": 5.0308804654111056e-05, "loss": 1.0896, "step": 430 },
{ "epoch": 0.5298094652735095, "grad_norm": 0.305074542760849, "learning_rate": 5.01029354663967e-05, "loss": 1.1754, "step": 431 },
{ "epoch": 0.5310387215734481, "grad_norm": 0.3240547776222229, "learning_rate": 4.9897064533603315e-05, "loss": 1.2413, "step": 432 },
{ "epoch": 0.5322679778733866, "grad_norm": 0.34143832325935364, "learning_rate": 4.9691195345888956e-05, "loss": 1.1504, "step": 433 },
{ "epoch": 0.5334972341733252, "grad_norm": 0.3945367634296417, "learning_rate": 4.948533139338202e-05, "loss": 1.2914, "step": 434 },
{ "epoch": 0.5347264904732637, "grad_norm": 0.371367871761322, "learning_rate": 4.927947616612215e-05, "loss": 1.3269, "step": 435 },
{ "epoch": 0.5359557467732022, "grad_norm": 0.4348413646221161, "learning_rate": 4.90736331540011e-05, "loss": 1.2061, "step": 436 },
{ "epoch": 0.5371850030731408, "grad_norm": 0.5112064480781555, "learning_rate": 4.886780584670356e-05, "loss": 1.2283, "step": 437 },
{ "epoch": 0.5384142593730793, "grad_norm": 0.6409791707992554, "learning_rate": 4.866199773364789e-05, "loss": 0.7663, "step": 438 },
{ "epoch": 0.5396435156730178, "grad_norm": 0.7076222896575928, "learning_rate": 4.845621230392716e-05, "loss": 0.8073, "step": 439 },
{ "epoch": 0.5408727719729564, "grad_norm": 0.6545782089233398, "learning_rate": 4.825045304624978e-05, "loss": 0.762, "step": 440 },
{ "epoch": 0.5421020282728949, "grad_norm": 0.6853161454200745, "learning_rate": 4.804472344888052e-05, "loss": 0.9905, "step": 441 },
{ "epoch": 0.5433312845728334, "grad_norm": 0.6758946776390076, "learning_rate": 4.7839026999581296e-05, "loss": 0.9353, "step": 442 },
{ "epoch": 0.544560540872772, "grad_norm": 1.1472845077514648, "learning_rate": 4.7633367185552095e-05, "loss": 0.9796, "step": 443 },
{ "epoch": 0.5457897971727105, "grad_norm": 2.543483018875122, "learning_rate": 4.742774749337179e-05, "loss": 1.5897, "step": 444 },
{ "epoch": 0.547019053472649, "grad_norm": 3.029282331466675, "learning_rate": 4.7222171408939034e-05, "loss": 1.2527, "step": 445 },
{ "epoch": 0.5482483097725875, "grad_norm": 3.1332318782806396, "learning_rate": 4.701664241741323e-05, "loss": 1.1093, "step": 446 },
{ "epoch": 0.5494775660725261, "grad_norm": 2.407163381576538, "learning_rate": 4.681116400315544e-05, "loss": 0.75, "step": 447 },
{ "epoch": 0.5507068223724647, "grad_norm": 2.622605085372925, "learning_rate": 4.6605739649669236e-05, "loss": 0.8924, "step": 448 },
{ "epoch": 0.5519360786724032, "grad_norm": 2.5338971614837646, "learning_rate": 4.640037283954165e-05, "loss": 0.681, "step": 449 },
{ "epoch": 0.5531653349723418, "grad_norm": 2.4635331630706787, "learning_rate": 4.619506705438428e-05, "loss": 0.7866, "step": 450 },
{ "epoch": 0.5543945912722803, "grad_norm": 0.18258269131183624, "learning_rate": 4.598982577477408e-05, "loss": 0.8069, "step": 451 },
{ "epoch": 0.5556238475722188, "grad_norm": 0.23453976213932037, "learning_rate": 4.578465248019445e-05, "loss": 1.1606, "step": 452 },
{ "epoch": 0.5568531038721574, "grad_norm": 0.25334998965263367, "learning_rate": 4.557955064897626e-05, "loss": 1.2323, "step": 453 },
{ "epoch": 0.5580823601720959, "grad_norm": 0.2757891118526459, "learning_rate": 4.537452375823881e-05, "loss": 1.4173, "step": 454 },
{ "epoch": 0.5593116164720344, "grad_norm": 0.2755081355571747, "learning_rate": 4.5169575283830936e-05, "loss": 1.2255, "step": 455 },
{ "epoch": 0.560540872771973, "grad_norm": 0.29173940420150757, "learning_rate": 4.496470870027209e-05, "loss": 1.4528, "step": 456 },
{ "epoch": 0.5617701290719115, "grad_norm": 0.27483510971069336, "learning_rate": 4.475992748069339e-05, "loss": 0.7703, "step": 457 },
{ "epoch": 0.56299938537185, "grad_norm": 0.3180837333202362, "learning_rate": 4.455523509677882e-05, "loss": 1.025, "step": 458 },
{ "epoch": 0.5642286416717885, "grad_norm": 0.3103027939796448, "learning_rate": 4.435063501870622e-05, "loss": 1.0503, "step": 459 },
{ "epoch": 0.5654578979717271, "grad_norm": 0.3448195457458496, "learning_rate": 4.4146130715088676e-05, "loss": 1.0366, "step": 460 },
{ "epoch": 0.5666871542716656, "grad_norm": 0.3892152011394501, "learning_rate": 4.3941725652915494e-05, "loss": 1.1834, "step": 461 },
{ "epoch": 0.5679164105716041, "grad_norm": 0.4770027697086334, "learning_rate": 4.373742329749362e-05, "loss": 1.4524, "step": 462 },
{ "epoch": 0.5691456668715427, "grad_norm": 0.5856226682662964, "learning_rate": 4.3533227112388694e-05, "loss": 1.2538, "step": 463 },
{ "epoch": 0.5703749231714813, "grad_norm": 0.6203077435493469, "learning_rate": 4.332914055936653e-05, "loss": 0.6276, "step": 464 },
{ "epoch": 0.5716041794714198, "grad_norm": 0.6294959187507629, "learning_rate": 4.3125167098334286e-05, "loss": 0.642, "step": 465 },
{ "epoch": 0.5728334357713584, "grad_norm": 0.7135624885559082, "learning_rate": 4.2921310187281864e-05, "loss": 1.1486, "step": 466 },
{ "epoch": 0.5740626920712969, "grad_norm": 0.7505180835723877, "learning_rate": 4.27175732822233e-05, "loss": 0.9109, "step": 467 },
{ "epoch": 0.5752919483712354, "grad_norm": 1.0719517469406128, "learning_rate": 4.251395983713809e-05, "loss": 1.2909, "step": 468 },
{ "epoch": 0.576521204671174, "grad_norm": 2.5531132221221924, "learning_rate": 4.231047330391278e-05, "loss": 1.1723, "step": 469 },
{ "epoch": 0.5777504609711125, "grad_norm": 3.104837656021118, "learning_rate": 4.21071171322823e-05, "loss": 1.3296, "step": 470 },
{ "epoch": 0.578979717271051, "grad_norm": 3.239119052886963, "learning_rate": 4.190389476977156e-05, "loss": 0.8768, "step": 471 },
{ "epoch": 0.5802089735709896, "grad_norm": 2.681823968887329, "learning_rate": 4.170080966163702e-05, "loss": 0.8809, "step": 472 },
{ "epoch": 0.5814382298709281, "grad_norm": 2.557533025741577, "learning_rate": 4.149786525080819e-05, "loss": 0.8249, "step": 473 },
{ "epoch": 0.5826674861708666, "grad_norm": 2.4591336250305176, "learning_rate": 4.1295064977829334e-05, "loss": 0.7384, "step": 474 },
{ "epoch": 0.5838967424708051, "grad_norm": 2.414395809173584, "learning_rate": 4.109241228080115e-05, "loss": 0.7307, "step": 475 },
{ "epoch": 0.5851259987707437, "grad_norm": 0.2034798115491867, "learning_rate": 4.088991059532248e-05, "loss": 0.8326, "step": 476 },
{ "epoch": 0.5863552550706822, "grad_norm": 0.24682076275348663, "learning_rate": 4.0687563354431984e-05, "loss": 1.3382, "step": 477 },
{ "epoch": 0.5875845113706207, "grad_norm": 0.2430254966020584, "learning_rate": 4.048537398855003e-05, "loss": 1.2718, "step": 478 },
{ "epoch": 0.5888137676705593, "grad_norm": 0.27249017357826233, "learning_rate": 4.028334592542054e-05, "loss": 1.2328, "step": 479 },
{ "epoch": 0.5900430239704979, "grad_norm": 0.28996631503105164, "learning_rate": 4.008148259005279e-05, "loss": 1.3963, "step": 480 },
{ "epoch": 0.5912722802704364, "grad_norm": 0.28658372163772583, "learning_rate": 3.9879787404663474e-05, "loss": 1.2326, "step": 481 },
{ "epoch": 0.592501536570375, "grad_norm": 0.3246002495288849, "learning_rate": 3.967826378861854e-05, "loss": 1.1071, "step": 482 },
{ "epoch": 0.5937307928703135, "grad_norm": 0.33610400557518005, "learning_rate": 3.947691515837537e-05, "loss": 1.1798, "step": 483 },
{ "epoch": 0.594960049170252, "grad_norm": 0.35427945852279663, "learning_rate": 3.927574492742473e-05, "loss": 1.3277, "step": 484 },
{ "epoch": 0.5961893054701906, "grad_norm": 0.3942146599292755, "learning_rate": 3.907475650623299e-05, "loss": 1.0499, "step": 485 },
{ "epoch": 0.5974185617701291, "grad_norm": 0.36504921317100525, "learning_rate": 3.887395330218429e-05, "loss": 1.1876, "step": 486 },
{ "epoch": 0.5986478180700676, "grad_norm": 0.45135965943336487, "learning_rate": 3.867333871952269e-05, "loss": 1.1775, "step": 487 },
{ "epoch": 0.5998770743700061, "grad_norm": 0.6297984719276428, "learning_rate": 3.84729161592946e-05, "loss": 0.9756, "step": 488 },
{ "epoch": 0.6011063306699447, "grad_norm": 0.6099340915679932, "learning_rate": 3.827268901929102e-05, "loss": 0.7669, "step": 489 },
{ "epoch": 0.6023355869698832, "grad_norm": 0.7176752686500549, "learning_rate": 3.8072660693989967e-05, "loss": 0.5387, "step": 490 },
{ "epoch": 0.6035648432698217, "grad_norm": 0.6262489557266235, "learning_rate": 3.78728345744989e-05, "loss": 0.9306, "step": 491 },
{ "epoch": 0.6047940995697603, "grad_norm": 1.0194954872131348, "learning_rate": 3.767321404849727e-05, "loss": 1.1677, "step": 492 },
{ "epoch": 0.6060233558696988, "grad_norm": 1.6236090660095215, "learning_rate": 3.7473802500179114e-05, "loss": 1.0458, "step": 493 },
{ "epoch": 0.6072526121696373, "grad_norm": 2.5714359283447266, "learning_rate": 3.727460331019559e-05, "loss": 1.2519, "step": 494 },
{ "epoch": 0.6084818684695759, "grad_norm": 2.4615423679351807, "learning_rate": 3.7075619855597744e-05, "loss": 0.5842, "step": 495 },
{ "epoch": 0.6097111247695144, "grad_norm": 3.4131109714508057, "learning_rate": 3.6876855509779206e-05, "loss": 1.3071, "step": 496 },
{ "epoch": 0.610940381069453, "grad_norm": 3.461987257003784, "learning_rate": 3.667831364241904e-05, "loss": 1.1426, "step": 497 },
{ "epoch": 0.6121696373693916, "grad_norm": 2.462989330291748, "learning_rate": 3.6479997619424605e-05, "loss": 0.8421, "step": 498 },
{ "epoch": 0.6133988936693301, "grad_norm": 2.423410654067993, "learning_rate": 3.628191080287451e-05, "loss": 0.7845, "step": 499 },
{ "epoch": 0.6146281499692686, "grad_norm": 2.5741212368011475, "learning_rate": 3.608405655096154e-05, "loss": 0.6916, "step": 500 },
{ "epoch": 0.6158574062692072, "grad_norm": 0.2071269154548645, "learning_rate": 3.588643821793582e-05, "loss": 0.9648, "step": 501 },
{ "epoch": 0.6170866625691457, "grad_norm": 0.23868902027606964, "learning_rate": 3.5689059154047915e-05, "loss": 1.261, "step": 502 },
{ "epoch": 0.6183159188690842, "grad_norm": 0.2465473711490631, "learning_rate": 3.5491922705492e-05, "loss": 1.2999, "step": 503 },
{ "epoch": 0.6195451751690227, "grad_norm": 0.25372591614723206, "learning_rate": 3.5295032214349196e-05, "loss": 1.3633, "step": 504 },
{ "epoch": 0.6207744314689613, "grad_norm": 0.24923810362815857, "learning_rate": 3.5098391018530816e-05, "loss": 1.1408, "step": 505 },
{ "epoch": 0.6220036877688998, "grad_norm": 0.2594156563282013, "learning_rate": 3.4902002451721916e-05, "loss": 1.158, "step": 506 },
{ "epoch": 0.6232329440688383, "grad_norm": 0.28343746066093445, "learning_rate": 3.4705869843324614e-05, "loss": 1.1717, "step": 507 },
{ "epoch": 0.6244622003687769, "grad_norm": 0.29203376173973083, "learning_rate": 3.450999651840179e-05, "loss": 1.0693, "step": 508 },
{ "epoch": 0.6256914566687154, "grad_norm": 0.3236950933933258, "learning_rate": 3.431438579762066e-05, "loss": 1.0098, "step": 509 },
{ "epoch": 0.6269207129686539, "grad_norm": 0.3463621139526367, "learning_rate": 3.411904099719642e-05, "loss": 1.1123, "step": 510 },
{ "epoch": 0.6281499692685925, "grad_norm": 0.4081440269947052, "learning_rate": 3.3923965428836105e-05, "loss": 1.1303, "step": 511 },
{ "epoch": 0.629379225568531, "grad_norm": 0.38460928201675415, "learning_rate": 3.3729162399682456e-05, "loss": 1.0858, "step": 512 },
{ "epoch": 0.6306084818684696, "grad_norm": 0.474970281124115, "learning_rate": 3.35346352122578e-05, "loss": 1.1176, "step": 513 },
{ "epoch": 0.6318377381684082, "grad_norm": 0.7059311270713806, "learning_rate": 3.3340387164408046e-05, "loss": 1.1786, "step": 514 },
{ "epoch": 0.6330669944683467, "grad_norm": 0.7394426465034485, "learning_rate": 3.314642154924686e-05, "loss": 0.7836, "step": 515 },
{ "epoch": 0.6342962507682852, "grad_norm": 0.766044020652771, "learning_rate": 3.295274165509979e-05, "loss": 0.8919, "step": 516 },
{ "epoch": 0.6355255070682237, "grad_norm": 0.7920165061950684, "learning_rate": 3.275935076544845e-05, "loss": 1.0439, "step": 517 },
{ "epoch": 0.6367547633681623, "grad_norm": 1.224188208580017, "learning_rate": 3.256625215887502e-05, "loss": 1.3103, "step": 518 },
{ "epoch": 0.6379840196681008, "grad_norm": 2.0128793716430664, "learning_rate": 3.237344910900648e-05, "loss": 1.0804, "step": 519 },
{ "epoch": 0.6392132759680393, "grad_norm": 3.370530366897583, "learning_rate": 3.218094488445923e-05, "loss": 0.5978, "step": 520 },
{ "epoch": 0.6404425322679779, "grad_norm": 3.095421314239502, "learning_rate": 3.198874274878365e-05, "loss": 0.847, "step": 521 },
{ "epoch": 0.6416717885679164, "grad_norm": 3.4100868701934814, "learning_rate": 3.179684596040878e-05, "loss": 0.8489, "step": 522 },
{ "epoch": 0.6429010448678549, "grad_norm": 2.608668088912964, "learning_rate": 3.1605257772587035e-05, "loss": 0.8378, "step": 523 },
{ "epoch": 0.6441303011677935, "grad_norm": 2.300096273422241, "learning_rate": 3.141398143333907e-05, "loss": 0.7825, "step": 524 },
{ "epoch": 0.645359557467732, "grad_norm": 2.652704954147339, "learning_rate": 3.1223020185398765e-05, "loss": 0.6505, "step": 525 },
{ "epoch": 0.6465888137676705, "grad_norm": 0.19123272597789764, "learning_rate": 3.103237726615822e-05, "loss": 0.8317, "step": 526 },
{ "epoch": 0.647818070067609, "grad_norm": 0.22544066607952118, "learning_rate": 3.084205590761284e-05, "loss": 1.1174, "step": 527 },
{ "epoch": 0.6490473263675476, "grad_norm": 0.25559771060943604, "learning_rate": 3.065205933630655e-05, "loss": 1.258, "step": 528 },
{ "epoch": 0.6502765826674862, "grad_norm": 0.2453518509864807, "learning_rate": 3.0462390773277154e-05, "loss": 1.1197, "step": 529 },
{ "epoch": 0.6515058389674248, "grad_norm": 0.26897767186164856, "learning_rate": 3.0273053434001662e-05, "loss": 1.2231, "step": 530 },
{ "epoch": 0.6527350952673633, "grad_norm": 0.2956353425979614, "learning_rate": 3.0084050528341824e-05, "loss": 0.9809, "step": 531 },
{ "epoch": 0.6539643515673018, "grad_norm": 0.3043264150619507, "learning_rate": 2.989538526048968e-05, "loss": 1.2625, "step": 532 },
{ "epoch": 0.6551936078672403, "grad_norm": 0.3115137219429016, "learning_rate": 2.9707060828913225e-05, "loss": 1.1544, "step": 533 },
{ "epoch": 0.6564228641671789, "grad_norm": 0.29051673412323, "learning_rate": 2.9519080426302238e-05, "loss": 0.8664, "step": 534 },
{ "epoch": 0.6576521204671174, "grad_norm": 0.31704720854759216, "learning_rate": 2.933144723951414e-05, "loss": 0.8553, "step": 535 },
{ "epoch": 0.6588813767670559, "grad_norm": 0.3343563675880432, "learning_rate": 2.9144164449519917e-05, "loss": 1.042, "step": 536 },
{ "epoch": 0.6601106330669945, "grad_norm": 0.411081999540329, "learning_rate": 2.895723523135028e-05, "loss": 1.2587, "step": 537 },
{ "epoch": 0.661339889366933, "grad_norm": 0.4190918505191803, "learning_rate": 2.877066275404172e-05, "loss": 1.1546, "step": 538 },
{ "epoch": 0.6625691456668715, "grad_norm": 0.5606351494789124, "learning_rate": 2.8584450180582912e-05, "loss": 0.9889, "step": 539 },
{ "epoch": 0.6637984019668101, "grad_norm": 0.6339558959007263, "learning_rate": 2.839860066786103e-05, "loss": 0.5278, "step": 540 },
{ "epoch": 0.6650276582667486, "grad_norm": 0.6629421710968018, "learning_rate": 2.8213117366608188e-05, "loss": 0.752, "step": 541 },
{ "epoch": 0.6662569145666871, "grad_norm": 0.7566842436790466, "learning_rate": 2.802800342134807e-05, "loss": 1.0395, "step": 542 },
{ "epoch": 0.6674861708666256, "grad_norm": 1.169986605644226, "learning_rate": 2.784326197034266e-05, "loss": 1.1942, "step": 543 },
{ "epoch": 0.6687154271665642, "grad_norm": 2.78249192237854, "learning_rate": 2.7658896145538983e-05, "loss": 1.1694, "step": 544 },
{ "epoch": 0.6699446834665027, "grad_norm": 2.5933189392089844, "learning_rate": 2.7474909072515993e-05, "loss": 0.9325, "step": 545 },
{ "epoch": 0.6711739397664414, "grad_norm": 2.37711238861084, "learning_rate": 2.7291303870431662e-05, "loss": 0.9246, "step": 546 },
{ "epoch": 0.6724031960663799, "grad_norm": 3.3568241596221924, "learning_rate": 2.710808365197e-05, "loss": 0.9492, "step": 547 },
{ "epoch": 0.6736324523663184, "grad_norm": 2.975985527038574, "learning_rate": 2.6925251523288346e-05, "loss": 0.8018, "step": 548 },
{ "epoch": 0.6748617086662569, "grad_norm": 2.78823184967041, "learning_rate": 2.674281058396473e-05, "loss": 0.843, "step": 549 },
{ "epoch": 0.6760909649661955, "grad_norm": 4.179988384246826, "learning_rate": 2.6560763926945275e-05, "loss": 1.0288, "step": 550 },
{ "epoch": 0.677320221266134, "grad_norm": 0.21164661645889282, "learning_rate": 2.6379114638491807e-05, "loss": 0.9468, "step": 551 },
{ "epoch": 0.6785494775660725, "grad_norm": 0.2241944521665573, "learning_rate": 2.6197865798129462e-05, "loss": 1.1578, "step": 552 },
{ "epoch": 0.6797787338660111, "grad_norm": 0.25557973980903625, "learning_rate": 2.601702047859455e-05, "loss": 1.2432, "step": 553 },
{ "epoch": 0.6810079901659496, "grad_norm": 0.2733787000179291, "learning_rate": 2.5836581745782475e-05, "loss": 1.2624, "step": 554 },
{ "epoch": 0.6822372464658881, "grad_norm": 0.29645049571990967, "learning_rate": 2.5656552658695642e-05, "loss": 1.2544, "step": 555 },
{ "epoch": 0.6834665027658267, "grad_norm": 0.29211127758026123, "learning_rate": 2.5476936269391726e-05, "loss": 1.0548, "step": 556 },
{ "epoch": 0.6846957590657652, "grad_norm": 0.3013113737106323, "learning_rate": 2.5297735622931874e-05, "loss": 1.0941, "step": 557 },
{ "epoch": 0.6859250153657037, "grad_norm": 0.33897635340690613, "learning_rate": 2.5118953757329088e-05, "loss": 1.0855, "step": 558 },
{ "epoch": 0.6871542716656422, "grad_norm": 0.36230790615081787, "learning_rate": 2.494059370349673e-05, "loss": 1.0817, "step": 559 },
{ "epoch": 0.6883835279655808, "grad_norm": 0.36360612511634827, "learning_rate": 2.4762658485197123e-05, "loss": 1.0868, "step": 560 },
{ "epoch": 0.6896127842655193, "grad_norm": 0.45407038927078247, "learning_rate": 2.4585151118990286e-05, "loss": 1.1186, "step": 561 },
{ "epoch": 0.690842040565458, "grad_norm": 0.5940976142883301, "learning_rate": 2.4408074614182773e-05, "loss": 1.1448, "step": 562 },
{ "epoch": 0.6920712968653965, "grad_norm": 0.6550815105438232, "learning_rate": 2.4231431972776758e-05, "loss": 0.931, "step": 563 },
{ "epoch": 0.693300553165335, "grad_norm": 0.6394757032394409, "learning_rate": 2.4055226189419018e-05, "loss": 0.8543, "step": 564 },
{ "epoch": 0.6945298094652735, "grad_norm": 0.6903315186500549, "learning_rate": 2.3879460251350255e-05, "loss": 0.8152, "step": 565 },
{ "epoch": 0.6957590657652121, "grad_norm": 0.9499403238296509, "learning_rate": 2.3704137138354355e-05, "loss": 1.2532, "step": 566 },
{ "epoch": 0.6969883220651506, "grad_norm": 1.2395949363708496, "learning_rate": 2.3529259822708e-05, "loss": 1.3153, "step": 567 },
{ "epoch": 0.6982175783650891, "grad_norm": 2.5209338665008545, "learning_rate": 2.3354831269130133e-05, "loss": 1.4484, "step": 568 },
{ "epoch": 0.6994468346650277, "grad_norm": 3.072936773300171, "learning_rate": 2.318085443473185e-05, "loss": 0.9566, "step": 569 },
{ "epoch": 0.7006760909649662, "grad_norm": 2.40030574798584, "learning_rate": 2.300733226896612e-05, "loss": 0.6958, "step": 570 },
{ "epoch": 0.7019053472649047, "grad_norm": 3.1356310844421387, "learning_rate": 2.2834267713577904e-05, "loss": 1.261, "step": 571 },
{ "epoch": 0.7031346035648433, "grad_norm": 2.5506911277770996, "learning_rate": 2.2661663702554208e-05, "loss": 0.7489, "step": 572 },
{ "epoch": 0.7043638598647818, "grad_norm": 2.605987548828125, "learning_rate": 2.2489523162074393e-05, "loss": 0.8642, "step": 573 },
{ "epoch": 0.7055931161647203, "grad_norm": 2.3878705501556396, "learning_rate": 2.2317849010460507e-05, "loss": 0.559, "step": 574 },
{ "epoch": 0.7068223724646588, "grad_norm": 4.309600353240967, "learning_rate": 2.2146644158127827e-05, "loss": 1.3898, "step": 575 },
{ "epoch": 0.7080516287645974, "grad_norm": 0.2340036779642105, "learning_rate": 2.197591150753559e-05, "loss": 1.1237, "step": 576 },
{ "epoch": 0.7092808850645359, "grad_norm": 0.2404404878616333, "learning_rate": 2.1805653953137707e-05, "loss": 1.2235, "step": 577 },
{ "epoch": 0.7105101413644745, "grad_norm": 0.2546350657939911, "learning_rate": 2.1635874381333714e-05, "loss": 1.1931, "step": 578 },
{ "epoch": 0.7117393976644131, "grad_norm": 0.28081899881362915, "learning_rate": 2.1466575670419876e-05, "loss": 1.1851, "step": 579 },
{ "epoch": 0.7129686539643516, "grad_norm": 0.27532532811164856, "learning_rate": 2.1297760690540302e-05, "loss": 1.0733, "step": 580 },
{ "epoch": 0.7141979102642901, "grad_norm": 0.26407289505004883, "learning_rate": 2.1129432303638352e-05, "loss": 1.0863, "step": 581 },
{ "epoch": 0.7154271665642287, "grad_norm": 0.31387853622436523, "learning_rate": 2.0961593363408156e-05, "loss": 1.1808, "step": 582 },
{ "epoch": 0.7166564228641672, "grad_norm": 0.3309866487979889, "learning_rate": 2.079424671524616e-05, "loss": 1.0741, "step": 583 },
{ "epoch": 0.7178856791641057, "grad_norm": 0.300689697265625, "learning_rate": 2.0627395196202898e-05, "loss": 0.9483, "step": 584 },
{ "epoch": 0.7191149354640443, "grad_norm": 0.3478316366672516, "learning_rate": 2.046104163493493e-05, "loss": 1.144, "step": 585 },
{ "epoch": 0.7203441917639828, "grad_norm": 0.44858890771865845, "learning_rate": 2.0295188851656892e-05, "loss": 1.1384, "step": 586 },
{ "epoch": 0.7215734480639213, "grad_norm": 0.4991127550601959, "learning_rate": 2.0129839658093607e-05, "loss": 1.3527, "step": 587 },
{ "epoch": 0.7228027043638598, "grad_norm": 0.5499400496482849, "learning_rate": 1.996499685743254e-05, "loss": 1.2116, "step": 588 },
{ "epoch": 0.7240319606637984, "grad_norm": 0.6611432433128357, "learning_rate": 1.980066324427613e-05, "loss": 0.5055, "step": 589 },
{ "epoch": 0.7252612169637369, "grad_norm": 0.7008240222930908, "learning_rate": 1.9636841604594557e-05, "loss": 0.7072, "step": 590 },
{ "epoch": 0.7264904732636754, "grad_norm": 0.7243725061416626, "learning_rate": 1.9473534715678427e-05, "loss": 0.8031, "step": 591 },
{ "epoch": 0.727719729563614, "grad_norm": 0.8577756285667419, "learning_rate": 1.9310745346091714e-05, "loss": 1.1383, "step": 592 },
{ "epoch": 0.7289489858635525, "grad_norm": 1.2171465158462524, "learning_rate": 1.9148476255624764e-05, "loss": 0.9733, "step": 593 },
{ "epoch": 0.7301782421634911, "grad_norm": 3.537208318710327, "learning_rate": 1.898673019524764e-05, "loss": 1.1197, "step": 594 },
{ "epoch": 0.7314074984634297, "grad_norm": 2.420771360397339, "learning_rate": 1.8825509907063327e-05, "loss": 0.7744, "step": 595 },
{ "epoch": 0.7326367547633682, "grad_norm": 3.1013336181640625, "learning_rate": 1.8664818124261374e-05, "loss": 0.7931, "step": 596 },
{ "epoch": 0.7338660110633067, "grad_norm": 3.5393359661102295, "learning_rate": 1.8504657571071515e-05, "loss": 1.0527, "step": 597 },
{ "epoch": 0.7350952673632453, "grad_norm": 2.782148838043213, "learning_rate": 1.8345030962717407e-05, "loss": 0.8034, "step": 598 },
{ "epoch": 0.7363245236631838, "grad_norm": 2.12009596824646, "learning_rate": 1.8185941005370745e-05, "loss": 0.4618, "step": 599 },
{ "epoch": 0.7375537799631223, "grad_norm": 3.6186935901641846, "learning_rate": 1.802739039610527e-05, "loss": 0.793, "step": 600 },
{ "epoch": 0.7387830362630609, "grad_norm": 0.22188998758792877, "learning_rate": 1.786938182285107e-05, "loss": 1.1901, "step": 601 },
{ "epoch": 0.7400122925629994, "grad_norm": 0.2288677990436554, "learning_rate": 1.7711917964349062e-05, "loss": 1.1755, "step": 602 },
{ "epoch": 0.7412415488629379, "grad_norm": 0.25471025705337524, "learning_rate": 1.7555001490105488e-05, "loss": 1.2616, "step": 603 },
{ "epoch": 0.7424708051628764, "grad_norm": 0.23840656876564026, "learning_rate": 1.7398635060346746e-05, "loss": 1.0882, "step": 604 },
{ "epoch": 0.743700061462815, "grad_norm": 0.2534352242946625, "learning_rate": 1.7242821325974258e-05, "loss": 1.1328, "step": 605 },
{ "epoch": 0.7449293177627535, "grad_norm": 0.279732346534729, "learning_rate": 1.7087562928519514e-05, "loss": 1.1703, "step": 606 },
{ "epoch": 0.746158574062692, "grad_norm": 0.3084731698036194, "learning_rate": 1.69328625000993e-05, "loss": 1.1646, "step": 607 },
{ "epoch": 0.7473878303626306, "grad_norm": 0.3052552044391632, "learning_rate": 1.6778722663371053e-05, "loss": 1.1481, "step": 608 },
{ "epoch": 0.7486170866625691, "grad_norm": 0.3502384126186371, "learning_rate": 1.662514603148847e-05, "loss": 1.0333, "step": 609 },
{ "epoch": 0.7498463429625076, "grad_norm": 0.3393804728984833, "learning_rate": 1.6472135208057126e-05, "loss": 1.0056, "step": 610 },
{ "epoch": 0.7510755992624463, "grad_norm": 0.3970140814781189, "learning_rate": 1.631969278709041e-05, "loss": 1.1957, "step": 611 },
{ "epoch": 0.7523048555623848, "grad_norm": 0.4345152974128723, "learning_rate": 1.616782135296544e-05, "loss": 1.0145, "step": 612 },
{ "epoch": 0.7523048555623848, "eval_loss": 0.9307632446289062, "eval_runtime": 64.7426, "eval_samples_per_second": 10.58, "eval_steps_per_second": 5.298, "step": 612 },
{ "epoch": 0.7535341118623233, "grad_norm": 0.45276275277137756, "learning_rate": 1.6016523480379382e-05, "loss": 0.8856, "step": 613 },
{ "epoch": 0.7547633681622619, "grad_norm": 0.6476858854293823, "learning_rate": 1.5865801734305668e-05, "loss": 0.8316, "step": 614 },
{ "epoch": 0.7559926244622004, "grad_norm": 0.6815754771232605, "learning_rate": 1.5715658669950634e-05, "loss": 0.6487, "step": 615 },
{ "epoch": 0.7572218807621389, "grad_norm": 0.7536165118217468, "learning_rate": 1.5566096832710154e-05, "loss": 0.9835, "step": 616 },
{ "epoch": 0.7584511370620775, "grad_norm": 1.0182470083236694, "learning_rate": 1.541711875812641e-05, "loss": 1.172, "step": 617 },
{ "epoch": 0.759680393362016, "grad_norm": 2.240806818008423, "learning_rate": 1.5268726971845037e-05, "loss": 1.3291, "step": 618 },
{ "epoch": 0.7609096496619545, "grad_norm": 3.0141806602478027, "learning_rate": 1.5120923989572244e-05, "loss": 0.8101, "step": 619 },
{ "epoch": 0.762138905961893, "grad_norm": 3.7101502418518066, "learning_rate": 1.4973712317032135e-05, "loss": 1.0581, "step": 620 },
{ "epoch": 0.7633681622618316, "grad_norm": 3.892627000808716, "learning_rate": 1.482709444992425e-05, "loss": 1.007, "step": 621 },
{ "epoch": 0.7645974185617701, "grad_norm": 3.3281090259552, "learning_rate": 1.4681072873881312e-05, "loss": 0.7403, "step": 622 },
{ "epoch": 0.7658266748617086, "grad_norm": 3.755115509033203, "learning_rate": 1.4535650064427003e-05, "loss": 0.9939, "step": 623 },
{ "epoch": 0.7670559311616472, "grad_norm": 2.45129132270813, "learning_rate": 1.439082848693406e-05, "loss": 0.6902, "step": 624 },
{ "epoch": 0.7682851874615857, "grad_norm": 3.0480661392211914, "learning_rate": 1.4246610596582444e-05, "loss": 0.6467, "step": 625 },
{ "epoch": 0.7695144437615242, "grad_norm": 0.2060498744249344, "learning_rate": 1.41029988383177e-05, "loss": 0.9195, "step": 626 },
{ "epoch": 0.7707437000614629, "grad_norm": 0.21947383880615234, "learning_rate": 1.3959995646809549e-05, "loss": 1.0644, "step": 627 },
{ "epoch": 0.7719729563614014, "grad_norm": 0.26282572746276855, "learning_rate": 1.381760344641061e-05, "loss": 1.3907, "step": 628 },
{ "epoch": 0.7732022126613399, "grad_norm": 0.2563353180885315, "learning_rate": 1.3675824651115276e-05, "loss": 1.2281, "step": 629 },
{ "epoch": 0.7744314689612785, "grad_norm": 0.3003043532371521, "learning_rate": 1.3534661664518817e-05, "loss": 1.2247, "step": 630 },
{ "epoch": 0.775660725261217, "grad_norm": 0.30270689725875854, "learning_rate": 1.339411687977657e-05, "loss": 1.3467, "step": 631 },
{ "epoch": 0.7768899815611555, "grad_norm": 0.3199015259742737, "learning_rate": 1.325419267956346e-05, "loss": 1.1064, "step": 632 },
{ "epoch": 0.778119237861094, "grad_norm": 0.33701401948928833, "learning_rate": 1.3114891436033522e-05, "loss": 1.1541, "step": 633 },
{ "epoch": 0.7793484941610326, "grad_norm": 0.3596295714378357, "learning_rate": 1.2976215510779755e-05, "loss": 1.0702, "step": 634 },
{ "epoch": 0.7805777504609711, "grad_norm": 0.4676484167575836, "learning_rate": 1.2838167254794004e-05, "loss": 0.9612, "step": 635 },
{ "epoch": 0.7818070067609096, "grad_norm": 0.43727314472198486, "learning_rate": 1.2700749008427205e-05, "loss": 1.171, "step": 636 },
{ "epoch": 0.7830362630608482, "grad_norm": 0.6096376180648804, "learning_rate": 1.2563963101349619e-05, "loss": 1.2183, "step": 637 },
{ "epoch": 0.7842655193607867, "grad_norm": 0.7051495909690857, "learning_rate": 1.2427811852511395e-05, "loss": 0.7788, "step": 638 },
{ "epoch": 0.7854947756607252, "grad_norm": 0.6735429167747498, "learning_rate": 1.2292297570103229e-05, "loss": 0.7001, "step": 639 },
{ "epoch": 0.7867240319606638, "grad_norm": 0.740139901638031, "learning_rate": 1.2157422551517228e-05, "loss": 0.8659, "step": 640 },
{ "epoch": 0.7879532882606023, "grad_norm": 0.8181758522987366, "learning_rate": 1.202318908330795e-05, "loss": 1.1304, "step": 641 },
{ "epoch": 0.7891825445605408, "grad_norm": 1.1049834489822388, "learning_rate": 1.188959944115372e-05, "loss": 1.0549, "step": 642 },
{ "epoch": 0.7904118008604795, "grad_norm": 2.664985418319702, "learning_rate": 1.1756655889817953e-05, "loss": 1.2242, "step": 643 },
{ "epoch": 0.791641057160418, "grad_norm": 3.3801634311676025, "learning_rate": 1.1624360683110819e-05, "loss": 0.5464, "step": 644 },
{ "epoch": 0.7928703134603565, "grad_norm": 3.139307975769043, "learning_rate": 1.1492716063850973e-05, "loss": 0.9791, "step": 645 },
{ "epoch": 0.794099569760295, "grad_norm": 3.501133918762207, "learning_rate": 1.1361724263827633e-05, "loss": 1.0201, "step": 646 },
{ "epoch": 0.7953288260602336, "grad_norm": 2.9995079040527344, "learning_rate": 1.123138750376262e-05, "loss": 1.0347, "step": 647 },
{ "epoch": 0.7965580823601721, "grad_norm": 3.3414182662963867, "learning_rate": 1.1101707993272825e-05, "loss": 0.912, "step": 648 },
{ "epoch": 0.7977873386601106, "grad_norm": 2.3033857345581055, "learning_rate": 1.097268793083266e-05, "loss": 0.551, "step": 649 },
{ "epoch": 0.7990165949600492, "grad_norm": 2.474606990814209, "learning_rate": 1.084432950373685e-05, "loss": 0.4882, "step": 650 },
{ "epoch": 0.8002458512599877, "grad_norm": 0.21653778851032257, "learning_rate": 1.071663488806331e-05, "loss": 1.1304, "step": 651 },
{ "epoch": 0.8014751075599262, "grad_norm": 0.19837996363639832, "learning_rate": 1.0589606248636292e-05, "loss": 0.8167, "step": 652 },
{ "epoch": 0.8027043638598648, "grad_norm": 0.2383168786764145, "learning_rate": 1.0463245738989636e-05, "loss": 1.0729, "step": 653 },
{ "epoch": 0.8039336201598033, "grad_norm": 0.2854044437408447, "learning_rate": 1.0337555501330281e-05, "loss": 1.1909, "step": 654 },
{ "epoch": 0.8051628764597418, "grad_norm": 0.2671623229980469, "learning_rate": 1.0212537666501976e-05, "loss": 1.2883, "step": 655 },
{ "epoch": 0.8063921327596804, "grad_norm": 0.2792122960090637, "learning_rate": 1.0088194353949137e-05, "loss": 1.0991, "step": 656 },
{ "epoch": 0.8076213890596189, "grad_norm": 0.2957600951194763, "learning_rate": 9.96452767168089e-06, "loss": 1.0984, "step": 657 },
{ "epoch": 0.8088506453595574, "grad_norm": 0.31414932012557983, "learning_rate": 9.841539716235387e-06, "loss": 1.0569, "step": 658 },
{ "epoch": 0.8100799016594961, "grad_norm": 0.3483067452907562, "learning_rate": 9.719232572644187e-06, "loss": 1.0767, "step": 659 },
{ "epoch": 0.8113091579594346, "grad_norm": 0.3564923107624054, "learning_rate": 9.597608314396978e-06, "loss": 1.2318, "step": 660 },
{ "epoch": 0.8125384142593731, "grad_norm": 0.40873420238494873, "learning_rate": 9.476669003406403e-06, "loss": 1.2112, "step": 661 },
{ "epoch": 0.8137676705593117, "grad_norm": 0.4222586452960968, "learning_rate": 9.356416689973108e-06, "loss": 1.066, "step": 662 },
{ "epoch": 0.8149969268592502, "grad_norm": 0.5865674614906311, "learning_rate": 9.236853412750935e-06, "loss": 1.0605, "step": 663 },
{ "epoch": 0.8162261831591887, "grad_norm": 0.7420896291732788, "learning_rate": 9.11798119871245e-06, "loss": 0.7511, "step": 664 },
{ "epoch": 0.8174554394591272, "grad_norm": 0.6792820692062378, "learning_rate": 8.99980206311452e-06, "loss": 0.9264, "step": 665 },
{ "epoch": 0.8186846957590658, "grad_norm": 0.6937956213951111, "learning_rate": 8.882318009464125e-06, "loss": 0.6352, "step": 666 },
{ "epoch": 0.8199139520590043, "grad_norm": 0.7615432739257812, "learning_rate": 8.765531029484476e-06, "loss": 0.9749, "step": 667 },
{ "epoch": 0.8211432083589428, "grad_norm": 1.0325685739517212, "learning_rate": 8.64944310308114e-06, "loss": 1.2544, "step": 668 },
{ "epoch": 0.8223724646588814, "grad_norm": 3.312368631362915, "learning_rate": 8.534056198308582e-06, "loss": 1.1743, "step": 669 },
{ "epoch": 0.8236017209588199, "grad_norm": 2.5028300285339355, "learning_rate": 8.419372271336745e-06, "loss": 0.6261, "step": 670 },
{ "epoch": 0.8248309772587584, "grad_norm": 2.5545358657836914, "learning_rate": 8.305393266417887e-06, "loss": 0.6315, "step": 671 },
{ "epoch": 0.826060233558697, "grad_norm": 2.4186129570007324, "learning_rate": 8.192121115853602e-06, "loss": 0.6056, "step": 672 },
{ "epoch": 0.8272894898586355, "grad_norm": 2.6672956943511963, "learning_rate": 8.079557739962128e-06, "loss": 0.644, "step": 673 },
{ "epoch": 0.828518746158574, "grad_norm": 3.801270008087158, "learning_rate": 7.967705047045715e-06, "loss": 1.0116, "step": 674 },
{ "epoch": 0.8297480024585125, "grad_norm": 2.186417579650879, "learning_rate": 7.856564933358324e-06, "loss": 0.4153, "step": 675 },
{ |
|
"epoch": 0.8309772587584512, |
|
"grad_norm": 0.2179417610168457, |
|
"learning_rate": 7.746139283073473e-06, |
|
"loss": 1.0057, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.8322065150583897, |
|
"grad_norm": 0.2457554191350937, |
|
"learning_rate": 7.636429968252257e-06, |
|
"loss": 1.2948, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.8334357713583282, |
|
"grad_norm": 0.2506256401538849, |
|
"learning_rate": 7.527438848811652e-06, |
|
"loss": 1.1521, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.8346650276582668, |
|
"grad_norm": 0.25727221369743347, |
|
"learning_rate": 7.4191677724929906e-06, |
|
"loss": 1.1952, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.8358942839582053, |
|
"grad_norm": 0.265831857919693, |
|
"learning_rate": 7.31161857483057e-06, |
|
"loss": 1.1952, |
|
"step": 680 |
|
}, |
|
    {
      "epoch": 0.8371235402581438,
      "grad_norm": 0.27371159195899963,
      "learning_rate": 7.204793079120636e-06,
      "loss": 1.1563,
      "step": 681
    },
    {
      "epoch": 0.8383527965580824,
      "grad_norm": 0.30959704518318176,
      "learning_rate": 7.0986930963903575e-06,
      "loss": 1.122,
      "step": 682
    },
    {
      "epoch": 0.8395820528580209,
      "grad_norm": 0.335997074842453,
      "learning_rate": 6.993320425367222e-06,
      "loss": 1.0265,
      "step": 683
    },
    {
      "epoch": 0.8408113091579594,
      "grad_norm": 0.3063446879386902,
      "learning_rate": 6.8886768524485e-06,
      "loss": 0.8463,
      "step": 684
    },
    {
      "epoch": 0.842040565457898,
      "grad_norm": 0.34628668427467346,
      "learning_rate": 6.7847641516709635e-06,
      "loss": 0.9597,
      "step": 685
    },
    {
      "epoch": 0.8432698217578365,
      "grad_norm": 0.38880714774131775,
      "learning_rate": 6.681584084680787e-06,
      "loss": 0.9935,
      "step": 686
    },
    {
      "epoch": 0.844499078057775,
      "grad_norm": 0.45289453864097595,
      "learning_rate": 6.579138400703716e-06,
      "loss": 0.911,
      "step": 687
    },
    {
      "epoch": 0.8457283343577136,
      "grad_norm": 0.697723388671875,
      "learning_rate": 6.4774288365154035e-06,
      "loss": 0.9489,
      "step": 688
    },
    {
      "epoch": 0.8469575906576521,
      "grad_norm": 0.7846165895462036,
      "learning_rate": 6.376457116411971e-06,
      "loss": 0.9332,
      "step": 689
    },
    {
      "epoch": 0.8481868469575906,
      "grad_norm": 0.6980641484260559,
      "learning_rate": 6.2762249521807645e-06,
      "loss": 0.8282,
      "step": 690
    },
    {
      "epoch": 0.8494161032575291,
      "grad_norm": 0.6430233716964722,
      "learning_rate": 6.17673404307132e-06,
      "loss": 0.7686,
      "step": 691
    },
    {
      "epoch": 0.8506453595574678,
      "grad_norm": 0.7281519174575806,
      "learning_rate": 6.077986075766612e-06,
      "loss": 0.9116,
      "step": 692
    },
    {
      "epoch": 0.8518746158574063,
      "grad_norm": 1.0536761283874512,
      "learning_rate": 5.979982724354366e-06,
      "loss": 1.1009,
      "step": 693
    },
    {
      "epoch": 0.8531038721573448,
      "grad_norm": 1.9653775691986084,
      "learning_rate": 5.882725650298787e-06,
      "loss": 1.1204,
      "step": 694
    },
    {
      "epoch": 0.8543331284572834,
      "grad_norm": 3.2125988006591797,
      "learning_rate": 5.7862165024123175e-06,
      "loss": 1.3958,
      "step": 695
    },
    {
      "epoch": 0.8555623847572219,
      "grad_norm": 3.528750419616699,
      "learning_rate": 5.690456916827691e-06,
      "loss": 0.9974,
      "step": 696
    },
    {
      "epoch": 0.8567916410571604,
      "grad_norm": 3.617098808288574,
      "learning_rate": 5.5954485169702306e-06,
      "loss": 1.1263,
      "step": 697
    },
    {
      "epoch": 0.858020897357099,
      "grad_norm": 2.575523614883423,
      "learning_rate": 5.501192913530301e-06,
      "loss": 0.6504,
      "step": 698
    },
    {
      "epoch": 0.8592501536570375,
      "grad_norm": 3.0249316692352295,
      "learning_rate": 5.407691704435991e-06,
      "loss": 0.6346,
      "step": 699
    },
    {
      "epoch": 0.860479409956976,
      "grad_norm": 2.3461408615112305,
      "learning_rate": 5.314946474826066e-06,
      "loss": 0.4619,
      "step": 700
    },
    {
      "epoch": 0.8617086662569146,
      "grad_norm": 0.2000708281993866,
      "learning_rate": 5.222958797023036e-06,
      "loss": 0.9165,
      "step": 701
    },
    {
      "epoch": 0.8629379225568531,
      "grad_norm": 0.23434923589229584,
      "learning_rate": 5.13173023050656e-06,
      "loss": 1.1903,
      "step": 702
    },
    {
      "epoch": 0.8641671788567916,
      "grad_norm": 0.25243085622787476,
      "learning_rate": 5.041262321886958e-06,
      "loss": 1.2185,
      "step": 703
    },
    {
      "epoch": 0.8653964351567301,
      "grad_norm": 0.25832071900367737,
      "learning_rate": 4.951556604879048e-06,
      "loss": 1.2881,
      "step": 704
    },
    {
      "epoch": 0.8666256914566687,
      "grad_norm": 0.25066548585891724,
      "learning_rate": 4.862614600276061e-06,
      "loss": 1.1272,
      "step": 705
    },
    {
      "epoch": 0.8678549477566072,
      "grad_norm": 0.27538540959358215,
      "learning_rate": 4.774437815923938e-06,
      "loss": 1.003,
      "step": 706
    },
    {
      "epoch": 0.8690842040565457,
      "grad_norm": 0.27678731083869934,
      "learning_rate": 4.687027746695727e-06,
      "loss": 1.0579,
      "step": 707
    },
    {
      "epoch": 0.8703134603564844,
      "grad_norm": 0.3305651843547821,
      "learning_rate": 4.600385874466256e-06,
      "loss": 1.0734,
      "step": 708
    },
    {
      "epoch": 0.8715427166564229,
      "grad_norm": 0.3679453134536743,
      "learning_rate": 4.514513668087011e-06,
      "loss": 1.1202,
      "step": 709
    },
    {
      "epoch": 0.8727719729563614,
      "grad_norm": 0.32898518443107605,
      "learning_rate": 4.429412583361209e-06,
      "loss": 1.1176,
      "step": 710
    },
    {
      "epoch": 0.8740012292563,
      "grad_norm": 0.37726449966430664,
      "learning_rate": 4.34508406301915e-06,
      "loss": 1.1331,
      "step": 711
    },
    {
      "epoch": 0.8752304855562385,
      "grad_norm": 0.44210806488990784,
      "learning_rate": 4.261529536693737e-06,
      "loss": 0.9606,
      "step": 712
    },
    {
      "epoch": 0.876459741856177,
      "grad_norm": 0.5424479246139526,
      "learning_rate": 4.178750420896255e-06,
      "loss": 0.9004,
      "step": 713
    },
    {
      "epoch": 0.8776889981561156,
      "grad_norm": 0.7757157683372498,
      "learning_rate": 4.0967481189923384e-06,
      "loss": 0.8007,
      "step": 714
    },
    {
      "epoch": 0.8789182544560541,
      "grad_norm": 0.6526496410369873,
      "learning_rate": 4.015524021178196e-06,
      "loss": 0.7106,
      "step": 715
    },
    {
      "epoch": 0.8801475107559926,
      "grad_norm": 0.6553987264633179,
      "learning_rate": 3.935079504457034e-06,
      "loss": 0.668,
      "step": 716
    },
    {
      "epoch": 0.8813767670559312,
      "grad_norm": 0.795111358165741,
      "learning_rate": 3.8554159326157304e-06,
      "loss": 1.0273,
      "step": 717
    },
    {
      "epoch": 0.8826060233558697,
      "grad_norm": 1.3710993528366089,
      "learning_rate": 3.7765346562016744e-06,
      "loss": 1.0677,
      "step": 718
    },
    {
      "epoch": 0.8838352796558082,
      "grad_norm": 2.689974308013916,
      "learning_rate": 3.6984370124999058e-06,
      "loss": 1.3152,
      "step": 719
    },
    {
      "epoch": 0.8850645359557467,
      "grad_norm": 3.0526108741760254,
      "learning_rate": 3.621124325510422e-06,
      "loss": 0.7108,
      "step": 720
    },
    {
      "epoch": 0.8862937922556853,
      "grad_norm": 2.8882553577423096,
      "learning_rate": 3.5445979059257505e-06,
      "loss": 0.7952,
      "step": 721
    },
    {
      "epoch": 0.8875230485556238,
      "grad_norm": 2.545741319656372,
      "learning_rate": 3.4688590511087304e-06,
      "loss": 0.5186,
      "step": 722
    },
    {
      "epoch": 0.8887523048555623,
      "grad_norm": 3.898621082305908,
      "learning_rate": 3.3939090450704925e-06,
      "loss": 0.921,
      "step": 723
    },
    {
      "epoch": 0.8899815611555009,
      "grad_norm": 2.621055841445923,
      "learning_rate": 3.3197491584487093e-06,
      "loss": 0.7413,
      "step": 724
    },
    {
      "epoch": 0.8912108174554395,
      "grad_norm": 2.9799768924713135,
      "learning_rate": 3.246380648486058e-06,
      "loss": 0.5268,
      "step": 725
    },
    {
      "epoch": 0.892440073755378,
      "grad_norm": 0.2122289389371872,
      "learning_rate": 3.1738047590088803e-06,
      "loss": 1.0725,
      "step": 726
    },
    {
      "epoch": 0.8936693300553166,
      "grad_norm": 0.2591450810432434,
      "learning_rate": 3.10202272040615e-06,
      "loss": 1.4448,
      "step": 727
    },
    {
      "epoch": 0.8948985863552551,
      "grad_norm": 0.22906170785427094,
      "learning_rate": 3.0310357496085405e-06,
      "loss": 1.0041,
      "step": 728
    },
    {
      "epoch": 0.8961278426551936,
      "grad_norm": 0.2589828073978424,
      "learning_rate": 2.9608450500678565e-06,
      "loss": 1.2252,
      "step": 729
    },
    {
      "epoch": 0.8973570989551322,
      "grad_norm": 0.27138885855674744,
      "learning_rate": 2.8914518117366006e-06,
      "loss": 1.2409,
      "step": 730
    },
    {
      "epoch": 0.8985863552550707,
      "grad_norm": 0.2877761423587799,
      "learning_rate": 2.8228572110478133e-06,
      "loss": 1.1005,
      "step": 731
    },
    {
      "epoch": 0.8998156115550092,
      "grad_norm": 0.3168583810329437,
      "learning_rate": 2.755062410895104e-06,
      "loss": 0.9683,
      "step": 732
    },
    {
      "epoch": 0.9010448678549478,
      "grad_norm": 0.3234322667121887,
      "learning_rate": 2.6880685606129664e-06,
      "loss": 0.979,
      "step": 733
    },
    {
      "epoch": 0.9022741241548863,
      "grad_norm": 0.33790323138237,
      "learning_rate": 2.62187679595729e-06,
      "loss": 1.1439,
      "step": 734
    },
    {
      "epoch": 0.9035033804548248,
      "grad_norm": 0.34143367409706116,
      "learning_rate": 2.55648823908608e-06,
      "loss": 1.1412,
      "step": 735
    },
    {
      "epoch": 0.9047326367547633,
      "grad_norm": 0.41084036231040955,
      "learning_rate": 2.4919039985404626e-06,
      "loss": 1.1742,
      "step": 736
    },
    {
      "epoch": 0.9059618930547019,
      "grad_norm": 0.5593938231468201,
      "learning_rate": 2.428125169225881e-06,
      "loss": 1.0794,
      "step": 737
    },
    {
      "epoch": 0.9071911493546404,
      "grad_norm": 0.7228390574455261,
      "learning_rate": 2.36515283239353e-06,
      "loss": 0.7684,
      "step": 738
    },
    {
      "epoch": 0.9084204056545789,
      "grad_norm": 0.6799226999282837,
      "learning_rate": 2.3029880556220074e-06,
      "loss": 1.0537,
      "step": 739
    },
    {
      "epoch": 0.9096496619545175,
      "grad_norm": 0.6442016959190369,
      "learning_rate": 2.241631892799262e-06,
      "loss": 0.498,
      "step": 740
    },
    {
      "epoch": 0.9108789182544561,
      "grad_norm": 0.8138104677200317,
      "learning_rate": 2.181085384104703e-06,
      "loss": 0.9415,
      "step": 741
    },
    {
      "epoch": 0.9121081745543946,
      "grad_norm": 0.864895224571228,
      "learning_rate": 2.121349555991525e-06,
      "loss": 1.0131,
      "step": 742
    },
    {
      "epoch": 0.9133374308543332,
      "grad_norm": 1.3241431713104248,
      "learning_rate": 2.0624254211693894e-06,
      "loss": 1.0531,
      "step": 743
    },
    {
      "epoch": 0.9145666871542717,
      "grad_norm": 2.810883045196533,
      "learning_rate": 2.004313978587186e-06,
      "loss": 0.8746,
      "step": 744
    },
    {
      "epoch": 0.9157959434542102,
      "grad_norm": 3.3806657791137695,
      "learning_rate": 1.9470162134161143e-06,
      "loss": 0.804,
      "step": 745
    },
    {
      "epoch": 0.9170251997541488,
      "grad_norm": 3.9051411151885986,
      "learning_rate": 1.8905330970330259e-06,
      "loss": 1.1265,
      "step": 746
    },
    {
      "epoch": 0.9182544560540873,
      "grad_norm": 2.50091290473938,
      "learning_rate": 1.83486558700387e-06,
      "loss": 0.8042,
      "step": 747
    },
    {
      "epoch": 0.9194837123540258,
      "grad_norm": 2.444403648376465,
      "learning_rate": 1.78001462706755e-06,
      "loss": 0.7185,
      "step": 748
    },
    {
      "epoch": 0.9207129686539643,
      "grad_norm": 2.565976619720459,
      "learning_rate": 1.7259811471198706e-06,
      "loss": 0.5854,
      "step": 749
    },
    {
      "epoch": 0.9219422249539029,
      "grad_norm": 3.5841028690338135,
      "learning_rate": 1.6727660631977893e-06,
      "loss": 1.1298,
      "step": 750
    },
    {
      "epoch": 0.9231714812538414,
      "grad_norm": 0.18373464047908783,
      "learning_rate": 1.620370277463884e-06,
      "loss": 0.798,
      "step": 751
    },
    {
      "epoch": 0.9244007375537799,
      "grad_norm": 0.23574425280094147,
      "learning_rate": 1.5687946781910378e-06,
      "loss": 1.1075,
      "step": 752
    },
    {
      "epoch": 0.9256299938537185,
      "grad_norm": 0.25111323595046997,
      "learning_rate": 1.5180401397474343e-06,
      "loss": 1.2677,
      "step": 753
    },
    {
      "epoch": 0.926859250153657,
      "grad_norm": 0.23998814821243286,
      "learning_rate": 1.4681075225816854e-06,
      "loss": 1.0985,
      "step": 754
    },
    {
      "epoch": 0.9280885064535955,
      "grad_norm": 0.2664302587509155,
      "learning_rate": 1.4189976732082666e-06,
      "loss": 1.0119,
      "step": 755
    },
    {
      "epoch": 0.9293177627535341,
      "grad_norm": 0.28511667251586914,
      "learning_rate": 1.3707114241931328e-06,
      "loss": 1.2079,
      "step": 756
    },
    {
      "epoch": 0.9305470190534727,
      "grad_norm": 0.278188556432724,
      "learning_rate": 1.3232495941396639e-06,
      "loss": 1.1176,
      "step": 757
    },
    {
      "epoch": 0.9317762753534112,
      "grad_norm": 0.3081459105014801,
      "learning_rate": 1.2766129876747413e-06,
      "loss": 0.9467,
      "step": 758
    },
    {
      "epoch": 0.9330055316533498,
      "grad_norm": 0.34005647897720337,
      "learning_rate": 1.2308023954351043e-06,
      "loss": 1.0071,
      "step": 759
    },
    {
      "epoch": 0.9342347879532883,
      "grad_norm": 0.3434988856315613,
      "learning_rate": 1.1858185940539779e-06,
      "loss": 1.1407,
      "step": 760
    },
    {
      "epoch": 0.9354640442532268,
      "grad_norm": 0.38441202044487,
      "learning_rate": 1.1416623461478704e-06,
      "loss": 1.316,
      "step": 761
    },
    {
      "epoch": 0.9366933005531654,
      "grad_norm": 0.42989110946655273,
      "learning_rate": 1.0983344003036912e-06,
      "loss": 1.2446,
      "step": 762
    },
    {
      "epoch": 0.9379225568531039,
      "grad_norm": 0.4977233111858368,
      "learning_rate": 1.055835491066004e-06,
      "loss": 0.7431,
      "step": 763
    },
    {
      "epoch": 0.9391518131530424,
      "grad_norm": 0.6383669376373291,
      "learning_rate": 1.014166338924627e-06,
      "loss": 0.5211,
      "step": 764
    },
    {
      "epoch": 0.9403810694529809,
      "grad_norm": 0.7088152766227722,
      "learning_rate": 9.733276503023692e-07,
      "loss": 0.537,
      "step": 765
    },
    {
      "epoch": 0.9416103257529195,
      "grad_norm": 0.6624244451522827,
      "learning_rate": 9.33320117543085e-07,
      "loss": 0.8375,
      "step": 766
    },
    {
      "epoch": 0.942839582052858,
      "grad_norm": 0.7807971239089966,
      "learning_rate": 8.941444188999393e-07,
      "loss": 0.7689,
      "step": 767
    },
    {
      "epoch": 0.9440688383527965,
      "grad_norm": 1.2817997932434082,
      "learning_rate": 8.558012185238939e-07,
      "loss": 1.1247,
      "step": 768
    },
    {
      "epoch": 0.9452980946527351,
      "grad_norm": 3.324708938598633,
      "learning_rate": 8.182911664524562e-07,
      "loss": 1.0981,
      "step": 769
    },
    {
      "epoch": 0.9465273509526736,
      "grad_norm": 3.574130058288574,
      "learning_rate": 7.816148985986483e-07,
      "loss": 0.6504,
      "step": 770
    },
    {
      "epoch": 0.9477566072526121,
      "grad_norm": 3.618056297302246,
      "learning_rate": 7.457730367402549e-07,
      "loss": 0.6916,
      "step": 771
    },
    {
      "epoch": 0.9489858635525507,
      "grad_norm": 2.9001710414886475,
      "learning_rate": 7.107661885092321e-07,
      "loss": 0.8261,
      "step": 772
    },
    {
      "epoch": 0.9502151198524893,
      "grad_norm": 2.4319024085998535,
      "learning_rate": 6.765949473814648e-07,
      "loss": 0.9126,
      "step": 773
    },
    {
      "epoch": 0.9514443761524278,
      "grad_norm": 3.393937110900879,
      "learning_rate": 6.432598926666589e-07,
      "loss": 0.8197,
      "step": 774
    },
    {
      "epoch": 0.9526736324523664,
      "grad_norm": 3.406257390975952,
      "learning_rate": 6.107615894985375e-07,
      "loss": 0.9015,
      "step": 775
    },
    {
      "epoch": 0.9539028887523049,
      "grad_norm": 0.21785248816013336,
      "learning_rate": 5.791005888252765e-07,
      "loss": 0.9666,
      "step": 776
    },
    {
      "epoch": 0.9551321450522434,
      "grad_norm": 0.22744600474834442,
      "learning_rate": 5.482774274001401e-07,
      "loss": 1.1927,
      "step": 777
    },
    {
      "epoch": 0.956361401352182,
      "grad_norm": 0.26558175683021545,
      "learning_rate": 5.18292627772382e-07,
      "loss": 1.3098,
      "step": 778
    },
    {
      "epoch": 0.9575906576521205,
      "grad_norm": 0.25804319977760315,
      "learning_rate": 4.891466982783977e-07,
      "loss": 1.2578,
      "step": 779
    },
    {
      "epoch": 0.958819913952059,
      "grad_norm": 0.2447444498538971,
      "learning_rate": 4.60840133033108e-07,
      "loss": 0.9595,
      "step": 780
    },
    {
      "epoch": 0.9600491702519975,
      "grad_norm": 0.2831968069076538,
      "learning_rate": 4.3337341192157265e-07,
      "loss": 1.0676,
      "step": 781
    },
    {
      "epoch": 0.9612784265519361,
      "grad_norm": 0.2845923602581024,
      "learning_rate": 4.067470005908625e-07,
      "loss": 0.9645,
      "step": 782
    },
    {
      "epoch": 0.9625076828518746,
      "grad_norm": 0.3282490372657776,
      "learning_rate": 3.809613504421661e-07,
      "loss": 0.8996,
      "step": 783
    },
    {
      "epoch": 0.9637369391518131,
      "grad_norm": 0.3805837631225586,
      "learning_rate": 3.5601689862311826e-07,
      "loss": 0.9756,
      "step": 784
    },
    {
      "epoch": 0.9649661954517517,
      "grad_norm": 0.3583146631717682,
      "learning_rate": 3.3191406802041693e-07,
      "loss": 1.0574,
      "step": 785
    },
    {
      "epoch": 0.9661954517516902,
      "grad_norm": 0.4697728753089905,
      "learning_rate": 3.0865326725263435e-07,
      "loss": 1.4301,
      "step": 786
    },
    {
      "epoch": 0.9674247080516287,
      "grad_norm": 0.6786931157112122,
      "learning_rate": 2.8623489066329503e-07,
      "loss": 1.0623,
      "step": 787
    },
    {
      "epoch": 0.9686539643515673,
      "grad_norm": 0.6815045475959778,
      "learning_rate": 2.646593183142088e-07,
      "loss": 0.7782,
      "step": 788
    },
    {
      "epoch": 0.9698832206515058,
      "grad_norm": 0.7787659764289856,
      "learning_rate": 2.4392691597898143e-07,
      "loss": 0.8097,
      "step": 789
    },
    {
      "epoch": 0.9711124769514444,
      "grad_norm": 0.7595354318618774,
      "learning_rate": 2.2403803513686428e-07,
      "loss": 0.9502,
      "step": 790
    },
    {
      "epoch": 0.972341733251383,
      "grad_norm": 0.8005998730659485,
      "learning_rate": 2.0499301296676432e-07,
      "loss": 0.7545,
      "step": 791
    },
    {
      "epoch": 0.9735709895513215,
      "grad_norm": 1.0289498567581177,
      "learning_rate": 1.8679217234154334e-07,
      "loss": 1.2025,
      "step": 792
    },
    {
      "epoch": 0.97480024585126,
      "grad_norm": 1.7563585042953491,
      "learning_rate": 1.6943582182253336e-07,
      "loss": 1.1175,
      "step": 793
    },
    {
      "epoch": 0.9760295021511985,
      "grad_norm": 3.998744487762451,
      "learning_rate": 1.5292425565430757e-07,
      "loss": 1.2125,
      "step": 794
    },
    {
      "epoch": 0.9772587584511371,
      "grad_norm": 2.2866718769073486,
      "learning_rate": 1.372577537597064e-07,
      "loss": 0.3866,
      "step": 795
    },
    {
      "epoch": 0.9784880147510756,
      "grad_norm": 3.594325542449951,
      "learning_rate": 1.224365817350692e-07,
      "loss": 0.8022,
      "step": 796
    },
    {
      "epoch": 0.9797172710510141,
      "grad_norm": 2.57466197013855,
      "learning_rate": 1.0846099084574346e-07,
      "loss": 0.8229,
      "step": 797
    },
    {
      "epoch": 0.9809465273509527,
      "grad_norm": 2.2920215129852295,
      "learning_rate": 9.533121802183797e-08,
      "loss": 0.509,
      "step": 798
    },
    {
      "epoch": 0.9821757836508912,
      "grad_norm": 2.825840711593628,
      "learning_rate": 8.304748585417078e-08,
      "loss": 0.9473,
      "step": 799
    },
    {
      "epoch": 0.9834050399508297,
      "grad_norm": 3.6677334308624268,
      "learning_rate": 7.161000259053308e-08,
      "loss": 0.7938,
      "step": 800
    },
    {
      "epoch": 0.9846342962507683,
      "grad_norm": 0.2495596706867218,
      "learning_rate": 6.10189621321422e-08,
      "loss": 1.2466,
      "step": 801
    },
    {
      "epoch": 0.9858635525507068,
      "grad_norm": 0.2586044371128082,
      "learning_rate": 5.127454403034415e-08,
      "loss": 1.3163,
      "step": 802
    },
    {
      "epoch": 0.9870928088506453,
      "grad_norm": 0.2908371388912201,
      "learning_rate": 4.2376913483599404e-08,
      "loss": 1.2477,
      "step": 803
    },
    {
      "epoch": 0.9883220651505839,
      "grad_norm": 0.3172759413719177,
      "learning_rate": 3.4326221334640695e-08,
      "loss": 1.1196,
      "step": 804
    },
    {
      "epoch": 0.9895513214505224,
      "grad_norm": 0.33819660544395447,
      "learning_rate": 2.712260406795286e-08,
      "loss": 1.0077,
      "step": 805
    },
    {
      "epoch": 0.990780577750461,
      "grad_norm": 0.353371262550354,
      "learning_rate": 2.076618380744133e-08,
      "loss": 1.0194,
      "step": 806
    },
    {
      "epoch": 0.9920098340503996,
      "grad_norm": 0.4687137007713318,
      "learning_rate": 1.525706831437268e-08,
      "loss": 1.199,
      "step": 807
    },
    {
      "epoch": 0.9932390903503381,
      "grad_norm": 0.6552625298500061,
      "learning_rate": 1.0595350985526109e-08,
      "loss": 0.5601,
      "step": 808
    },
    {
      "epoch": 0.9944683466502766,
      "grad_norm": 0.6662526726722717,
      "learning_rate": 6.781110851633576e-09,
      "loss": 0.8824,
      "step": 809
    },
    {
      "epoch": 0.9956976029502151,
      "grad_norm": 1.0151597261428833,
      "learning_rate": 3.814412576025328e-09,
      "loss": 1.1593,
      "step": 810
    },
    {
      "epoch": 0.9969268592501537,
      "grad_norm": 2.71447491645813,
      "learning_rate": 1.6953064535474295e-09,
      "loss": 0.7048,
      "step": 811
    },
    {
      "epoch": 0.9981561155500922,
      "grad_norm": 3.1842808723449707,
      "learning_rate": 4.238284096902412e-10,
      "loss": 0.8335,
      "step": 812
    },
    {
      "epoch": 0.9993853718500307,
      "grad_norm": 2.6523120403289795,
      "learning_rate": 0.0,
      "loss": 0.6896,
      "step": 813
    }
  ],
  "logging_steps": 1,
  "max_steps": 813,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 204,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6756836906985062e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}