{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5015365703749232,
  "eval_steps": 204,
  "global_step": 408,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
{ |
|
"epoch": 0.001229256299938537, |
|
"grad_norm": 0.19411148130893707, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1612, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001229256299938537, |
|
"eval_loss": 2.1468453407287598, |
|
"eval_runtime": 66.4837, |
|
"eval_samples_per_second": 10.303, |
|
"eval_steps_per_second": 5.159, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002458512599877074, |
|
"grad_norm": 0.2264145463705063, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.4401, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0036877688998156115, |
|
"grad_norm": 0.2364473193883896, |
|
"learning_rate": 6e-06, |
|
"loss": 1.4676, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.004917025199754148, |
|
"grad_norm": 0.24018821120262146, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.3851, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.006146281499692686, |
|
"grad_norm": 0.23238497972488403, |
|
"learning_rate": 1e-05, |
|
"loss": 1.213, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.007375537799631223, |
|
"grad_norm": 0.24634625017642975, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.2627, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.008604794099569761, |
|
"grad_norm": 0.26495596766471863, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.3908, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.009834050399508297, |
|
"grad_norm": 0.2719455361366272, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.3814, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.011063306699446834, |
|
"grad_norm": 0.26454323530197144, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.2438, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.012292562999385371, |
|
"grad_norm": 0.3004608750343323, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3694, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013521819299323909, |
|
"grad_norm": 0.3035408854484558, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.4792, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.014751075599262446, |
|
"grad_norm": 0.4270775020122528, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.1673, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.015980331899200985, |
|
"grad_norm": 0.4388391971588135, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.5171, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.017209588199139522, |
|
"grad_norm": 0.7133700847625732, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.0732, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01843884449907806, |
|
"grad_norm": 1.026840329170227, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1705, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.019668100799016593, |
|
"grad_norm": 0.7934454679489136, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.3509, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02089735709895513, |
|
"grad_norm": 0.8138520121574402, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.181, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.022126613398893668, |
|
"grad_norm": 1.7830528020858765, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.1836, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.023355869698832205, |
|
"grad_norm": 10.527496337890625, |
|
"learning_rate": 3.8e-05, |
|
"loss": 3.3749, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.024585125998770743, |
|
"grad_norm": 6.364173889160156, |
|
"learning_rate": 4e-05, |
|
"loss": 2.9191, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02581438229870928, |
|
"grad_norm": 7.087876796722412, |
|
"learning_rate": 4.2e-05, |
|
"loss": 3.0788, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.027043638598647817, |
|
"grad_norm": 5.370169639587402, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 2.7809, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.028272894898586354, |
|
"grad_norm": 4.118806838989258, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 2.7475, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02950215119852489, |
|
"grad_norm": 4.46057653427124, |
|
"learning_rate": 4.8e-05, |
|
"loss": 2.6906, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03073140749846343, |
|
"grad_norm": 3.8601913452148438, |
|
"learning_rate": 5e-05, |
|
"loss": 2.652, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03196066379840197, |
|
"grad_norm": 0.19972144067287445, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.1845, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03318992009834051, |
|
"grad_norm": 0.21230019629001617, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.2875, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.034419176398279044, |
|
"grad_norm": 0.22694356739521027, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.1829, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03564843269821758, |
|
"grad_norm": 0.2587474584579468, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.3909, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03687768899815612, |
|
"grad_norm": 0.24409259855747223, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1671, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03810694529809465, |
|
"grad_norm": 0.26323097944259644, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.2055, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.03933620159803319, |
|
"grad_norm": 0.2842409908771515, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.3051, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.040565457897971724, |
|
"grad_norm": 0.32476744055747986, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.0828, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04179471419791026, |
|
"grad_norm": 0.32893380522727966, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.3896, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0430239704978488, |
|
"grad_norm": 0.3359004855155945, |
|
"learning_rate": 7e-05, |
|
"loss": 1.3713, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.044253226797787336, |
|
"grad_norm": 0.5471646189689636, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.2572, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04548248309772587, |
|
"grad_norm": 0.5404387712478638, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.2321, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04671173939766441, |
|
"grad_norm": 1.0199828147888184, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8434, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04794099569760295, |
|
"grad_norm": 1.5890088081359863, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.7781, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.049170251997541485, |
|
"grad_norm": 0.7897126078605652, |
|
"learning_rate": 8e-05, |
|
"loss": 1.1982, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05039950829748002, |
|
"grad_norm": 0.7874982953071594, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.3432, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05162876459741856, |
|
"grad_norm": 1.3902230262756348, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.7817, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0528580208973571, |
|
"grad_norm": 4.006369590759277, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.8832, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.054087277197295634, |
|
"grad_norm": 7.2500996589660645, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.3256, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05531653349723417, |
|
"grad_norm": 5.088122844696045, |
|
"learning_rate": 9e-05, |
|
"loss": 1.4272, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05654578979717271, |
|
"grad_norm": 2.9680252075195312, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.8063, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.057775046097111246, |
|
"grad_norm": 3.4886820316314697, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.7842, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05900430239704978, |
|
"grad_norm": 2.635120153427124, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.7775, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06023355869698832, |
|
"grad_norm": 2.7715940475463867, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.6795, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.06146281499692686, |
|
"grad_norm": 4.598182678222656, |
|
"learning_rate": 0.0001, |
|
"loss": 2.0141, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0626920712968654, |
|
"grad_norm": 0.4595154821872711, |
|
"learning_rate": 9.999957617159031e-05, |
|
"loss": 1.1302, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06392132759680394, |
|
"grad_norm": 0.3996050953865051, |
|
"learning_rate": 9.999830469354645e-05, |
|
"loss": 1.3499, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06515058389674247, |
|
"grad_norm": 0.4248620867729187, |
|
"learning_rate": 9.999618558742398e-05, |
|
"loss": 1.4393, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06637984019668101, |
|
"grad_norm": 0.37063130736351013, |
|
"learning_rate": 9.999321888914836e-05, |
|
"loss": 1.4761, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06760909649661954, |
|
"grad_norm": 0.3327302038669586, |
|
"learning_rate": 9.998940464901447e-05, |
|
"loss": 1.1365, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06883835279655809, |
|
"grad_norm": 0.3424387276172638, |
|
"learning_rate": 9.998474293168562e-05, |
|
"loss": 1.2037, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07006760909649662, |
|
"grad_norm": 0.34453633427619934, |
|
"learning_rate": 9.997923381619256e-05, |
|
"loss": 0.9586, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07129686539643516, |
|
"grad_norm": 0.3327544033527374, |
|
"learning_rate": 9.997287739593206e-05, |
|
"loss": 1.3026, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0725261216963737, |
|
"grad_norm": 0.33542299270629883, |
|
"learning_rate": 9.996567377866537e-05, |
|
"loss": 1.1601, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.07375537799631224, |
|
"grad_norm": 0.5743572115898132, |
|
"learning_rate": 9.99576230865164e-05, |
|
"loss": 1.3892, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07498463429625077, |
|
"grad_norm": 0.4624180495738983, |
|
"learning_rate": 9.994872545596966e-05, |
|
"loss": 1.2519, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0762138905961893, |
|
"grad_norm": 0.6259918808937073, |
|
"learning_rate": 9.993898103786786e-05, |
|
"loss": 1.315, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07744314689612784, |
|
"grad_norm": 0.6186118125915527, |
|
"learning_rate": 9.992838999740947e-05, |
|
"loss": 0.877, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.07867240319606637, |
|
"grad_norm": 0.6788893342018127, |
|
"learning_rate": 9.991695251414583e-05, |
|
"loss": 0.886, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.07990165949600492, |
|
"grad_norm": 0.7688488960266113, |
|
"learning_rate": 9.990466878197817e-05, |
|
"loss": 0.7427, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08113091579594345, |
|
"grad_norm": 0.6739158630371094, |
|
"learning_rate": 9.989153900915427e-05, |
|
"loss": 1.091, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08236017209588199, |
|
"grad_norm": 1.0515763759613037, |
|
"learning_rate": 9.987756341826493e-05, |
|
"loss": 1.4195, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.08358942839582052, |
|
"grad_norm": 2.324380397796631, |
|
"learning_rate": 9.98627422462403e-05, |
|
"loss": 1.8108, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08481868469575907, |
|
"grad_norm": 4.131134510040283, |
|
"learning_rate": 9.98470757443457e-05, |
|
"loss": 1.2769, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0860479409956976, |
|
"grad_norm": 6.158152103424072, |
|
"learning_rate": 9.983056417817747e-05, |
|
"loss": 1.609, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08727719729563614, |
|
"grad_norm": 2.710057020187378, |
|
"learning_rate": 9.981320782765846e-05, |
|
"loss": 1.6382, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08850645359557467, |
|
"grad_norm": 2.729590654373169, |
|
"learning_rate": 9.979500698703323e-05, |
|
"loss": 1.8179, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.08973570989551322, |
|
"grad_norm": 2.1861114501953125, |
|
"learning_rate": 9.977596196486314e-05, |
|
"loss": 1.7416, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.09096496619545175, |
|
"grad_norm": 2.614532947540283, |
|
"learning_rate": 9.975607308402101e-05, |
|
"loss": 1.8413, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.09219422249539029, |
|
"grad_norm": 3.3295183181762695, |
|
"learning_rate": 9.973534068168579e-05, |
|
"loss": 2.1946, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09342347879532882, |
|
"grad_norm": 0.3009834885597229, |
|
"learning_rate": 9.97137651093367e-05, |
|
"loss": 1.1058, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.09465273509526737, |
|
"grad_norm": 0.2889084815979004, |
|
"learning_rate": 9.969134673274738e-05, |
|
"loss": 1.0812, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0958819913952059, |
|
"grad_norm": 0.26639047265052795, |
|
"learning_rate": 9.966808593197959e-05, |
|
"loss": 1.2787, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.09711124769514444, |
|
"grad_norm": 0.2839871048927307, |
|
"learning_rate": 9.964398310137688e-05, |
|
"loss": 1.2314, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.09834050399508297, |
|
"grad_norm": 0.29856863617897034, |
|
"learning_rate": 9.961903864955783e-05, |
|
"loss": 1.1781, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09956976029502151, |
|
"grad_norm": 0.3113296329975128, |
|
"learning_rate": 9.959325299940914e-05, |
|
"loss": 1.1297, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.10079901659496004, |
|
"grad_norm": 0.3259466290473938, |
|
"learning_rate": 9.956662658807842e-05, |
|
"loss": 1.3892, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.10202827289489859, |
|
"grad_norm": 0.3366626501083374, |
|
"learning_rate": 9.95391598669669e-05, |
|
"loss": 1.1833, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.10325752919483712, |
|
"grad_norm": 0.3032483458518982, |
|
"learning_rate": 9.95108533017216e-05, |
|
"loss": 1.1729, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.10448678549477566, |
|
"grad_norm": 0.4028280973434448, |
|
"learning_rate": 9.948170737222762e-05, |
|
"loss": 1.1019, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1057160417947142, |
|
"grad_norm": 0.3796052932739258, |
|
"learning_rate": 9.945172257259986e-05, |
|
"loss": 1.3822, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.10694529809465274, |
|
"grad_norm": 0.3956368565559387, |
|
"learning_rate": 9.942089941117472e-05, |
|
"loss": 1.2101, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.10817455439459127, |
|
"grad_norm": 0.5040555596351624, |
|
"learning_rate": 9.938923841050147e-05, |
|
"loss": 1.059, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.10940381069452981, |
|
"grad_norm": 0.7209507822990417, |
|
"learning_rate": 9.935674010733336e-05, |
|
"loss": 0.9387, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.11063306699446834, |
|
"grad_norm": 0.6711410284042358, |
|
"learning_rate": 9.932340505261855e-05, |
|
"loss": 0.9325, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11186232329440689, |
|
"grad_norm": 0.670559823513031, |
|
"learning_rate": 9.928923381149078e-05, |
|
"loss": 1.1188, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.11309157959434542, |
|
"grad_norm": 1.4009896516799927, |
|
"learning_rate": 9.925422696325975e-05, |
|
"loss": 1.4021, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.11432083589428396, |
|
"grad_norm": 2.7449545860290527, |
|
"learning_rate": 9.921838510140135e-05, |
|
"loss": 1.7181, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.11555009219422249, |
|
"grad_norm": 3.5462844371795654, |
|
"learning_rate": 9.918170883354755e-05, |
|
"loss": 1.4934, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.11677934849416104, |
|
"grad_norm": 3.204674005508423, |
|
"learning_rate": 9.914419878147611e-05, |
|
"loss": 1.2952, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11800860479409957, |
|
"grad_norm": 2.583436965942383, |
|
"learning_rate": 9.910585558110006e-05, |
|
"loss": 1.418, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.11923786109403811, |
|
"grad_norm": 3.0214803218841553, |
|
"learning_rate": 9.906667988245692e-05, |
|
"loss": 1.8579, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.12046711739397664, |
|
"grad_norm": 2.359790325164795, |
|
"learning_rate": 9.902667234969764e-05, |
|
"loss": 1.2705, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.12169637369391519, |
|
"grad_norm": 2.093607187271118, |
|
"learning_rate": 9.898583366107538e-05, |
|
"loss": 1.4241, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.12292562999385372, |
|
"grad_norm": 2.613720655441284, |
|
"learning_rate": 9.8944164508934e-05, |
|
"loss": 1.7558, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12415488629379226, |
|
"grad_norm": 0.29464319348335266, |
|
"learning_rate": 9.890166559969631e-05, |
|
"loss": 1.1966, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.1253841425937308, |
|
"grad_norm": 0.27224430441856384, |
|
"learning_rate": 9.885833765385212e-05, |
|
"loss": 1.3172, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.12661339889366932, |
|
"grad_norm": 0.2738960385322571, |
|
"learning_rate": 9.881418140594603e-05, |
|
"loss": 1.2875, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.12784265519360788, |
|
"grad_norm": 0.274746298789978, |
|
"learning_rate": 9.876919760456492e-05, |
|
"loss": 1.3156, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1290719114935464, |
|
"grad_norm": 0.3050672113895416, |
|
"learning_rate": 9.872338701232526e-05, |
|
"loss": 1.2426, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.13030116779348494, |
|
"grad_norm": 0.2726648449897766, |
|
"learning_rate": 9.867675040586034e-05, |
|
"loss": 1.1997, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.13153042409342347, |
|
"grad_norm": 0.2615199685096741, |
|
"learning_rate": 9.862928857580687e-05, |
|
"loss": 1.1518, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.13275968039336203, |
|
"grad_norm": 0.27568066120147705, |
|
"learning_rate": 9.858100232679175e-05, |
|
"loss": 0.9874, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.13398893669330056, |
|
"grad_norm": 0.29168951511383057, |
|
"learning_rate": 9.853189247741833e-05, |
|
"loss": 1.2147, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1352181929932391, |
|
"grad_norm": 0.30630671977996826, |
|
"learning_rate": 9.848195986025257e-05, |
|
"loss": 1.2474, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13644744929317762, |
|
"grad_norm": 0.3246194124221802, |
|
"learning_rate": 9.843120532180896e-05, |
|
"loss": 1.1839, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.13767670559311618, |
|
"grad_norm": 0.34899017214775085, |
|
"learning_rate": 9.837962972253612e-05, |
|
"loss": 1.2389, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.1389059618930547, |
|
"grad_norm": 0.3848627805709839, |
|
"learning_rate": 9.83272339368022e-05, |
|
"loss": 1.1833, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.14013521819299324, |
|
"grad_norm": 0.4109489917755127, |
|
"learning_rate": 9.827401885288013e-05, |
|
"loss": 1.1026, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.14136447449293177, |
|
"grad_norm": 0.6600728034973145, |
|
"learning_rate": 9.821998537293245e-05, |
|
"loss": 1.4073, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14259373079287033, |
|
"grad_norm": 0.5556017756462097, |
|
"learning_rate": 9.816513441299613e-05, |
|
"loss": 0.6878, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.14382298709280886, |
|
"grad_norm": 0.5937761068344116, |
|
"learning_rate": 9.810946690296698e-05, |
|
"loss": 0.7988, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.1450522433927474, |
|
"grad_norm": 0.6892157196998596, |
|
"learning_rate": 9.80529837865839e-05, |
|
"loss": 1.2152, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.14628149969268592, |
|
"grad_norm": 1.1046031713485718, |
|
"learning_rate": 9.799568602141283e-05, |
|
"loss": 1.4396, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.14751075599262448, |
|
"grad_norm": 3.366898536682129, |
|
"learning_rate": 9.793757457883062e-05, |
|
"loss": 1.6062, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.148740012292563, |
|
"grad_norm": 4.46527624130249, |
|
"learning_rate": 9.787865044400848e-05, |
|
"loss": 1.041, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.14996926859250154, |
|
"grad_norm": 3.8992013931274414, |
|
"learning_rate": 9.781891461589531e-05, |
|
"loss": 1.6166, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.15119852489244007, |
|
"grad_norm": 2.6794042587280273, |
|
"learning_rate": 9.775836810720074e-05, |
|
"loss": 1.5444, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.1524277811923786, |
|
"grad_norm": 2.1487152576446533, |
|
"learning_rate": 9.769701194437799e-05, |
|
"loss": 1.4051, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.15365703749231716, |
|
"grad_norm": 2.6264848709106445, |
|
"learning_rate": 9.763484716760649e-05, |
|
"loss": 1.7286, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.15488629379225569, |
|
"grad_norm": 0.2960408329963684, |
|
"learning_rate": 9.757187483077413e-05, |
|
"loss": 1.1932, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.15611555009219422, |
|
"grad_norm": 0.2633897364139557, |
|
"learning_rate": 9.750809600145954e-05, |
|
"loss": 1.2997, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.15734480639213275, |
|
"grad_norm": 0.2459549605846405, |
|
"learning_rate": 9.744351176091393e-05, |
|
"loss": 1.0985, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.1585740626920713, |
|
"grad_norm": 0.30462849140167236, |
|
"learning_rate": 9.737812320404271e-05, |
|
"loss": 1.4303, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.15980331899200984, |
|
"grad_norm": 0.27317526936531067, |
|
"learning_rate": 9.731193143938704e-05, |
|
"loss": 1.224, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16103257529194837, |
|
"grad_norm": 0.26538556814193726, |
|
"learning_rate": 9.724493758910491e-05, |
|
"loss": 1.2667, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.1622618315918869, |
|
"grad_norm": 0.28112831711769104, |
|
"learning_rate": 9.71771427889522e-05, |
|
"loss": 1.1212, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.16349108789182545, |
|
"grad_norm": 0.2989320755004883, |
|
"learning_rate": 9.71085481882634e-05, |
|
"loss": 1.0484, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.16472034419176398, |
|
"grad_norm": 0.2814895212650299, |
|
"learning_rate": 9.703915494993215e-05, |
|
"loss": 0.7544, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.16594960049170251, |
|
"grad_norm": 0.3104398846626282, |
|
"learning_rate": 9.696896425039146e-05, |
|
"loss": 1.0323, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.16717885679164105, |
|
"grad_norm": 0.4948181211948395, |
|
"learning_rate": 9.689797727959387e-05, |
|
"loss": 1.2073, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.1684081130915796, |
|
"grad_norm": 0.4018343985080719, |
|
"learning_rate": 9.682619524099112e-05, |
|
"loss": 1.2409, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.16963736939151813, |
|
"grad_norm": 0.5637558102607727, |
|
"learning_rate": 9.675361935151395e-05, |
|
"loss": 1.3184, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.17086662569145666, |
|
"grad_norm": 0.7405252456665039, |
|
"learning_rate": 9.66802508415513e-05, |
|
"loss": 1.0983, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1720958819913952, |
|
"grad_norm": 0.6686736345291138, |
|
"learning_rate": 9.660609095492952e-05, |
|
"loss": 1.0025, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17332513829133375, |
|
"grad_norm": 0.7121345400810242, |
|
"learning_rate": 9.653114094889127e-05, |
|
"loss": 0.9337, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.17455439459127228, |
|
"grad_norm": 1.06205153465271, |
|
"learning_rate": 9.645540209407425e-05, |
|
"loss": 1.2931, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.1757836508912108, |
|
"grad_norm": 2.3874034881591797, |
|
"learning_rate": 9.637887567448959e-05, |
|
"loss": 1.5124, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.17701290719114934, |
|
"grad_norm": 2.6609811782836914, |
|
"learning_rate": 9.630156298750011e-05, |
|
"loss": 1.4161, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.1782421634910879, |
|
"grad_norm": 2.413705587387085, |
|
"learning_rate": 9.622346534379833e-05, |
|
"loss": 1.2768, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.17947141979102643, |
|
"grad_norm": 2.920910120010376, |
|
"learning_rate": 9.614458406738427e-05, |
|
"loss": 1.0866, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.18070067609096496, |
|
"grad_norm": 2.389439582824707, |
|
"learning_rate": 9.606492049554297e-05, |
|
"loss": 1.4862, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.1819299323909035, |
|
"grad_norm": 2.03515887260437, |
|
"learning_rate": 9.598447597882181e-05, |
|
"loss": 1.3503, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.18315918869084205, |
|
"grad_norm": 2.016889810562134, |
|
"learning_rate": 9.590325188100768e-05, |
|
"loss": 1.2565, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.18438844499078058, |
|
"grad_norm": 2.1591711044311523, |
|
"learning_rate": 9.582124957910375e-05, |
|
"loss": 1.1261, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1856177012907191, |
|
"grad_norm": 0.2707172632217407, |
|
"learning_rate": 9.573847046330628e-05, |
|
"loss": 1.1045, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.18684695759065764, |
|
"grad_norm": 0.25980842113494873, |
|
"learning_rate": 9.565491593698086e-05, |
|
"loss": 1.274, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.1880762138905962, |
|
"grad_norm": 0.25503602623939514, |
|
"learning_rate": 9.55705874166388e-05, |
|
"loss": 1.0971, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.18930547019053473, |
|
"grad_norm": 0.27756351232528687, |
|
"learning_rate": 9.548548633191299e-05, |
|
"loss": 1.215, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.19053472649047326, |
|
"grad_norm": 0.2732703387737274, |
|
"learning_rate": 9.539961412553375e-05, |
|
"loss": 1.1326, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1917639827904118, |
|
"grad_norm": 0.28855475783348083, |
|
"learning_rate": 9.531297225330429e-05, |
|
"loss": 1.2862, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.19299323909035035, |
|
"grad_norm": 0.3158769905567169, |
|
"learning_rate": 9.522556218407608e-05, |
|
"loss": 1.2254, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.19422249539028888, |
|
"grad_norm": 0.30355289578437805, |
|
"learning_rate": 9.513738539972394e-05, |
|
"loss": 1.062, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.1954517516902274, |
|
"grad_norm": 0.3448358178138733, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 0.9856, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.19668100799016594, |
|
"grad_norm": 0.3306958079338074, |
|
"learning_rate": 9.495873767811305e-05, |
|
"loss": 1.2696, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1979102642901045, |
|
"grad_norm": 0.4231187105178833, |
|
"learning_rate": 9.486826976949345e-05, |
|
"loss": 1.1711, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.19913952059004303, |
|
"grad_norm": 0.5289990901947021, |
|
"learning_rate": 9.477704120297697e-05, |
|
"loss": 1.4088, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.20036877688998156, |
|
"grad_norm": 0.5111967921257019, |
|
"learning_rate": 9.468505352517394e-05, |
|
"loss": 1.1683, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.2015980331899201, |
|
"grad_norm": 0.7477207779884338, |
|
"learning_rate": 9.459230829556401e-05, |
|
"loss": 0.995, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.20282728948985865, |
|
"grad_norm": 0.7836649417877197, |
|
"learning_rate": 9.449880708646971e-05, |
|
"loss": 0.8027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.20405654578979718, |
|
"grad_norm": 0.6803653240203857, |
|
"learning_rate": 9.440455148302977e-05, |
|
"loss": 0.9725, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2052858020897357, |
|
"grad_norm": 0.8779723048210144, |
|
"learning_rate": 9.430954308317233e-05, |
|
"loss": 1.1995, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.20651505838967424, |
|
"grad_norm": 1.3584879636764526, |
|
"learning_rate": 9.421378349758769e-05, |
|
"loss": 1.4558, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.2077443146896128, |
|
"grad_norm": 2.1976521015167236, |
|
"learning_rate": 9.411727434970121e-05, |
|
"loss": 1.0717, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.20897357098955133, |
|
"grad_norm": 3.9302353858947754, |
|
"learning_rate": 9.402001727564565e-05, |
|
"loss": 1.5138, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21020282728948986, |
|
"grad_norm": 3.9594686031341553, |
|
"learning_rate": 9.392201392423342e-05, |
|
"loss": 1.4295, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.2114320835894284, |
|
"grad_norm": 3.2994837760925293, |
|
"learning_rate": 9.382326595692868e-05, |
|
"loss": 1.8676, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.21266133988936695, |
|
"grad_norm": 2.219341993331909, |
|
"learning_rate": 9.372377504781924e-05, |
|
"loss": 1.3185, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.21389059618930548, |
|
"grad_norm": 2.3389649391174316, |
|
"learning_rate": 9.362354288358803e-05, |
|
"loss": 0.9969, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.215119852489244, |
|
"grad_norm": 3.8493995666503906, |
|
"learning_rate": 9.35225711634846e-05, |
|
"loss": 1.2903, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.21634910878918254, |
|
"grad_norm": 0.24931700527668, |
|
"learning_rate": 9.34208615992963e-05, |
|
"loss": 1.051, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.2175783650891211, |
|
"grad_norm": 0.2944095730781555, |
|
"learning_rate": 9.331841591531922e-05, |
|
"loss": 1.3364, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.21880762138905963, |
|
"grad_norm": 0.26118403673171997, |
|
"learning_rate": 9.321523584832905e-05, |
|
"loss": 1.1487, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.22003687768899816, |
|
"grad_norm": 0.29458168148994446, |
|
"learning_rate": 9.311132314755149e-05, |
|
"loss": 1.365, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.2212661339889367, |
|
"grad_norm": 0.2739919424057007, |
|
"learning_rate": 9.300667957463278e-05, |
|
"loss": 1.2595, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.22249539028887522, |
|
"grad_norm": 0.25647538900375366, |
|
"learning_rate": 9.290130690360965e-05, |
|
"loss": 0.9865, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.22372464658881377, |
|
"grad_norm": 0.27343517541885376, |
|
"learning_rate": 9.279520692087938e-05, |
|
"loss": 1.1263, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.2249539028887523, |
|
"grad_norm": 0.3220975697040558, |
|
"learning_rate": 9.268838142516943e-05, |
|
"loss": 1.3404, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.22618315918869084, |
|
"grad_norm": 0.3012546896934509, |
|
"learning_rate": 9.258083222750703e-05, |
|
"loss": 0.934, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.22741241548862937, |
|
"grad_norm": 0.3433031439781189, |
|
"learning_rate": 9.247256115118835e-05, |
|
"loss": 1.1895, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.22864167178856792, |
|
"grad_norm": 0.3515290915966034, |
|
"learning_rate": 9.236357003174775e-05, |
|
"loss": 1.3236, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.22987092808850645, |
|
"grad_norm": 0.4033795893192291, |
|
"learning_rate": 9.225386071692654e-05, |
|
"loss": 1.2089, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.23110018438844498, |
|
"grad_norm": 0.42729562520980835, |
|
"learning_rate": 9.214343506664168e-05, |
|
"loss": 1.1346, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.23232944068838352, |
|
"grad_norm": 0.6692906618118286, |
|
"learning_rate": 9.203229495295429e-05, |
|
"loss": 1.0211, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.23355869698832207, |
|
"grad_norm": 0.6882857084274292, |
|
"learning_rate": 9.192044226003789e-05, |
|
"loss": 0.8235, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2347879532882606, |
|
"grad_norm": 0.6821665167808533, |
|
"learning_rate": 9.18078788841464e-05, |
|
"loss": 0.8171, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.23601720958819913, |
|
"grad_norm": 0.7368921041488647, |
|
"learning_rate": 9.169460673358212e-05, |
|
"loss": 0.9993, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.23724646588813766, |
|
"grad_norm": 0.9759008884429932, |
|
"learning_rate": 9.158062772866325e-05, |
|
"loss": 1.2029, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.23847572218807622, |
|
"grad_norm": 2.167100667953491, |
|
"learning_rate": 9.146594380169143e-05, |
|
"loss": 1.1393, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.23970497848801475, |
|
"grad_norm": 2.76292085647583, |
|
"learning_rate": 9.135055689691888e-05, |
|
"loss": 0.946, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.24093423478795328, |
|
"grad_norm": 3.504427671432495, |
|
"learning_rate": 9.123446897051555e-05, |
|
"loss": 1.7001, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.2421634910878918, |
|
"grad_norm": 2.606448173522949, |
|
"learning_rate": 9.111768199053588e-05, |
|
"loss": 1.6293, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.24339274738783037, |
|
"grad_norm": 2.1803855895996094, |
|
"learning_rate": 9.100019793688549e-05, |
|
"loss": 1.2392, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.2446220036877689, |
|
"grad_norm": 2.3470633029937744, |
|
"learning_rate": 9.088201880128755e-05, |
|
"loss": 1.0844, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.24585125998770743, |
|
"grad_norm": 2.47255802154541, |
|
"learning_rate": 9.076314658724906e-05, |
|
"loss": 1.19, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24708051628764596, |
|
"grad_norm": 0.2115241140127182, |
|
"learning_rate": 9.064358331002691e-05, |
|
"loss": 0.9038, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.24830977258758452, |
|
"grad_norm": 0.2693980038166046, |
|
"learning_rate": 9.05233309965936e-05, |
|
"loss": 1.0014, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.24953902888752305, |
|
"grad_norm": 0.28890225291252136, |
|
"learning_rate": 9.040239168560303e-05, |
|
"loss": 1.1698, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.2507682851874616, |
|
"grad_norm": 0.27143335342407227, |
|
"learning_rate": 9.028076742735583e-05, |
|
"loss": 1.1856, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2507682851874616, |
|
"eval_loss": 1.0315037965774536, |
|
"eval_runtime": 65.4064, |
|
"eval_samples_per_second": 10.473, |
|
"eval_steps_per_second": 5.244, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2519975414874001, |
|
"grad_norm": 0.3105545938014984, |
|
"learning_rate": 9.015846028376462e-05, |
|
"loss": 1.2827, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.25322679778733864, |
|
"grad_norm": 0.2826372981071472, |
|
"learning_rate": 9.00354723283191e-05, |
|
"loss": 1.1159, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.2544560540872772, |
|
"grad_norm": 0.2823708951473236, |
|
"learning_rate": 8.991180564605086e-05, |
|
"loss": 1.0368, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.25568531038721576, |
|
"grad_norm": 0.28265297412872314, |
|
"learning_rate": 8.978746233349802e-05, |
|
"loss": 1.1583, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.2569145666871543, |
|
"grad_norm": 0.3202212452888489, |
|
"learning_rate": 8.966244449866973e-05, |
|
"loss": 1.2069, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.2581438229870928, |
|
"grad_norm": 0.30576291680336, |
|
"learning_rate": 8.953675426101038e-05, |
|
"loss": 1.1588, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.25937307928703135, |
|
"grad_norm": 0.3853960633277893, |
|
"learning_rate": 8.941039375136371e-05, |
|
"loss": 1.1947, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.2606023355869699, |
|
"grad_norm": 0.4404067099094391, |
|
"learning_rate": 8.928336511193669e-05, |
|
"loss": 1.0786, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.2618315918869084, |
|
"grad_norm": 0.422333300113678, |
|
"learning_rate": 8.915567049626315e-05, |
|
"loss": 1.1454, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.26306084818684694, |
|
"grad_norm": 0.5277565121650696, |
|
"learning_rate": 8.902731206916734e-05, |
|
"loss": 0.7775, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.26429010448678547, |
|
"grad_norm": 0.7032243609428406, |
|
"learning_rate": 8.889829200672719e-05, |
|
"loss": 0.5771, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.26551936078672406, |
|
"grad_norm": 0.6663339734077454, |
|
"learning_rate": 8.876861249623739e-05, |
|
"loss": 0.616, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.2667486170866626, |
|
"grad_norm": 0.8129518628120422, |
|
"learning_rate": 8.863827573617238e-05, |
|
"loss": 1.1483, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.2679778733866011, |
|
"grad_norm": 1.0273211002349854, |
|
"learning_rate": 8.850728393614902e-05, |
|
"loss": 1.2066, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.26920712968653965, |
|
"grad_norm": 1.5424954891204834, |
|
"learning_rate": 8.837563931688919e-05, |
|
"loss": 1.247, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.2704363859864782, |
|
"grad_norm": 2.9167752265930176, |
|
"learning_rate": 8.824334411018204e-05, |
|
"loss": 1.3413, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2716656422864167, |
|
"grad_norm": 5.498292446136475, |
|
"learning_rate": 8.811040055884629e-05, |
|
"loss": 1.0072, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.27289489858635524, |
|
"grad_norm": 3.1687686443328857, |
|
"learning_rate": 8.797681091669206e-05, |
|
"loss": 1.3309, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.27412415488629377, |
|
"grad_norm": 2.760160446166992, |
|
"learning_rate": 8.784257744848279e-05, |
|
"loss": 1.5268, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.27535341118623236, |
|
"grad_norm": 2.3323326110839844, |
|
"learning_rate": 8.770770242989679e-05, |
|
"loss": 1.27, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.2765826674861709, |
|
"grad_norm": 2.150510549545288, |
|
"learning_rate": 8.75721881474886e-05, |
|
"loss": 1.0602, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2778119237861094, |
|
"grad_norm": 0.23049846291542053, |
|
"learning_rate": 8.743603689865039e-05, |
|
"loss": 1.0067, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.27904118008604795, |
|
"grad_norm": 0.2650708556175232, |
|
"learning_rate": 8.729925099157281e-05, |
|
"loss": 1.1932, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.2802704363859865, |
|
"grad_norm": 0.2723963260650635, |
|
"learning_rate": 8.7161832745206e-05, |
|
"loss": 1.2495, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.281499692685925, |
|
"grad_norm": 0.26627010107040405, |
|
"learning_rate": 8.702378448922026e-05, |
|
"loss": 1.2837, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.28272894898586354, |
|
"grad_norm": 0.2728361189365387, |
|
"learning_rate": 8.688510856396648e-05, |
|
"loss": 1.2969, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28395820528580207, |
|
"grad_norm": 0.26788559556007385, |
|
"learning_rate": 8.674580732043656e-05, |
|
"loss": 1.0944, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.28518746158574065, |
|
"grad_norm": 0.3129604160785675, |
|
"learning_rate": 8.660588312022344e-05, |
|
"loss": 1.3591, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2864167178856792, |
|
"grad_norm": 0.32250627875328064, |
|
"learning_rate": 8.646533833548119e-05, |
|
"loss": 1.1469, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.2876459741856177, |
|
"grad_norm": 0.32614386081695557, |
|
"learning_rate": 8.632417534888473e-05, |
|
"loss": 1.3551, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.28887523048555624, |
|
"grad_norm": 0.3620636463165283, |
|
"learning_rate": 8.61823965535894e-05, |
|
"loss": 1.1427, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2901044867854948, |
|
"grad_norm": 0.39082473516464233, |
|
"learning_rate": 8.604000435319047e-05, |
|
"loss": 1.0041, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2913337430854333, |
|
"grad_norm": 0.3823097050189972, |
|
"learning_rate": 8.589700116168232e-05, |
|
"loss": 1.1756, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.29256299938537184, |
|
"grad_norm": 0.5359341502189636, |
|
"learning_rate": 8.575338940341757e-05, |
|
"loss": 1.1814, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.29379225568531037, |
|
"grad_norm": 0.6902546286582947, |
|
"learning_rate": 8.560917151306593e-05, |
|
"loss": 0.9253, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.29502151198524895, |
|
"grad_norm": 0.7236252427101135, |
|
"learning_rate": 8.5464349935573e-05, |
|
"loss": 0.6398, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2962507682851875, |
|
"grad_norm": 0.7172759175300598, |
|
"learning_rate": 8.53189271261187e-05, |
|
"loss": 0.9061, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.297480024585126, |
|
"grad_norm": 0.7999723553657532, |
|
"learning_rate": 8.517290555007578e-05, |
|
"loss": 1.0691, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.29870928088506454, |
|
"grad_norm": 1.235872745513916, |
|
"learning_rate": 8.502628768296788e-05, |
|
"loss": 1.5235, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.2999385371850031, |
|
"grad_norm": 1.9676207304000854, |
|
"learning_rate": 8.487907601042777e-05, |
|
"loss": 1.5859, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.3011677934849416, |
|
"grad_norm": 3.5035860538482666, |
|
"learning_rate": 8.473127302815496e-05, |
|
"loss": 1.1743, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.30239704978488013, |
|
"grad_norm": 4.519472599029541, |
|
"learning_rate": 8.458288124187359e-05, |
|
"loss": 0.7165, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.30362630608481866, |
|
"grad_norm": 2.3718838691711426, |
|
"learning_rate": 8.443390316728987e-05, |
|
"loss": 1.1449, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.3048555623847572, |
|
"grad_norm": 2.1668829917907715, |
|
"learning_rate": 8.428434133004937e-05, |
|
"loss": 1.0383, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.3060848186846958, |
|
"grad_norm": 3.2350733280181885, |
|
"learning_rate": 8.413419826569435e-05, |
|
"loss": 1.2341, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.3073140749846343, |
|
"grad_norm": 2.3541886806488037, |
|
"learning_rate": 8.398347651962064e-05, |
|
"loss": 1.0355, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30854333128457284, |
|
"grad_norm": 0.2730487883090973, |
|
"learning_rate": 8.383217864703456e-05, |
|
"loss": 1.2813, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.30977258758451137, |
|
"grad_norm": 0.2517383098602295, |
|
"learning_rate": 8.36803072129096e-05, |
|
"loss": 1.1793, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.3110018438844499, |
|
"grad_norm": 0.28486472368240356, |
|
"learning_rate": 8.352786479194288e-05, |
|
"loss": 1.4065, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.31223110018438843, |
|
"grad_norm": 0.3247184455394745, |
|
"learning_rate": 8.337485396851155e-05, |
|
"loss": 1.4863, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.31346035648432696, |
|
"grad_norm": 0.26896461844444275, |
|
"learning_rate": 8.322127733662897e-05, |
|
"loss": 1.1373, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3146896127842655, |
|
"grad_norm": 0.29333245754241943, |
|
"learning_rate": 8.306713749990072e-05, |
|
"loss": 1.0615, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.3159188690842041, |
|
"grad_norm": 0.2958793640136719, |
|
"learning_rate": 8.291243707148048e-05, |
|
"loss": 0.9392, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.3171481253841426, |
|
"grad_norm": 0.3320540487766266, |
|
"learning_rate": 8.275717867402575e-05, |
|
"loss": 1.2935, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.31837738168408114, |
|
"grad_norm": 0.3567339777946472, |
|
"learning_rate": 8.260136493965326e-05, |
|
"loss": 1.0954, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.31960663798401967, |
|
"grad_norm": 0.38393881916999817, |
|
"learning_rate": 8.244499850989452e-05, |
|
"loss": 1.045, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3208358942839582, |
|
"grad_norm": 0.41993001103401184, |
|
"learning_rate": 8.228808203565095e-05, |
|
"loss": 1.2225, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.32206515058389673, |
|
"grad_norm": 0.6547941565513611, |
|
"learning_rate": 8.213061817714893e-05, |
|
"loss": 0.9286, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.32329440688383526, |
|
"grad_norm": 0.7117279767990112, |
|
"learning_rate": 8.197260960389474e-05, |
|
"loss": 0.5088, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.3245236631837738, |
|
"grad_norm": 0.7041743993759155, |
|
"learning_rate": 8.181405899462926e-05, |
|
"loss": 0.8899, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.3257529194837124, |
|
"grad_norm": 0.7142787575721741, |
|
"learning_rate": 8.16549690372826e-05, |
|
"loss": 0.7447, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3269821757836509, |
|
"grad_norm": 0.8879908323287964, |
|
"learning_rate": 8.14953424289285e-05, |
|
"loss": 1.2607, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.32821143208358944, |
|
"grad_norm": 0.9387282133102417, |
|
"learning_rate": 8.133518187573862e-05, |
|
"loss": 1.1611, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.32944068838352797, |
|
"grad_norm": 1.4039078950881958, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 0.9947, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.3306699446834665, |
|
"grad_norm": 3.3686740398406982, |
|
"learning_rate": 8.101326980475237e-05, |
|
"loss": 1.0783, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.33189920098340503, |
|
"grad_norm": 2.8384785652160645, |
|
"learning_rate": 8.085152374437525e-05, |
|
"loss": 0.9008, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.33312845728334356, |
|
"grad_norm": 2.453441619873047, |
|
"learning_rate": 8.06892546539083e-05, |
|
"loss": 0.5504, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.3343577135832821, |
|
"grad_norm": 2.592667579650879, |
|
"learning_rate": 8.052646528432158e-05, |
|
"loss": 0.7489, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.3355869698832207, |
|
"grad_norm": 1.9753395318984985, |
|
"learning_rate": 8.036315839540545e-05, |
|
"loss": 0.9747, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.3368162261831592, |
|
"grad_norm": 3.042698860168457, |
|
"learning_rate": 8.019933675572389e-05, |
|
"loss": 1.6841, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.33804548248309774, |
|
"grad_norm": 2.4343316555023193, |
|
"learning_rate": 8.00350031425675e-05, |
|
"loss": 0.869, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.33927473878303627, |
|
"grad_norm": 0.2026144415140152, |
|
"learning_rate": 7.98701603419064e-05, |
|
"loss": 0.8867, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.3405039950829748, |
|
"grad_norm": 0.24370141327381134, |
|
"learning_rate": 7.970481114834312e-05, |
|
"loss": 1.3135, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.34173325138291333, |
|
"grad_norm": 0.22894087433815002, |
|
"learning_rate": 7.953895836506508e-05, |
|
"loss": 1.0986, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.34296250768285186, |
|
"grad_norm": 0.2533970773220062, |
|
"learning_rate": 7.937260480379712e-05, |
|
"loss": 1.1821, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.3441917639827904, |
|
"grad_norm": 0.25789350271224976, |
|
"learning_rate": 7.920575328475385e-05, |
|
"loss": 1.1414, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.345421020282729, |
|
"grad_norm": 0.28820541501045227, |
|
"learning_rate": 7.903840663659186e-05, |
|
"loss": 1.3332, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.3466502765826675, |
|
"grad_norm": 0.28611505031585693, |
|
"learning_rate": 7.887056769636165e-05, |
|
"loss": 1.0901, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.34787953288260604, |
|
"grad_norm": 0.28022873401641846, |
|
"learning_rate": 7.870223930945972e-05, |
|
"loss": 0.8461, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.34910878918254457, |
|
"grad_norm": 0.3246136009693146, |
|
"learning_rate": 7.853342432958013e-05, |
|
"loss": 0.9325, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.3503380454824831, |
|
"grad_norm": 0.3149406611919403, |
|
"learning_rate": 7.836412561866629e-05, |
|
"loss": 1.013, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3515673017824216, |
|
"grad_norm": 0.3745490610599518, |
|
"learning_rate": 7.819434604686228e-05, |
|
"loss": 1.2624, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.35279655808236016, |
|
"grad_norm": 0.4822925329208374, |
|
"learning_rate": 7.802408849246442e-05, |
|
"loss": 1.2424, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.3540258143822987, |
|
"grad_norm": 0.6210641264915466, |
|
"learning_rate": 7.785335584187219e-05, |
|
"loss": 1.2527, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.3552550706822373, |
|
"grad_norm": 0.6488444805145264, |
|
"learning_rate": 7.768215098953952e-05, |
|
"loss": 0.7986, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.3564843269821758, |
|
"grad_norm": 0.760388195514679, |
|
"learning_rate": 7.751047683792561e-05, |
|
"loss": 1.0136, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35771358328211433, |
|
"grad_norm": 0.7666548490524292, |
|
"learning_rate": 7.73383362974458e-05, |
|
"loss": 0.8205, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.35894283958205286, |
|
"grad_norm": 0.7492078542709351, |
|
"learning_rate": 7.71657322864221e-05, |
|
"loss": 0.9254, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.3601720958819914, |
|
"grad_norm": 0.9061193466186523, |
|
"learning_rate": 7.699266773103389e-05, |
|
"loss": 1.3013, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.3614013521819299, |
|
"grad_norm": 2.1404013633728027, |
|
"learning_rate": 7.681914556526817e-05, |
|
"loss": 1.5957, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.36263060848186845, |
|
"grad_norm": 2.647864580154419, |
|
"learning_rate": 7.664516873086987e-05, |
|
"loss": 1.1658, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.363859864781807, |
|
"grad_norm": 3.0906460285186768, |
|
"learning_rate": 7.647074017729202e-05, |
|
"loss": 1.1344, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.36508912108174557, |
|
"grad_norm": 2.2348814010620117, |
|
"learning_rate": 7.629586286164565e-05, |
|
"loss": 0.8813, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.3663183773816841, |
|
"grad_norm": 2.937446117401123, |
|
"learning_rate": 7.612053974864976e-05, |
|
"loss": 1.0414, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.36754763368162263, |
|
"grad_norm": 2.5343546867370605, |
|
"learning_rate": 7.594477381058098e-05, |
|
"loss": 1.1847, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.36877688998156116, |
|
"grad_norm": 2.8971638679504395, |
|
"learning_rate": 7.576856802722325e-05, |
|
"loss": 0.9029, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3700061462814997, |
|
"grad_norm": 0.1982557773590088, |
|
"learning_rate": 7.559192538581722e-05, |
|
"loss": 0.9314, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.3712354025814382, |
|
"grad_norm": 0.24721381068229675, |
|
"learning_rate": 7.541484888100974e-05, |
|
"loss": 1.2432, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.37246465888137675, |
|
"grad_norm": 0.24999506771564484, |
|
"learning_rate": 7.523734151480289e-05, |
|
"loss": 1.285, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.3736939151813153, |
|
"grad_norm": 0.267764151096344, |
|
"learning_rate": 7.505940629650326e-05, |
|
"loss": 1.198, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.3749231714812538, |
|
"grad_norm": 0.26003679633140564, |
|
"learning_rate": 7.488104624267091e-05, |
|
"loss": 1.2001, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3761524277811924, |
|
"grad_norm": 0.28197526931762695, |
|
"learning_rate": 7.470226437706813e-05, |
|
"loss": 1.1687, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.37738168408113093, |
|
"grad_norm": 0.29367661476135254, |
|
"learning_rate": 7.452306373060829e-05, |
|
"loss": 1.211, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.37861094038106946, |
|
"grad_norm": 0.2982727885246277, |
|
"learning_rate": 7.434344734130437e-05, |
|
"loss": 1.151, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.379840196681008, |
|
"grad_norm": 0.3283758759498596, |
|
"learning_rate": 7.416341825421754e-05, |
|
"loss": 0.9875, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.3810694529809465, |
|
"grad_norm": 0.32420334219932556, |
|
"learning_rate": 7.398297952140544e-05, |
|
"loss": 1.0796, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.38229870928088505, |
|
"grad_norm": 0.4046980142593384, |
|
"learning_rate": 7.380213420187055e-05, |
|
"loss": 1.1158, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.3835279655808236, |
|
"grad_norm": 0.391736775636673, |
|
"learning_rate": 7.36208853615082e-05, |
|
"loss": 1.1682, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.3847572218807621, |
|
"grad_norm": 0.6027556657791138, |
|
"learning_rate": 7.343923607305471e-05, |
|
"loss": 1.0696, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.3859864781807007, |
|
"grad_norm": 0.6483603119850159, |
|
"learning_rate": 7.325718941603527e-05, |
|
"loss": 0.7843, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.38721573448063923, |
|
"grad_norm": 0.6711483001708984, |
|
"learning_rate": 7.307474847671168e-05, |
|
"loss": 0.7247, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.38844499078057776, |
|
"grad_norm": 0.7372632026672363, |
|
"learning_rate": 7.289191634803003e-05, |
|
"loss": 1.0535, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.3896742470805163, |
|
"grad_norm": 0.7427420020103455, |
|
"learning_rate": 7.270869612956835e-05, |
|
"loss": 1.0563, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.3909035033804548, |
|
"grad_norm": 2.6449501514434814, |
|
"learning_rate": 7.252509092748401e-05, |
|
"loss": 1.3099, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.39213275968039335, |
|
"grad_norm": 3.1938464641571045, |
|
"learning_rate": 7.234110385446103e-05, |
|
"loss": 1.1728, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.3933620159803319, |
|
"grad_norm": 2.584103584289551, |
|
"learning_rate": 7.215673802965734e-05, |
|
"loss": 0.792, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3945912722802704, |
|
"grad_norm": 2.358025074005127, |
|
"learning_rate": 7.197199657865195e-05, |
|
"loss": 1.0462, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.395820528580209, |
|
"grad_norm": 2.9621617794036865, |
|
"learning_rate": 7.178688263339184e-05, |
|
"loss": 1.4222, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.3970497848801475, |
|
"grad_norm": 2.5362660884857178, |
|
"learning_rate": 7.160139933213898e-05, |
|
"loss": 1.1527, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.39827904118008606, |
|
"grad_norm": 2.4901375770568848, |
|
"learning_rate": 7.141554981941709e-05, |
|
"loss": 1.1712, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.3995082974800246, |
|
"grad_norm": 2.9214236736297607, |
|
"learning_rate": 7.12293372459583e-05, |
|
"loss": 1.1977, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.4007375537799631, |
|
"grad_norm": 0.24753543734550476, |
|
"learning_rate": 7.104276476864974e-05, |
|
"loss": 1.2176, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.40196681007990165, |
|
"grad_norm": 0.25986090302467346, |
|
"learning_rate": 7.085583555048008e-05, |
|
"loss": 1.2854, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.4031960663798402, |
|
"grad_norm": 0.2640175521373749, |
|
"learning_rate": 7.066855276048587e-05, |
|
"loss": 1.2204, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.4044253226797787, |
|
"grad_norm": 0.2603614330291748, |
|
"learning_rate": 7.048091957369776e-05, |
|
"loss": 1.2621, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.4056545789797173, |
|
"grad_norm": 0.2921195924282074, |
|
"learning_rate": 7.029293917108678e-05, |
|
"loss": 1.281, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4068838352796558, |
|
"grad_norm": 0.2984941899776459, |
|
"learning_rate": 7.010461473951033e-05, |
|
"loss": 1.071, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.40811309157959436, |
|
"grad_norm": 0.31219175457954407, |
|
"learning_rate": 6.991594947165818e-05, |
|
"loss": 1.3161, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.4093423478795329, |
|
"grad_norm": 0.31329602003097534, |
|
"learning_rate": 6.972694656599834e-05, |
|
"loss": 0.9854, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.4105716041794714, |
|
"grad_norm": 0.3356671929359436, |
|
"learning_rate": 6.953760922672286e-05, |
|
"loss": 1.02, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.41180086047940995, |
|
"grad_norm": 0.3843994438648224, |
|
"learning_rate": 6.934794066369348e-05, |
|
"loss": 1.2173, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.4130301167793485, |
|
"grad_norm": 0.45338544249534607, |
|
"learning_rate": 6.915794409238718e-05, |
|
"loss": 1.3614, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.414259373079287, |
|
"grad_norm": 0.4857298731803894, |
|
"learning_rate": 6.896762273384178e-05, |
|
"loss": 1.0175, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.4154886293792256, |
|
"grad_norm": 0.6512896418571472, |
|
"learning_rate": 6.877697981460125e-05, |
|
"loss": 0.6555, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.4167178856791641, |
|
"grad_norm": 0.6744720935821533, |
|
"learning_rate": 6.858601856666094e-05, |
|
"loss": 0.6057, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.41794714197910265, |
|
"grad_norm": 0.6527014374732971, |
|
"learning_rate": 6.839474222741299e-05, |
|
"loss": 0.9116, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4191763982790412, |
|
"grad_norm": 0.6935631036758423, |
|
"learning_rate": 6.820315403959123e-05, |
|
"loss": 0.9876, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.4204056545789797, |
|
"grad_norm": 0.6856899261474609, |
|
"learning_rate": 6.801125725121636e-05, |
|
"loss": 0.9591, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.42163491087891825, |
|
"grad_norm": 1.2577812671661377, |
|
"learning_rate": 6.781905511554079e-05, |
|
"loss": 1.3174, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.4228641671788568, |
|
"grad_norm": 2.421950578689575, |
|
"learning_rate": 6.762655089099353e-05, |
|
"loss": 1.6442, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.4240934234787953, |
|
"grad_norm": 2.6432454586029053, |
|
"learning_rate": 6.743374784112501e-05, |
|
"loss": 1.0468, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.4253226797787339, |
|
"grad_norm": 2.7061827182769775, |
|
"learning_rate": 6.724064923455155e-05, |
|
"loss": 1.1526, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.4265519360786724, |
|
"grad_norm": 2.466057777404785, |
|
"learning_rate": 6.704725834490024e-05, |
|
"loss": 1.1463, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.42778119237861095, |
|
"grad_norm": 2.753512144088745, |
|
"learning_rate": 6.685357845075315e-05, |
|
"loss": 0.9492, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.4290104486785495, |
|
"grad_norm": 2.76118803024292, |
|
"learning_rate": 6.665961283559197e-05, |
|
"loss": 0.8543, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.430239704978488, |
|
"grad_norm": 2.295574426651001, |
|
"learning_rate": 6.646536478774222e-05, |
|
"loss": 0.9564, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.43146896127842654, |
|
"grad_norm": 0.22731368243694305, |
|
"learning_rate": 6.627083760031754e-05, |
|
"loss": 0.9719, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.4326982175783651, |
|
"grad_norm": 0.20097078382968903, |
|
"learning_rate": 6.60760345711639e-05, |
|
"loss": 1.0094, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.4339274738783036, |
|
"grad_norm": 0.23321934044361115, |
|
"learning_rate": 6.58809590028036e-05, |
|
"loss": 1.101, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.4351567301782422, |
|
"grad_norm": 0.27995625138282776, |
|
"learning_rate": 6.568561420237935e-05, |
|
"loss": 1.3545, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.4363859864781807, |
|
"grad_norm": 0.259082168340683, |
|
"learning_rate": 6.54900034815982e-05, |
|
"loss": 1.1598, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.43761524277811925, |
|
"grad_norm": 0.2688703238964081, |
|
"learning_rate": 6.52941301566754e-05, |
|
"loss": 1.1141, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.4388444990780578, |
|
"grad_norm": 0.34018442034721375, |
|
"learning_rate": 6.50979975482781e-05, |
|
"loss": 1.2811, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.4400737553779963, |
|
"grad_norm": 0.2925175130367279, |
|
"learning_rate": 6.490160898146918e-05, |
|
"loss": 0.9025, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.44130301167793484, |
|
"grad_norm": 0.30208972096443176, |
|
"learning_rate": 6.470496778565082e-05, |
|
"loss": 1.0301, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.4425322679778734, |
|
"grad_norm": 0.3110770285129547, |
|
"learning_rate": 6.4508077294508e-05, |
|
"loss": 1.0911, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4437615242778119, |
|
"grad_norm": 0.426252543926239, |
|
"learning_rate": 6.431094084595209e-05, |
|
"loss": 1.1214, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.44499078057775043, |
|
"grad_norm": 0.4019356966018677, |
|
"learning_rate": 6.411356178206419e-05, |
|
"loss": 1.3063, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.446220036877689, |
|
"grad_norm": 0.4622703194618225, |
|
"learning_rate": 6.391594344903848e-05, |
|
"loss": 1.1208, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.44744929317762755, |
|
"grad_norm": 0.5752270817756653, |
|
"learning_rate": 6.371808919712549e-05, |
|
"loss": 0.9653, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.4486785494775661, |
|
"grad_norm": 0.6309720277786255, |
|
"learning_rate": 6.35200023805754e-05, |
|
"loss": 0.5664, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.4499078057775046, |
|
"grad_norm": 0.612684965133667, |
|
"learning_rate": 6.332168635758097e-05, |
|
"loss": 1.0443, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.45113706207744314, |
|
"grad_norm": 0.6797056794166565, |
|
"learning_rate": 6.31231444902208e-05, |
|
"loss": 0.8389, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.45236631837738167, |
|
"grad_norm": 1.222960352897644, |
|
"learning_rate": 6.292438014440227e-05, |
|
"loss": 1.4688, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.4535955746773202, |
|
"grad_norm": 2.9443516731262207, |
|
"learning_rate": 6.272539668980441e-05, |
|
"loss": 1.0079, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.45482483097725873, |
|
"grad_norm": 3.0168612003326416, |
|
"learning_rate": 6.252619749982089e-05, |
|
"loss": 0.9232, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4560540872771973, |
|
"grad_norm": 1.9470983743667603, |
|
"learning_rate": 6.232678595150275e-05, |
|
"loss": 0.8126, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.45728334357713585, |
|
"grad_norm": 2.4769911766052246, |
|
"learning_rate": 6.212716542550112e-05, |
|
"loss": 0.7786, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.4585125998770744, |
|
"grad_norm": 2.849158525466919, |
|
"learning_rate": 6.192733930601005e-05, |
|
"loss": 1.1914, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.4597418561770129, |
|
"grad_norm": 2.6154119968414307, |
|
"learning_rate": 6.172731098070899e-05, |
|
"loss": 0.9171, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.46097111247695144, |
|
"grad_norm": 3.5901479721069336, |
|
"learning_rate": 6.152708384070541e-05, |
|
"loss": 1.1269, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.46220036877688997, |
|
"grad_norm": 0.23536159098148346, |
|
"learning_rate": 6.132666128047732e-05, |
|
"loss": 0.8768, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.4634296250768285, |
|
"grad_norm": 0.24834086000919342, |
|
"learning_rate": 6.112604669781572e-05, |
|
"loss": 1.0644, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.46465888137676703, |
|
"grad_norm": 0.3041445016860962, |
|
"learning_rate": 6.0925243493767016e-05, |
|
"loss": 1.2779, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.4658881376767056, |
|
"grad_norm": 0.3158765137195587, |
|
"learning_rate": 6.0724255072575275e-05, |
|
"loss": 1.352, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.46711739397664415, |
|
"grad_norm": 0.2845201790332794, |
|
"learning_rate": 6.0523084841624635e-05, |
|
"loss": 1.2567, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4683466502765827, |
|
"grad_norm": 0.2909673750400543, |
|
"learning_rate": 6.0321736211381464e-05, |
|
"loss": 1.1735, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.4695759065765212, |
|
"grad_norm": 0.2946690022945404, |
|
"learning_rate": 6.0120212595336545e-05, |
|
"loss": 1.1514, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.47080516287645974, |
|
"grad_norm": 0.302846223115921, |
|
"learning_rate": 5.9918517409947215e-05, |
|
"loss": 1.0621, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.47203441917639827, |
|
"grad_norm": 0.3197241425514221, |
|
"learning_rate": 5.971665407457948e-05, |
|
"loss": 1.0299, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.4732636754763368, |
|
"grad_norm": 0.342777281999588, |
|
"learning_rate": 5.951462601144998e-05, |
|
"loss": 1.0858, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.47449293177627533, |
|
"grad_norm": 0.3554008901119232, |
|
"learning_rate": 5.931243664556803e-05, |
|
"loss": 1.1441, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.4757221880762139, |
|
"grad_norm": 0.36057665944099426, |
|
"learning_rate": 5.9110089404677524e-05, |
|
"loss": 1.1836, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.47695144437615244, |
|
"grad_norm": 0.5004509091377258, |
|
"learning_rate": 5.890758771919884e-05, |
|
"loss": 1.4109, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.478180700676091, |
|
"grad_norm": 0.5211744904518127, |
|
"learning_rate": 5.8704935022170684e-05, |
|
"loss": 1.0097, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.4794099569760295, |
|
"grad_norm": 0.7474620938301086, |
|
"learning_rate": 5.8502134749191816e-05, |
|
"loss": 0.8777, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.48063921327596804, |
|
"grad_norm": 0.7044636011123657, |
|
"learning_rate": 5.8299190338362996e-05, |
|
"loss": 0.9007, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.48186846957590657, |
|
"grad_norm": 0.6484948992729187, |
|
"learning_rate": 5.8096105230228435e-05, |
|
"loss": 0.8261, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.4830977258758451, |
|
"grad_norm": 0.672816812992096, |
|
"learning_rate": 5.78928828677177e-05, |
|
"loss": 1.0531, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.4843269821757836, |
|
"grad_norm": 1.1637803316116333, |
|
"learning_rate": 5.768952669608724e-05, |
|
"loss": 1.1586, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.4855562384757222, |
|
"grad_norm": 3.1862003803253174, |
|
"learning_rate": 5.748604016286192e-05, |
|
"loss": 1.6232, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.48678549477566074, |
|
"grad_norm": 3.3833253383636475, |
|
"learning_rate": 5.728242671777672e-05, |
|
"loss": 1.0918, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4880147510755993, |
|
"grad_norm": 3.116319417953491, |
|
"learning_rate": 5.707868981271815e-05, |
|
"loss": 0.8615, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.4892440073755378, |
|
"grad_norm": 2.5967965126037598, |
|
"learning_rate": 5.687483290166573e-05, |
|
"loss": 0.8579, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.49047326367547633, |
|
"grad_norm": 3.7683048248291016, |
|
"learning_rate": 5.6670859440633486e-05, |
|
"loss": 1.0777, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.49170251997541486, |
|
"grad_norm": 3.182317018508911, |
|
"learning_rate": 5.646677288761132e-05, |
|
"loss": 0.8932, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4929317762753534, |
|
"grad_norm": 0.19372360408306122, |
|
"learning_rate": 5.6262576702506406e-05, |
|
"loss": 0.8516, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.4941610325752919, |
|
"grad_norm": 0.223338320851326, |
|
"learning_rate": 5.6058274347084504e-05, |
|
"loss": 1.1287, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.4953902888752305, |
|
"grad_norm": 0.23367400467395782, |
|
"learning_rate": 5.585386928491134e-05, |
|
"loss": 1.1128, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.49661954517516904, |
|
"grad_norm": 0.2717371881008148, |
|
"learning_rate": 5.5649364981293786e-05, |
|
"loss": 1.2813, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.49784880147510757, |
|
"grad_norm": 0.25909724831581116, |
|
"learning_rate": 5.54447649032212e-05, |
|
"loss": 1.2149, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.4990780577750461, |
|
"grad_norm": 0.25411197543144226, |
|
"learning_rate": 5.5240072519306606e-05, |
|
"loss": 1.0679, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5003073140749846, |
|
"grad_norm": 0.2962128520011902, |
|
"learning_rate": 5.503529129972792e-05, |
|
"loss": 1.1156, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.5015365703749232, |
|
"grad_norm": 0.29252949357032776, |
|
"learning_rate": 5.483042471616908e-05, |
|
"loss": 1.125, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.5015365703749232, |
|
"eval_loss": 0.9695894122123718, |
|
"eval_runtime": 65.3254, |
|
"eval_samples_per_second": 10.486, |
|
"eval_steps_per_second": 5.251, |
|
"step": 408 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 813, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 204, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.411229584359424e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|