|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0788133053640996, |
|
"eval_steps": 500, |
|
"global_step": 18000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005993407252022775, |
|
"grad_norm": 0.8397009372711182, |
|
"learning_rate": 5.991611743559017e-07, |
|
"loss": 0.0366, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01198681450404555, |
|
"grad_norm": 0.7360026240348816, |
|
"learning_rate": 1.1983223487118035e-06, |
|
"loss": 0.0143, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.017980221756068324, |
|
"grad_norm": 0.26396483182907104, |
|
"learning_rate": 1.7974835230677055e-06, |
|
"loss": 0.0091, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0239736290080911, |
|
"grad_norm": 0.08779273182153702, |
|
"learning_rate": 2.396644697423607e-06, |
|
"loss": 0.0059, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.029967036260113874, |
|
"grad_norm": 0.5255675911903381, |
|
"learning_rate": 2.995805871779509e-06, |
|
"loss": 0.0059, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03596044351213665, |
|
"grad_norm": 0.2772226929664612, |
|
"learning_rate": 3.594967046135411e-06, |
|
"loss": 0.005, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.041953850764159424, |
|
"grad_norm": 0.29560720920562744, |
|
"learning_rate": 4.194128220491313e-06, |
|
"loss": 0.0041, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0479472580161822, |
|
"grad_norm": 0.4243590235710144, |
|
"learning_rate": 4.793289394847214e-06, |
|
"loss": 0.0038, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05394066526820498, |
|
"grad_norm": 0.12234604358673096, |
|
"learning_rate": 5.392450569203116e-06, |
|
"loss": 0.0033, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05993407252022775, |
|
"grad_norm": 0.17332005500793457, |
|
"learning_rate": 5.991611743559018e-06, |
|
"loss": 0.0045, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.06592747977225052, |
|
"grad_norm": 0.017084548249840736, |
|
"learning_rate": 6.59077291791492e-06, |
|
"loss": 0.0031, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0719208870242733, |
|
"grad_norm": 0.04909040033817291, |
|
"learning_rate": 7.189934092270822e-06, |
|
"loss": 0.0034, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07791429427629608, |
|
"grad_norm": 0.03835730627179146, |
|
"learning_rate": 7.789095266626723e-06, |
|
"loss": 0.0028, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.08390770152831885, |
|
"grad_norm": 0.04889771714806557, |
|
"learning_rate": 8.388256440982625e-06, |
|
"loss": 0.0028, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.08990110878034162, |
|
"grad_norm": 0.1031421571969986, |
|
"learning_rate": 8.987417615338527e-06, |
|
"loss": 0.003, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.0958945160323644, |
|
"grad_norm": 0.11215908825397491, |
|
"learning_rate": 9.586578789694428e-06, |
|
"loss": 0.0027, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.10188792328438717, |
|
"grad_norm": 0.1708650439977646, |
|
"learning_rate": 9.99022112867102e-06, |
|
"loss": 0.0025, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.10788133053640996, |
|
"grad_norm": 0.01850762963294983, |
|
"learning_rate": 9.958676382448504e-06, |
|
"loss": 0.0025, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.11387473778843273, |
|
"grad_norm": 0.10600468516349792, |
|
"learning_rate": 9.927131636225988e-06, |
|
"loss": 0.0025, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1198681450404555, |
|
"grad_norm": 0.16077758371829987, |
|
"learning_rate": 9.895586890003471e-06, |
|
"loss": 0.0027, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.12586155229247828, |
|
"grad_norm": 0.3142828047275543, |
|
"learning_rate": 9.864042143780953e-06, |
|
"loss": 0.0022, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.13185495954450105, |
|
"grad_norm": 0.18406708538532257, |
|
"learning_rate": 9.832497397558437e-06, |
|
"loss": 0.0025, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.13784836679652382, |
|
"grad_norm": 0.10599557310342789, |
|
"learning_rate": 9.80095265133592e-06, |
|
"loss": 0.0027, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.1438417740485466, |
|
"grad_norm": 0.041681960225105286, |
|
"learning_rate": 9.769407905113404e-06, |
|
"loss": 0.0027, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1498351813005694, |
|
"grad_norm": 0.26586103439331055, |
|
"learning_rate": 9.737863158890888e-06, |
|
"loss": 0.0031, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.15582858855259216, |
|
"grad_norm": 0.1568969488143921, |
|
"learning_rate": 9.70631841266837e-06, |
|
"loss": 0.0025, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.16182199580461493, |
|
"grad_norm": 0.09259970486164093, |
|
"learning_rate": 9.674773666445855e-06, |
|
"loss": 0.0023, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.1678154030566377, |
|
"grad_norm": 0.03380216658115387, |
|
"learning_rate": 9.643228920223337e-06, |
|
"loss": 0.0022, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.17380881030866047, |
|
"grad_norm": 0.18946796655654907, |
|
"learning_rate": 9.611684174000821e-06, |
|
"loss": 0.0025, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.17980221756068324, |
|
"grad_norm": 0.3344770073890686, |
|
"learning_rate": 9.580139427778305e-06, |
|
"loss": 0.0021, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.18579562481270603, |
|
"grad_norm": 0.04218849539756775, |
|
"learning_rate": 9.548594681555787e-06, |
|
"loss": 0.0024, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.1917890320647288, |
|
"grad_norm": 0.0481434129178524, |
|
"learning_rate": 9.517049935333272e-06, |
|
"loss": 0.0027, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.19778243931675157, |
|
"grad_norm": 0.32030656933784485, |
|
"learning_rate": 9.485505189110754e-06, |
|
"loss": 0.0025, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.20377584656877434, |
|
"grad_norm": 0.19509385526180267, |
|
"learning_rate": 9.453960442888238e-06, |
|
"loss": 0.0022, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2097692538207971, |
|
"grad_norm": 0.08745113760232925, |
|
"learning_rate": 9.422415696665721e-06, |
|
"loss": 0.0026, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2157626610728199, |
|
"grad_norm": 0.11743105947971344, |
|
"learning_rate": 9.390870950443205e-06, |
|
"loss": 0.0021, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.22175606832484268, |
|
"grad_norm": 0.1497587114572525, |
|
"learning_rate": 9.359326204220689e-06, |
|
"loss": 0.0026, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.22774947557686545, |
|
"grad_norm": 0.07227639853954315, |
|
"learning_rate": 9.32778145799817e-06, |
|
"loss": 0.0024, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.23374288282888822, |
|
"grad_norm": 0.022099023684859276, |
|
"learning_rate": 9.296236711775654e-06, |
|
"loss": 0.0019, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.239736290080911, |
|
"grad_norm": 0.09603813290596008, |
|
"learning_rate": 9.264691965553138e-06, |
|
"loss": 0.0019, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.24572969733293376, |
|
"grad_norm": 0.09311718493700027, |
|
"learning_rate": 9.233147219330622e-06, |
|
"loss": 0.002, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.25172310458495656, |
|
"grad_norm": 0.06892485171556473, |
|
"learning_rate": 9.201602473108105e-06, |
|
"loss": 0.0022, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.2577165118369793, |
|
"grad_norm": 0.2696809470653534, |
|
"learning_rate": 9.170057726885589e-06, |
|
"loss": 0.0024, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.2637099190890021, |
|
"grad_norm": 0.12481023371219635, |
|
"learning_rate": 9.138512980663071e-06, |
|
"loss": 0.0021, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.2697033263410249, |
|
"grad_norm": 0.029085570946335793, |
|
"learning_rate": 9.106968234440555e-06, |
|
"loss": 0.0025, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.27569673359304764, |
|
"grad_norm": 0.16772325336933136, |
|
"learning_rate": 9.075423488218038e-06, |
|
"loss": 0.0019, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.28169014084507044, |
|
"grad_norm": 0.25038984417915344, |
|
"learning_rate": 9.04387874199552e-06, |
|
"loss": 0.0022, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.2876835480970932, |
|
"grad_norm": 0.009772785007953644, |
|
"learning_rate": 9.012333995773006e-06, |
|
"loss": 0.002, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.293676955349116, |
|
"grad_norm": 0.10010802745819092, |
|
"learning_rate": 8.980789249550487e-06, |
|
"loss": 0.0021, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.2996703626011388, |
|
"grad_norm": 0.019169898703694344, |
|
"learning_rate": 8.949244503327971e-06, |
|
"loss": 0.0024, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3056637698531615, |
|
"grad_norm": 0.039739012718200684, |
|
"learning_rate": 8.917699757105455e-06, |
|
"loss": 0.0022, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.3116571771051843, |
|
"grad_norm": 0.20961305499076843, |
|
"learning_rate": 8.886155010882938e-06, |
|
"loss": 0.0021, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.31765058435720706, |
|
"grad_norm": 0.07605484127998352, |
|
"learning_rate": 8.854610264660422e-06, |
|
"loss": 0.002, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.32364399160922985, |
|
"grad_norm": 0.01589258573949337, |
|
"learning_rate": 8.823065518437904e-06, |
|
"loss": 0.0022, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.3296373988612526, |
|
"grad_norm": 0.10248999297618866, |
|
"learning_rate": 8.791520772215388e-06, |
|
"loss": 0.0023, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.3356308061132754, |
|
"grad_norm": 0.09167122095823288, |
|
"learning_rate": 8.759976025992871e-06, |
|
"loss": 0.002, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.3416242133652982, |
|
"grad_norm": 0.23392055928707123, |
|
"learning_rate": 8.728431279770355e-06, |
|
"loss": 0.0021, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.34761762061732093, |
|
"grad_norm": 0.040714360773563385, |
|
"learning_rate": 8.696886533547839e-06, |
|
"loss": 0.0025, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.35361102786934373, |
|
"grad_norm": 0.184820294380188, |
|
"learning_rate": 8.665341787325322e-06, |
|
"loss": 0.0021, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3596044351213665, |
|
"grad_norm": 0.04772236570715904, |
|
"learning_rate": 8.633797041102804e-06, |
|
"loss": 0.0022, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.36559784237338927, |
|
"grad_norm": 0.12407626956701279, |
|
"learning_rate": 8.60225229488029e-06, |
|
"loss": 0.0018, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.37159124962541207, |
|
"grad_norm": 0.1552393138408661, |
|
"learning_rate": 8.570707548657772e-06, |
|
"loss": 0.0024, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.3775846568774348, |
|
"grad_norm": 0.005017109680920839, |
|
"learning_rate": 8.539162802435255e-06, |
|
"loss": 0.0022, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.3835780641294576, |
|
"grad_norm": 0.00316947465762496, |
|
"learning_rate": 8.507618056212739e-06, |
|
"loss": 0.0021, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.38957147138148035, |
|
"grad_norm": 0.08644753694534302, |
|
"learning_rate": 8.476073309990221e-06, |
|
"loss": 0.0016, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.39556487863350315, |
|
"grad_norm": 0.23877011239528656, |
|
"learning_rate": 8.444528563767705e-06, |
|
"loss": 0.0023, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.40155828588552595, |
|
"grad_norm": 0.12397243827581406, |
|
"learning_rate": 8.412983817545188e-06, |
|
"loss": 0.002, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.4075516931375487, |
|
"grad_norm": 0.08488207310438156, |
|
"learning_rate": 8.381439071322672e-06, |
|
"loss": 0.002, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.4135451003895715, |
|
"grad_norm": 0.15658150613307953, |
|
"learning_rate": 8.349894325100156e-06, |
|
"loss": 0.0021, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.4195385076415942, |
|
"grad_norm": 0.09054456651210785, |
|
"learning_rate": 8.31834957887764e-06, |
|
"loss": 0.0022, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.1383715718984604, |
|
"learning_rate": 8.286804832655121e-06, |
|
"loss": 0.0019, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.4315253221456398, |
|
"grad_norm": 0.23421403765678406, |
|
"learning_rate": 8.255260086432605e-06, |
|
"loss": 0.0021, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.43751872939766256, |
|
"grad_norm": 0.07612959295511246, |
|
"learning_rate": 8.223715340210089e-06, |
|
"loss": 0.0018, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.44351213664968536, |
|
"grad_norm": 0.08813223987817764, |
|
"learning_rate": 8.192170593987572e-06, |
|
"loss": 0.0028, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.4495055439017081, |
|
"grad_norm": 0.11603320389986038, |
|
"learning_rate": 8.160625847765056e-06, |
|
"loss": 0.0021, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4554989511537309, |
|
"grad_norm": 0.06462118774652481, |
|
"learning_rate": 8.129081101542538e-06, |
|
"loss": 0.0021, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.46149235840575364, |
|
"grad_norm": 0.08253411203622818, |
|
"learning_rate": 8.097536355320023e-06, |
|
"loss": 0.0019, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.46748576565777644, |
|
"grad_norm": 0.017711922526359558, |
|
"learning_rate": 8.065991609097505e-06, |
|
"loss": 0.0018, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.47347917290979924, |
|
"grad_norm": 0.16423271596431732, |
|
"learning_rate": 8.034446862874989e-06, |
|
"loss": 0.0021, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.479472580161822, |
|
"grad_norm": 0.17104622721672058, |
|
"learning_rate": 8.002902116652473e-06, |
|
"loss": 0.0022, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4854659874138448, |
|
"grad_norm": 0.11236003786325455, |
|
"learning_rate": 7.971357370429955e-06, |
|
"loss": 0.002, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.4914593946658675, |
|
"grad_norm": 0.004910625517368317, |
|
"learning_rate": 7.93981262420744e-06, |
|
"loss": 0.0017, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.4974528019178903, |
|
"grad_norm": 0.015166868455708027, |
|
"learning_rate": 7.908267877984922e-06, |
|
"loss": 0.0016, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.5034462091699131, |
|
"grad_norm": 0.04219336435198784, |
|
"learning_rate": 7.876723131762406e-06, |
|
"loss": 0.0019, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5094396164219359, |
|
"grad_norm": 0.08096965402364731, |
|
"learning_rate": 7.84517838553989e-06, |
|
"loss": 0.002, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.5154330236739586, |
|
"grad_norm": 0.27304044365882874, |
|
"learning_rate": 7.813633639317373e-06, |
|
"loss": 0.002, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.5214264309259814, |
|
"grad_norm": 0.023843977600336075, |
|
"learning_rate": 7.782088893094857e-06, |
|
"loss": 0.0021, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5274198381780042, |
|
"grad_norm": 0.06996838003396988, |
|
"learning_rate": 7.750544146872338e-06, |
|
"loss": 0.0016, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.533413245430027, |
|
"grad_norm": 0.09238845109939575, |
|
"learning_rate": 7.718999400649822e-06, |
|
"loss": 0.0017, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.5394066526820498, |
|
"grad_norm": 0.031245483085513115, |
|
"learning_rate": 7.687454654427306e-06, |
|
"loss": 0.0019, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5454000599340725, |
|
"grad_norm": 0.02232646569609642, |
|
"learning_rate": 7.65590990820479e-06, |
|
"loss": 0.0022, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.5513934671860953, |
|
"grad_norm": 0.18810293078422546, |
|
"learning_rate": 7.624365161982272e-06, |
|
"loss": 0.002, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5573868744381181, |
|
"grad_norm": 0.04845254495739937, |
|
"learning_rate": 7.592820415759756e-06, |
|
"loss": 0.0021, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5633802816901409, |
|
"grad_norm": 0.12073975801467896, |
|
"learning_rate": 7.561275669537239e-06, |
|
"loss": 0.0021, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5693736889421637, |
|
"grad_norm": 0.03330647572875023, |
|
"learning_rate": 7.529730923314722e-06, |
|
"loss": 0.002, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5753670961941864, |
|
"grad_norm": 0.23292703926563263, |
|
"learning_rate": 7.498186177092206e-06, |
|
"loss": 0.0018, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.5813605034462092, |
|
"grad_norm": 0.3227817118167877, |
|
"learning_rate": 7.466641430869689e-06, |
|
"loss": 0.0017, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.587353910698232, |
|
"grad_norm": 0.03530238941311836, |
|
"learning_rate": 7.4350966846471726e-06, |
|
"loss": 0.0023, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.5933473179502547, |
|
"grad_norm": 0.1631837785243988, |
|
"learning_rate": 7.403551938424655e-06, |
|
"loss": 0.0019, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5993407252022775, |
|
"grad_norm": 0.11341429501771927, |
|
"learning_rate": 7.37200719220214e-06, |
|
"loss": 0.0018, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.6053341324543002, |
|
"grad_norm": 0.19524067640304565, |
|
"learning_rate": 7.340462445979623e-06, |
|
"loss": 0.0021, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.611327539706323, |
|
"grad_norm": 0.058198366314172745, |
|
"learning_rate": 7.308917699757106e-06, |
|
"loss": 0.0018, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.6173209469583458, |
|
"grad_norm": 0.02788078971207142, |
|
"learning_rate": 7.277372953534589e-06, |
|
"loss": 0.0014, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.6233143542103686, |
|
"grad_norm": 0.07168685644865036, |
|
"learning_rate": 7.245828207312072e-06, |
|
"loss": 0.0017, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.6293077614623914, |
|
"grad_norm": 0.07542666047811508, |
|
"learning_rate": 7.2142834610895565e-06, |
|
"loss": 0.0022, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6353011687144141, |
|
"grad_norm": 0.1050957664847374, |
|
"learning_rate": 7.182738714867039e-06, |
|
"loss": 0.0017, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6412945759664369, |
|
"grad_norm": 0.02330237440764904, |
|
"learning_rate": 7.151193968644523e-06, |
|
"loss": 0.002, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6472879832184597, |
|
"grad_norm": 0.019814783707261086, |
|
"learning_rate": 7.119649222422006e-06, |
|
"loss": 0.0019, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6532813904704825, |
|
"grad_norm": 0.041212160140275955, |
|
"learning_rate": 7.08810447619949e-06, |
|
"loss": 0.0022, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6592747977225052, |
|
"grad_norm": 0.104148730635643, |
|
"learning_rate": 7.056559729976973e-06, |
|
"loss": 0.0017, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.665268204974528, |
|
"grad_norm": 0.060578759759664536, |
|
"learning_rate": 7.025014983754457e-06, |
|
"loss": 0.0019, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.6712616122265508, |
|
"grad_norm": 0.014108662493526936, |
|
"learning_rate": 6.99347023753194e-06, |
|
"loss": 0.002, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.6772550194785736, |
|
"grad_norm": 0.06860730797052383, |
|
"learning_rate": 6.9619254913094224e-06, |
|
"loss": 0.0018, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.6832484267305964, |
|
"grad_norm": 0.2818455696105957, |
|
"learning_rate": 6.930380745086906e-06, |
|
"loss": 0.0015, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.6892418339826191, |
|
"grad_norm": 0.09976188093423843, |
|
"learning_rate": 6.89883599886439e-06, |
|
"loss": 0.0017, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6952352412346419, |
|
"grad_norm": 0.04988027364015579, |
|
"learning_rate": 6.8672912526418734e-06, |
|
"loss": 0.0016, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.7012286484866647, |
|
"grad_norm": 0.061295535415410995, |
|
"learning_rate": 6.835746506419356e-06, |
|
"loss": 0.0016, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.7072220557386875, |
|
"grad_norm": 0.04820416495203972, |
|
"learning_rate": 6.80420176019684e-06, |
|
"loss": 0.0017, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.7132154629907103, |
|
"grad_norm": 0.08933009207248688, |
|
"learning_rate": 6.772657013974323e-06, |
|
"loss": 0.002, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.719208870242733, |
|
"grad_norm": 0.057753268629312515, |
|
"learning_rate": 6.7411122677518055e-06, |
|
"loss": 0.0018, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7252022774947557, |
|
"grad_norm": 0.020321357995271683, |
|
"learning_rate": 6.70956752152929e-06, |
|
"loss": 0.0017, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.7311956847467785, |
|
"grad_norm": 0.258957177400589, |
|
"learning_rate": 6.678022775306773e-06, |
|
"loss": 0.0019, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.7371890919988013, |
|
"grad_norm": 0.1562880277633667, |
|
"learning_rate": 6.6464780290842565e-06, |
|
"loss": 0.002, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7431824992508241, |
|
"grad_norm": 0.0703672245144844, |
|
"learning_rate": 6.614933282861739e-06, |
|
"loss": 0.0018, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.7491759065028468, |
|
"grad_norm": 0.015919741243124008, |
|
"learning_rate": 6.583388536639224e-06, |
|
"loss": 0.0018, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.7551693137548696, |
|
"grad_norm": 0.06606917828321457, |
|
"learning_rate": 6.551843790416707e-06, |
|
"loss": 0.0022, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7611627210068924, |
|
"grad_norm": 0.1327201873064041, |
|
"learning_rate": 6.52029904419419e-06, |
|
"loss": 0.002, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.7671561282589152, |
|
"grad_norm": 0.10167068988084793, |
|
"learning_rate": 6.488754297971673e-06, |
|
"loss": 0.0018, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.773149535510938, |
|
"grad_norm": 0.20014306902885437, |
|
"learning_rate": 6.457209551749156e-06, |
|
"loss": 0.0019, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.7791429427629607, |
|
"grad_norm": 0.10611408203840256, |
|
"learning_rate": 6.4256648055266405e-06, |
|
"loss": 0.0016, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.7851363500149835, |
|
"grad_norm": 0.004227208439260721, |
|
"learning_rate": 6.394120059304123e-06, |
|
"loss": 0.0018, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.7911297572670063, |
|
"grad_norm": 0.04251255840063095, |
|
"learning_rate": 6.362575313081607e-06, |
|
"loss": 0.0022, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.7971231645190291, |
|
"grad_norm": 0.09611974656581879, |
|
"learning_rate": 6.33103056685909e-06, |
|
"loss": 0.0019, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.8031165717710519, |
|
"grad_norm": 0.060009848326444626, |
|
"learning_rate": 6.299485820636574e-06, |
|
"loss": 0.0019, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.8091099790230746, |
|
"grad_norm": 0.027135098353028297, |
|
"learning_rate": 6.267941074414057e-06, |
|
"loss": 0.0016, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.8151033862750974, |
|
"grad_norm": 0.09115968644618988, |
|
"learning_rate": 6.236396328191541e-06, |
|
"loss": 0.0017, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8210967935271202, |
|
"grad_norm": 0.3819001317024231, |
|
"learning_rate": 6.204851581969024e-06, |
|
"loss": 0.0019, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.827090200779143, |
|
"grad_norm": 0.07268409430980682, |
|
"learning_rate": 6.173306835746506e-06, |
|
"loss": 0.002, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.8330836080311658, |
|
"grad_norm": 0.1490897685289383, |
|
"learning_rate": 6.14176208952399e-06, |
|
"loss": 0.0015, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.8390770152831885, |
|
"grad_norm": 0.07468798011541367, |
|
"learning_rate": 6.110217343301474e-06, |
|
"loss": 0.0017, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.8450704225352113, |
|
"grad_norm": 0.045000866055488586, |
|
"learning_rate": 6.078672597078957e-06, |
|
"loss": 0.0019, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.22245222330093384, |
|
"learning_rate": 6.04712785085644e-06, |
|
"loss": 0.0015, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.8570572370392568, |
|
"grad_norm": 0.09135129302740097, |
|
"learning_rate": 6.015583104633924e-06, |
|
"loss": 0.002, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.8630506442912796, |
|
"grad_norm": 0.043701499700546265, |
|
"learning_rate": 5.984038358411407e-06, |
|
"loss": 0.0017, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.8690440515433023, |
|
"grad_norm": 0.1364869773387909, |
|
"learning_rate": 5.9524936121888895e-06, |
|
"loss": 0.0019, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.8750374587953251, |
|
"grad_norm": 0.08669265359640121, |
|
"learning_rate": 5.920948865966374e-06, |
|
"loss": 0.002, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.8810308660473479, |
|
"grad_norm": 0.00844608899205923, |
|
"learning_rate": 5.889404119743857e-06, |
|
"loss": 0.0016, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.8870242732993707, |
|
"grad_norm": 0.027935262769460678, |
|
"learning_rate": 5.8578593735213405e-06, |
|
"loss": 0.0018, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.8930176805513935, |
|
"grad_norm": 0.0481196753680706, |
|
"learning_rate": 5.826314627298823e-06, |
|
"loss": 0.0019, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.8990110878034162, |
|
"grad_norm": 0.021947329863905907, |
|
"learning_rate": 5.794769881076308e-06, |
|
"loss": 0.0015, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.905004495055439, |
|
"grad_norm": 0.08527759462594986, |
|
"learning_rate": 5.763225134853791e-06, |
|
"loss": 0.0017, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.9109979023074618, |
|
"grad_norm": 0.021068023517727852, |
|
"learning_rate": 5.731680388631274e-06, |
|
"loss": 0.0018, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.9169913095594846, |
|
"grad_norm": 0.08113428950309753, |
|
"learning_rate": 5.700135642408757e-06, |
|
"loss": 0.0017, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.9229847168115073, |
|
"grad_norm": 0.10709325969219208, |
|
"learning_rate": 5.66859089618624e-06, |
|
"loss": 0.0015, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.9289781240635301, |
|
"grad_norm": 0.08009694516658783, |
|
"learning_rate": 5.6370461499637244e-06, |
|
"loss": 0.0016, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.9349715313155529, |
|
"grad_norm": 0.03613545373082161, |
|
"learning_rate": 5.605501403741207e-06, |
|
"loss": 0.0017, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.9409649385675757, |
|
"grad_norm": 0.06710252165794373, |
|
"learning_rate": 5.573956657518691e-06, |
|
"loss": 0.0018, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.9469583458195985, |
|
"grad_norm": 0.09847810864448547, |
|
"learning_rate": 5.542411911296174e-06, |
|
"loss": 0.0014, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.9529517530716212, |
|
"grad_norm": 0.011624569073319435, |
|
"learning_rate": 5.510867165073658e-06, |
|
"loss": 0.0016, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.958945160323644, |
|
"grad_norm": 0.06741365045309067, |
|
"learning_rate": 5.479322418851141e-06, |
|
"loss": 0.0015, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.9649385675756668, |
|
"grad_norm": 0.021546615287661552, |
|
"learning_rate": 5.447777672628625e-06, |
|
"loss": 0.0017, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.9709319748276896, |
|
"grad_norm": 0.1303360015153885, |
|
"learning_rate": 5.4162329264061075e-06, |
|
"loss": 0.0018, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.9769253820797124, |
|
"grad_norm": 0.10070718824863434, |
|
"learning_rate": 5.38468818018359e-06, |
|
"loss": 0.0018, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.982918789331735, |
|
"grad_norm": 0.08305861055850983, |
|
"learning_rate": 5.353143433961074e-06, |
|
"loss": 0.0016, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.9889121965837578, |
|
"grad_norm": 0.007656518369913101, |
|
"learning_rate": 5.321598687738557e-06, |
|
"loss": 0.0017, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.9949056038357806, |
|
"grad_norm": 0.0743492990732193, |
|
"learning_rate": 5.290053941516041e-06, |
|
"loss": 0.0015, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8271744263468347, |
|
"eval_f1": 0.7498195656860883, |
|
"eval_loss": 0.001594877801835537, |
|
"eval_precision": 0.6861185445920746, |
|
"eval_recall": 0.8271744263468347, |
|
"eval_runtime": 1686.0917, |
|
"eval_samples_per_second": 8.796, |
|
"eval_steps_per_second": 1.1, |
|
"step": 16685 |
|
}, |
|
{ |
|
"epoch": 1.0008990110878033, |
|
"grad_norm": 0.05216585099697113, |
|
"learning_rate": 5.258509195293524e-06, |
|
"loss": 0.0015, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.0068924183398262, |
|
"grad_norm": 0.12606635689735413, |
|
"learning_rate": 5.226964449071008e-06, |
|
"loss": 0.0011, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.012885825591849, |
|
"grad_norm": 0.0004606186121236533, |
|
"learning_rate": 5.195419702848491e-06, |
|
"loss": 0.001, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.0188792328438718, |
|
"grad_norm": 0.000365409447113052, |
|
"learning_rate": 5.1638749566259735e-06, |
|
"loss": 0.001, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.0248726400958945, |
|
"grad_norm": 0.031485725194215775, |
|
"learning_rate": 5.132330210403458e-06, |
|
"loss": 0.0012, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.0308660473479172, |
|
"grad_norm": 0.0031660550739616156, |
|
"learning_rate": 5.100785464180941e-06, |
|
"loss": 0.0011, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.0368594545999401, |
|
"grad_norm": 0.04788443446159363, |
|
"learning_rate": 5.0692407179584244e-06, |
|
"loss": 0.001, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.0428528618519628, |
|
"grad_norm": 0.07966958731412888, |
|
"learning_rate": 5.037695971735907e-06, |
|
"loss": 0.001, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.0488462691039857, |
|
"grad_norm": 0.2937103807926178, |
|
"learning_rate": 5.006151225513392e-06, |
|
"loss": 0.0007, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.0548396763560084, |
|
"grad_norm": 0.0027551730163395405, |
|
"learning_rate": 4.974606479290875e-06, |
|
"loss": 0.0007, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.060833083608031, |
|
"grad_norm": 0.08430271595716476, |
|
"learning_rate": 4.943061733068357e-06, |
|
"loss": 0.0008, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.066826490860054, |
|
"grad_norm": 0.24536843597888947, |
|
"learning_rate": 4.911516986845841e-06, |
|
"loss": 0.0006, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.0728198981120767, |
|
"grad_norm": 0.040876179933547974, |
|
"learning_rate": 4.879972240623325e-06, |
|
"loss": 0.0009, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.0788133053640996, |
|
"grad_norm": 0.0515579879283905, |
|
"learning_rate": 4.848427494400808e-06, |
|
"loss": 0.0007, |
|
"step": 18000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 33370, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"total_flos": 7.597573697465206e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|