|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.983792544570502, |
|
"eval_steps": 500, |
|
"global_step": 3080, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03241491085899514, |
|
"grad_norm": 1.712692141532898, |
|
"learning_rate": 0.00019999479806942977, |
|
"loss": 2.6861, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06482982171799027, |
|
"grad_norm": 2.658978223800659, |
|
"learning_rate": 0.00019997919281892067, |
|
"loss": 2.2023, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09724473257698542, |
|
"grad_norm": 1.8215035200119019, |
|
"learning_rate": 0.0001999531858720213, |
|
"loss": 1.8589, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12965964343598055, |
|
"grad_norm": 1.7040213346481323, |
|
"learning_rate": 0.0001999167799344583, |
|
"loss": 1.7118, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1620745542949757, |
|
"grad_norm": 1.647182583808899, |
|
"learning_rate": 0.00019986997879385487, |
|
"loss": 1.5532, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19448946515397084, |
|
"grad_norm": 1.7023558616638184, |
|
"learning_rate": 0.0001998127873193367, |
|
"loss": 1.4578, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22690437601296595, |
|
"grad_norm": 1.5219024419784546, |
|
"learning_rate": 0.00019974521146102537, |
|
"loss": 1.4007, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2593192868719611, |
|
"grad_norm": 1.5780329704284668, |
|
"learning_rate": 0.00019966725824941932, |
|
"loss": 1.409, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2917341977309562, |
|
"grad_norm": 1.7620179653167725, |
|
"learning_rate": 0.00019957893579466252, |
|
"loss": 1.3303, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3241491085899514, |
|
"grad_norm": 2.03952693939209, |
|
"learning_rate": 0.00019948025328570042, |
|
"loss": 1.2894, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3565640194489465, |
|
"grad_norm": 1.8262405395507812, |
|
"learning_rate": 0.00019937122098932428, |
|
"loss": 1.2291, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3889789303079417, |
|
"grad_norm": 1.7981160879135132, |
|
"learning_rate": 0.00019925185024910277, |
|
"loss": 1.2562, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4213938411669368, |
|
"grad_norm": 1.7449265718460083, |
|
"learning_rate": 0.000199122153484202, |
|
"loss": 1.2019, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4538087520259319, |
|
"grad_norm": 1.8147671222686768, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 1.1528, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4862236628849271, |
|
"grad_norm": 2.1661763191223145, |
|
"learning_rate": 0.00019883183692714936, |
|
"loss": 1.148, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5186385737439222, |
|
"grad_norm": 2.311131477355957, |
|
"learning_rate": 0.0001986712473391289, |
|
"loss": 1.1565, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5510534846029174, |
|
"grad_norm": 1.8314425945281982, |
|
"learning_rate": 0.00019850039213154973, |
|
"loss": 1.1258, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5834683954619124, |
|
"grad_norm": 2.260610580444336, |
|
"learning_rate": 0.0001983192890799503, |
|
"loss": 1.0491, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6158833063209076, |
|
"grad_norm": 2.0741446018218994, |
|
"learning_rate": 0.00019812795702604073, |
|
"loss": 1.0044, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6482982171799028, |
|
"grad_norm": 2.3104991912841797, |
|
"learning_rate": 0.00019792641587574212, |
|
"loss": 1.0391, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6807131280388979, |
|
"grad_norm": 2.024132251739502, |
|
"learning_rate": 0.00019771468659711595, |
|
"loss": 1.0802, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.713128038897893, |
|
"grad_norm": 2.667066812515259, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 1.0215, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7455429497568882, |
|
"grad_norm": 2.127154588699341, |
|
"learning_rate": 0.00019726075282462845, |
|
"loss": 1.0016, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7779578606158833, |
|
"grad_norm": 2.398350238800049, |
|
"learning_rate": 0.00019701859555740648, |
|
"loss": 1.0767, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8103727714748784, |
|
"grad_norm": 2.4408252239227295, |
|
"learning_rate": 0.00019676634461022224, |
|
"loss": 0.969, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8427876823338736, |
|
"grad_norm": 2.225031614303589, |
|
"learning_rate": 0.000196504026226914, |
|
"loss": 1.0099, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8752025931928687, |
|
"grad_norm": 2.4684712886810303, |
|
"learning_rate": 0.00019623166769872213, |
|
"loss": 0.9523, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9076175040518638, |
|
"grad_norm": 2.4853105545043945, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 0.8981, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.940032414910859, |
|
"grad_norm": 2.3052029609680176, |
|
"learning_rate": 0.00019565694459251458, |
|
"loss": 0.9157, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9724473257698542, |
|
"grad_norm": 2.227735996246338, |
|
"learning_rate": 0.00019535463980789277, |
|
"loss": 0.9085, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0048622366288493, |
|
"grad_norm": 2.3589160442352295, |
|
"learning_rate": 0.00019504241445895436, |
|
"loss": 0.9731, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0372771474878444, |
|
"grad_norm": 2.2362122535705566, |
|
"learning_rate": 0.000194720301029191, |
|
"loss": 0.8788, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0696920583468394, |
|
"grad_norm": 2.2762033939361572, |
|
"learning_rate": 0.00019438833303083678, |
|
"loss": 0.8274, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1021069692058347, |
|
"grad_norm": 2.9542107582092285, |
|
"learning_rate": 0.00019404654500138117, |
|
"loss": 0.8729, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1345218800648298, |
|
"grad_norm": 2.541868209838867, |
|
"learning_rate": 0.0001936949724999762, |
|
"loss": 0.834, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1669367909238249, |
|
"grad_norm": 2.314488649368286, |
|
"learning_rate": 0.0001933336521037367, |
|
"loss": 0.8432, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1993517017828201, |
|
"grad_norm": 2.504655361175537, |
|
"learning_rate": 0.00019296262140393498, |
|
"loss": 0.8251, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2317666126418152, |
|
"grad_norm": 2.568459987640381, |
|
"learning_rate": 0.0001925819190020898, |
|
"loss": 0.8152, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2641815235008105, |
|
"grad_norm": 2.7527027130126953, |
|
"learning_rate": 0.0001921915845059505, |
|
"loss": 0.8599, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2965964343598055, |
|
"grad_norm": 3.213120222091675, |
|
"learning_rate": 0.00019179165852537596, |
|
"loss": 0.7986, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3290113452188006, |
|
"grad_norm": 2.718676805496216, |
|
"learning_rate": 0.00019138218266810986, |
|
"loss": 0.7604, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3614262560777957, |
|
"grad_norm": 2.3581483364105225, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 0.8096, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.393841166936791, |
|
"grad_norm": 2.966271162033081, |
|
"learning_rate": 0.0001905347527178252, |
|
"loss": 0.873, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.426256077795786, |
|
"grad_norm": 2.8818719387054443, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.78, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.4586709886547813, |
|
"grad_norm": 3.3322315216064453, |
|
"learning_rate": 0.00018964964730766508, |
|
"loss": 0.8353, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4910858995137763, |
|
"grad_norm": 2.844998359680176, |
|
"learning_rate": 0.0001891930808002694, |
|
"loss": 0.7845, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5235008103727714, |
|
"grad_norm": 2.704843044281006, |
|
"learning_rate": 0.00018872723476860034, |
|
"loss": 0.7735, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5559157212317665, |
|
"grad_norm": 2.935382604598999, |
|
"learning_rate": 0.00018825215767863214, |
|
"loss": 0.8104, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5883306320907618, |
|
"grad_norm": 4.604203701019287, |
|
"learning_rate": 0.00018776789895672558, |
|
"loss": 0.7568, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.620745542949757, |
|
"grad_norm": 2.889993190765381, |
|
"learning_rate": 0.00018727450898448563, |
|
"loss": 0.7854, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.653160453808752, |
|
"grad_norm": 2.4442269802093506, |
|
"learning_rate": 0.00018677203909351988, |
|
"loss": 0.7134, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6855753646677472, |
|
"grad_norm": 3.6301393508911133, |
|
"learning_rate": 0.00018626054156009806, |
|
"loss": 0.7339, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7179902755267422, |
|
"grad_norm": 2.4832422733306885, |
|
"learning_rate": 0.00018574006959971333, |
|
"loss": 0.7377, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7504051863857373, |
|
"grad_norm": 2.5708158016204834, |
|
"learning_rate": 0.00018521067736154568, |
|
"loss": 0.6838, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7828200972447326, |
|
"grad_norm": 2.2001380920410156, |
|
"learning_rate": 0.00018467241992282843, |
|
"loss": 0.7134, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8152350081037278, |
|
"grad_norm": 2.669135332107544, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 0.7912, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.847649918962723, |
|
"grad_norm": 2.5101096630096436, |
|
"learning_rate": 0.0001835695343584683, |
|
"loss": 0.7279, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.880064829821718, |
|
"grad_norm": 1.9649478197097778, |
|
"learning_rate": 0.00018300502097550806, |
|
"loss": 0.6177, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.912479740680713, |
|
"grad_norm": 2.6189944744110107, |
|
"learning_rate": 0.00018243187186542593, |
|
"loss": 0.6625, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9448946515397083, |
|
"grad_norm": 2.7569029331207275, |
|
"learning_rate": 0.00018185014665785936, |
|
"loss": 0.6726, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9773095623987034, |
|
"grad_norm": 2.448319673538208, |
|
"learning_rate": 0.00018125990587469123, |
|
"loss": 0.7116, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.0097244732576987, |
|
"grad_norm": 2.4071240425109863, |
|
"learning_rate": 0.000180661210923753, |
|
"loss": 0.7076, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0421393841166937, |
|
"grad_norm": 2.3127925395965576, |
|
"learning_rate": 0.00018005412409243606, |
|
"loss": 0.658, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.0745542949756888, |
|
"grad_norm": 2.6991357803344727, |
|
"learning_rate": 0.00017943870854121124, |
|
"loss": 0.6366, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.106969205834684, |
|
"grad_norm": 2.266465902328491, |
|
"learning_rate": 0.00017881502829705808, |
|
"loss": 0.6914, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.139384116693679, |
|
"grad_norm": 2.4977452754974365, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 0.6819, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.1717990275526744, |
|
"grad_norm": 2.3938634395599365, |
|
"learning_rate": 0.00017754313413036906, |
|
"loss": 0.6578, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.2042139384116695, |
|
"grad_norm": 2.4295880794525146, |
|
"learning_rate": 0.0001768950525339362, |
|
"loss": 0.6468, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.2366288492706645, |
|
"grad_norm": 2.621041774749756, |
|
"learning_rate": 0.00017623897088301385, |
|
"loss": 0.684, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.2690437601296596, |
|
"grad_norm": 1.9489623308181763, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 0.5617, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3014586709886546, |
|
"grad_norm": 2.1973514556884766, |
|
"learning_rate": 0.00017490308127420928, |
|
"loss": 0.6397, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.3338735818476497, |
|
"grad_norm": 3.3451037406921387, |
|
"learning_rate": 0.000174223412300427, |
|
"loss": 0.6761, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.366288492706645, |
|
"grad_norm": 3.31162691116333, |
|
"learning_rate": 0.00017353602122589527, |
|
"loss": 0.7305, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.3987034035656403, |
|
"grad_norm": 2.1021759510040283, |
|
"learning_rate": 0.00017284097956582692, |
|
"loss": 0.699, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.4311183144246353, |
|
"grad_norm": 2.290239095687866, |
|
"learning_rate": 0.0001721383596313912, |
|
"loss": 0.7112, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.4635332252836304, |
|
"grad_norm": 2.1187663078308105, |
|
"learning_rate": 0.00017142823452219038, |
|
"loss": 0.6473, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.4959481361426255, |
|
"grad_norm": 2.947108745574951, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.6325, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.528363047001621, |
|
"grad_norm": 2.2677927017211914, |
|
"learning_rate": 0.00016998576507435618, |
|
"loss": 0.6988, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.560777957860616, |
|
"grad_norm": 3.2524213790893555, |
|
"learning_rate": 0.00016925357080824118, |
|
"loss": 0.6584, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.593192868719611, |
|
"grad_norm": 2.626284122467041, |
|
"learning_rate": 0.00016851417149678444, |
|
"loss": 0.5591, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.625607779578606, |
|
"grad_norm": 2.319129705429077, |
|
"learning_rate": 0.0001677676440660636, |
|
"loss": 0.5546, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.658022690437601, |
|
"grad_norm": 2.2486627101898193, |
|
"learning_rate": 0.00016701406618375596, |
|
"loss": 0.6935, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.6904376012965967, |
|
"grad_norm": 2.283766508102417, |
|
"learning_rate": 0.00016625351625105796, |
|
"loss": 0.673, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.7228525121555913, |
|
"grad_norm": 2.7028627395629883, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 0.6226, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.755267423014587, |
|
"grad_norm": 2.0403945446014404, |
|
"learning_rate": 0.00016471181745785672, |
|
"loss": 0.6318, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.787682333873582, |
|
"grad_norm": 2.255915641784668, |
|
"learning_rate": 0.00016393082899355516, |
|
"loss": 0.5682, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.820097244732577, |
|
"grad_norm": 2.118745803833008, |
|
"learning_rate": 0.0001631431892545791, |
|
"loss": 0.6582, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.852512155591572, |
|
"grad_norm": 2.0190374851226807, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.5768, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.884927066450567, |
|
"grad_norm": 2.165712356567383, |
|
"learning_rate": 0.00016154828441584655, |
|
"loss": 0.5919, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.9173419773095626, |
|
"grad_norm": 2.0873501300811768, |
|
"learning_rate": 0.00016074118524777477, |
|
"loss": 0.6416, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.9497568881685576, |
|
"grad_norm": 2.1629409790039062, |
|
"learning_rate": 0.0001599277666511347, |
|
"loss": 0.6438, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.9821717990275527, |
|
"grad_norm": 2.325366973876953, |
|
"learning_rate": 0.00015910811325286768, |
|
"loss": 0.6756, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.0145867098865478, |
|
"grad_norm": 2.101053237915039, |
|
"learning_rate": 0.00015828231032857503, |
|
"loss": 0.5999, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.047001620745543, |
|
"grad_norm": 1.8169053792953491, |
|
"learning_rate": 0.00015745044379364634, |
|
"loss": 0.6478, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.079416531604538, |
|
"grad_norm": 2.284700632095337, |
|
"learning_rate": 0.00015661260019432077, |
|
"loss": 0.6163, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.1118314424635334, |
|
"grad_norm": 3.3290014266967773, |
|
"learning_rate": 0.00015576886669868296, |
|
"loss": 0.5751, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.1442463533225284, |
|
"grad_norm": 2.246464252471924, |
|
"learning_rate": 0.0001549193310875942, |
|
"loss": 0.5481, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.1766612641815235, |
|
"grad_norm": 2.6920738220214844, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 0.587, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.2090761750405186, |
|
"grad_norm": 2.41670560836792, |
|
"learning_rate": 0.00015320320765153367, |
|
"loss": 0.5998, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.2414910858995136, |
|
"grad_norm": 1.9720252752304077, |
|
"learning_rate": 0.00015233679836966122, |
|
"loss": 0.5327, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.2739059967585087, |
|
"grad_norm": 1.99226975440979, |
|
"learning_rate": 0.000151464944039961, |
|
"loss": 0.5932, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.306320907617504, |
|
"grad_norm": 2.752708911895752, |
|
"learning_rate": 0.00015058773536894685, |
|
"loss": 0.5989, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.3387358184764993, |
|
"grad_norm": 1.9078640937805176, |
|
"learning_rate": 0.00014970526362019079, |
|
"loss": 0.5447, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.3711507293354943, |
|
"grad_norm": 2.3850903511047363, |
|
"learning_rate": 0.00014881762060482814, |
|
"loss": 0.5739, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.4035656401944894, |
|
"grad_norm": 1.8733508586883545, |
|
"learning_rate": 0.0001479248986720057, |
|
"loss": 0.5407, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.4359805510534844, |
|
"grad_norm": 1.924999713897705, |
|
"learning_rate": 0.0001470271906992737, |
|
"loss": 0.576, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.46839546191248, |
|
"grad_norm": 3.5489633083343506, |
|
"learning_rate": 0.00014612459008292306, |
|
"loss": 0.5886, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.500810372771475, |
|
"grad_norm": 2.7815005779266357, |
|
"learning_rate": 0.00014521719072826858, |
|
"loss": 0.5426, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.53322528363047, |
|
"grad_norm": 1.678726315498352, |
|
"learning_rate": 0.00014430508703987914, |
|
"loss": 0.5983, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.565640194489465, |
|
"grad_norm": 2.03645658493042, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 0.6045, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.59805510534846, |
|
"grad_norm": 2.302565813064575, |
|
"learning_rate": 0.00014246714671745965, |
|
"loss": 0.5739, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.6304700162074557, |
|
"grad_norm": 1.8924729824066162, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 0.5296, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.6628849270664503, |
|
"grad_norm": 1.8624053001403809, |
|
"learning_rate": 0.00014061153396280674, |
|
"loss": 0.5886, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.695299837925446, |
|
"grad_norm": 2.516749143600464, |
|
"learning_rate": 0.00013967734145782425, |
|
"loss": 0.6285, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.727714748784441, |
|
"grad_norm": 2.6221561431884766, |
|
"learning_rate": 0.0001387390209773323, |
|
"loss": 0.6025, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.760129659643436, |
|
"grad_norm": 1.7727645635604858, |
|
"learning_rate": 0.00013779667014289065, |
|
"loss": 0.604, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.792544570502431, |
|
"grad_norm": 1.945691704750061, |
|
"learning_rate": 0.00013685038699537164, |
|
"loss": 0.6304, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.824959481361426, |
|
"grad_norm": 2.1147408485412598, |
|
"learning_rate": 0.00013590026998475986, |
|
"loss": 0.5754, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.8573743922204216, |
|
"grad_norm": 2.109344959259033, |
|
"learning_rate": 0.00013494641795990986, |
|
"loss": 0.6092, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.8897893030794166, |
|
"grad_norm": 1.9908583164215088, |
|
"learning_rate": 0.00013398893015826167, |
|
"loss": 0.5375, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.9222042139384117, |
|
"grad_norm": 2.164598226547241, |
|
"learning_rate": 0.00013302790619551674, |
|
"loss": 0.5625, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.9546191247974067, |
|
"grad_norm": 2.0762100219726562, |
|
"learning_rate": 0.00013206344605527355, |
|
"loss": 0.6212, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.987034035656402, |
|
"grad_norm": 2.2591452598571777, |
|
"learning_rate": 0.00013109565007862596, |
|
"loss": 0.5491, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.019448946515397, |
|
"grad_norm": 2.087096691131592, |
|
"learning_rate": 0.00013012461895372344, |
|
"loss": 0.5869, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.051863857374392, |
|
"grad_norm": 1.8744010925292969, |
|
"learning_rate": 0.00012915045370529585, |
|
"loss": 0.5585, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.084278768233387, |
|
"grad_norm": 2.3939051628112793, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 0.5298, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.116693679092383, |
|
"grad_norm": 2.6422526836395264, |
|
"learning_rate": 0.00012719312655658994, |
|
"loss": 0.5765, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.1491085899513775, |
|
"grad_norm": 2.3070068359375, |
|
"learning_rate": 0.00012621016829391022, |
|
"loss": 0.5601, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.181523500810373, |
|
"grad_norm": 2.8217501640319824, |
|
"learning_rate": 0.0001252244831617165, |
|
"loss": 0.555, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.213938411669368, |
|
"grad_norm": 2.306428909301758, |
|
"learning_rate": 0.00012423617370932127, |
|
"loss": 0.546, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.246353322528363, |
|
"grad_norm": 2.691399097442627, |
|
"learning_rate": 0.00012324534275906748, |
|
"loss": 0.5794, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.278768233387358, |
|
"grad_norm": 2.576984167098999, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.4677, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.311183144246353, |
|
"grad_norm": 2.3105177879333496, |
|
"learning_rate": 0.00012125652895529766, |
|
"loss": 0.4888, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.343598055105349, |
|
"grad_norm": 2.361992359161377, |
|
"learning_rate": 0.0001202587530152081, |
|
"loss": 0.5242, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.376012965964343, |
|
"grad_norm": 2.0497052669525146, |
|
"learning_rate": 0.00011925886938258605, |
|
"loss": 0.5341, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.408427876823339, |
|
"grad_norm": 2.1436045169830322, |
|
"learning_rate": 0.00011825698208393619, |
|
"loss": 0.565, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.4408427876823335, |
|
"grad_norm": 2.7684664726257324, |
|
"learning_rate": 0.00011725319535422188, |
|
"loss": 0.5257, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.473257698541329, |
|
"grad_norm": 2.2785606384277344, |
|
"learning_rate": 0.00011624761362602061, |
|
"loss": 0.5249, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.5056726094003245, |
|
"grad_norm": 4.94090461730957, |
|
"learning_rate": 0.00011524034151865903, |
|
"loss": 0.5505, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.538087520259319, |
|
"grad_norm": 1.984014868736267, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 0.5028, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.570502431118315, |
|
"grad_norm": 2.671797275543213, |
|
"learning_rate": 0.00011322114551218239, |
|
"loss": 0.5099, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.602917341977309, |
|
"grad_norm": 2.390010356903076, |
|
"learning_rate": 0.000112209431687416, |
|
"loss": 0.5825, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.635332252836305, |
|
"grad_norm": 2.2715370655059814, |
|
"learning_rate": 0.00011119644761033078, |
|
"loss": 0.4849, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.667747163695299, |
|
"grad_norm": 1.9864623546600342, |
|
"learning_rate": 0.00011018229867038356, |
|
"loss": 0.5466, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.700162074554295, |
|
"grad_norm": 3.156597137451172, |
|
"learning_rate": 0.00010916709037822173, |
|
"loss": 0.5345, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.73257698541329, |
|
"grad_norm": 1.721481204032898, |
|
"learning_rate": 0.00010815092835470633, |
|
"loss": 0.5449, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.764991896272285, |
|
"grad_norm": 1.9270528554916382, |
|
"learning_rate": 0.00010713391831992323, |
|
"loss": 0.5303, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.7974068071312805, |
|
"grad_norm": 2.3868727684020996, |
|
"learning_rate": 0.00010611616608218429, |
|
"loss": 0.5057, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.829821717990275, |
|
"grad_norm": 1.9875102043151855, |
|
"learning_rate": 0.00010509777752701899, |
|
"loss": 0.5823, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.862236628849271, |
|
"grad_norm": 3.3236424922943115, |
|
"learning_rate": 0.00010407885860615859, |
|
"loss": 0.5184, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.894651539708266, |
|
"grad_norm": 2.4154036045074463, |
|
"learning_rate": 0.00010305951532651261, |
|
"loss": 0.5869, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.927066450567261, |
|
"grad_norm": 2.291398286819458, |
|
"learning_rate": 0.00010203985373914056, |
|
"loss": 0.4688, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.959481361426256, |
|
"grad_norm": 1.7252575159072876, |
|
"learning_rate": 0.00010101997992821797, |
|
"loss": 0.5315, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.991896272285251, |
|
"grad_norm": 2.263150453567505, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5146, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.024311183144246, |
|
"grad_norm": 2.434597969055176, |
|
"learning_rate": 9.898002007178204e-05, |
|
"loss": 0.5458, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.056726094003242, |
|
"grad_norm": 2.3639023303985596, |
|
"learning_rate": 9.79601462608595e-05, |
|
"loss": 0.4089, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.0891410048622365, |
|
"grad_norm": 1.8895375728607178, |
|
"learning_rate": 9.69404846734874e-05, |
|
"loss": 0.5381, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.121555915721232, |
|
"grad_norm": 2.34358549118042, |
|
"learning_rate": 9.592114139384145e-05, |
|
"loss": 0.5385, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.153970826580227, |
|
"grad_norm": 2.613858699798584, |
|
"learning_rate": 9.490222247298099e-05, |
|
"loss": 0.5411, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.186385737439222, |
|
"grad_norm": 2.525398015975952, |
|
"learning_rate": 9.388383391781575e-05, |
|
"loss": 0.5645, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.218800648298217, |
|
"grad_norm": 2.6018924713134766, |
|
"learning_rate": 9.286608168007678e-05, |
|
"loss": 0.4934, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.251215559157212, |
|
"grad_norm": 2.232938051223755, |
|
"learning_rate": 9.184907164529368e-05, |
|
"loss": 0.4636, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.283630470016208, |
|
"grad_norm": 1.9651650190353394, |
|
"learning_rate": 9.083290962177828e-05, |
|
"loss": 0.5732, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.316045380875202, |
|
"grad_norm": 2.057870626449585, |
|
"learning_rate": 8.981770132961649e-05, |
|
"loss": 0.5266, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.348460291734198, |
|
"grad_norm": 2.1037161350250244, |
|
"learning_rate": 8.880355238966923e-05, |
|
"loss": 0.4874, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.3808752025931925, |
|
"grad_norm": 2.544857978820801, |
|
"learning_rate": 8.779056831258402e-05, |
|
"loss": 0.4599, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.413290113452188, |
|
"grad_norm": 1.9760535955429077, |
|
"learning_rate": 8.67788544878176e-05, |
|
"loss": 0.4675, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.4457050243111835, |
|
"grad_norm": 1.8703639507293701, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 0.4413, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.478119935170178, |
|
"grad_norm": 2.0244393348693848, |
|
"learning_rate": 8.475965848134099e-05, |
|
"loss": 0.4954, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.510534846029174, |
|
"grad_norm": 1.9399715662002563, |
|
"learning_rate": 8.375238637397942e-05, |
|
"loss": 0.4709, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.542949756888168, |
|
"grad_norm": 2.892331600189209, |
|
"learning_rate": 8.274680464577816e-05, |
|
"loss": 0.5021, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.575364667747164, |
|
"grad_norm": 1.9682655334472656, |
|
"learning_rate": 8.174301791606385e-05, |
|
"loss": 0.5648, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.607779578606159, |
|
"grad_norm": 2.52781343460083, |
|
"learning_rate": 8.074113061741397e-05, |
|
"loss": 0.5293, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.640194489465154, |
|
"grad_norm": 2.3522939682006836, |
|
"learning_rate": 7.974124698479192e-05, |
|
"loss": 0.5401, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.672609400324149, |
|
"grad_norm": 2.7281885147094727, |
|
"learning_rate": 7.874347104470234e-05, |
|
"loss": 0.4837, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.705024311183144, |
|
"grad_norm": 2.070852279663086, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.5035, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 5.7374392220421395, |
|
"grad_norm": 2.5109684467315674, |
|
"learning_rate": 7.675465724093254e-05, |
|
"loss": 0.4815, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 5.769854132901134, |
|
"grad_norm": 2.1801116466522217, |
|
"learning_rate": 7.576382629067877e-05, |
|
"loss": 0.497, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 5.80226904376013, |
|
"grad_norm": 2.372771978378296, |
|
"learning_rate": 7.47755168382835e-05, |
|
"loss": 0.499, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 5.834683954619125, |
|
"grad_norm": 2.1925764083862305, |
|
"learning_rate": 7.378983170608982e-05, |
|
"loss": 0.4702, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.86709886547812, |
|
"grad_norm": 2.0080742835998535, |
|
"learning_rate": 7.280687344341007e-05, |
|
"loss": 0.4886, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 5.899513776337115, |
|
"grad_norm": 2.1518988609313965, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 0.5052, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 5.93192868719611, |
|
"grad_norm": 2.1942789554595947, |
|
"learning_rate": 7.084954629470417e-05, |
|
"loss": 0.4922, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.964343598055105, |
|
"grad_norm": 2.380929470062256, |
|
"learning_rate": 6.98753810462766e-05, |
|
"loss": 0.4879, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.9967585089141, |
|
"grad_norm": 2.1748461723327637, |
|
"learning_rate": 6.890434992137407e-05, |
|
"loss": 0.4609, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.0291734197730955, |
|
"grad_norm": 1.9654253721237183, |
|
"learning_rate": 6.793655394472644e-05, |
|
"loss": 0.4869, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.061588330632091, |
|
"grad_norm": 2.2075066566467285, |
|
"learning_rate": 6.697209380448333e-05, |
|
"loss": 0.467, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 6.094003241491086, |
|
"grad_norm": 1.5888482332229614, |
|
"learning_rate": 6.601106984173835e-05, |
|
"loss": 0.4614, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.126418152350081, |
|
"grad_norm": 2.0989603996276855, |
|
"learning_rate": 6.505358204009017e-05, |
|
"loss": 0.4693, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 6.158833063209076, |
|
"grad_norm": 2.3755545616149902, |
|
"learning_rate": 6.409973001524012e-05, |
|
"loss": 0.4883, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.191247974068071, |
|
"grad_norm": 2.5981929302215576, |
|
"learning_rate": 6.314961300462841e-05, |
|
"loss": 0.4616, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 6.223662884927067, |
|
"grad_norm": 2.172053575515747, |
|
"learning_rate": 6.220332985710936e-05, |
|
"loss": 0.471, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.256077795786061, |
|
"grad_norm": 2.0949530601501465, |
|
"learning_rate": 6.126097902266772e-05, |
|
"loss": 0.405, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 6.288492706645057, |
|
"grad_norm": 2.166477680206299, |
|
"learning_rate": 6.0322658542175736e-05, |
|
"loss": 0.4973, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.3209076175040515, |
|
"grad_norm": 1.9600187540054321, |
|
"learning_rate": 5.938846603719332e-05, |
|
"loss": 0.4483, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.353322528363047, |
|
"grad_norm": 2.055723190307617, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 0.4557, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.3857374392220425, |
|
"grad_norm": 1.9242348670959473, |
|
"learning_rate": 5.7532853282540387e-05, |
|
"loss": 0.5454, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 6.418152350081037, |
|
"grad_norm": 2.1381163597106934, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.4838, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.450567260940033, |
|
"grad_norm": 2.3222544193267822, |
|
"learning_rate": 5.569491296012095e-05, |
|
"loss": 0.5139, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.482982171799027, |
|
"grad_norm": 2.2901763916015625, |
|
"learning_rate": 5.478280927173145e-05, |
|
"loss": 0.4359, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.515397082658023, |
|
"grad_norm": 2.2004926204681396, |
|
"learning_rate": 5.387540991707697e-05, |
|
"loss": 0.5119, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.547811993517017, |
|
"grad_norm": 2.1431353092193604, |
|
"learning_rate": 5.297280930072632e-05, |
|
"loss": 0.4609, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.580226904376013, |
|
"grad_norm": 2.191962718963623, |
|
"learning_rate": 5.207510132799436e-05, |
|
"loss": 0.4566, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.612641815235008, |
|
"grad_norm": 1.8917332887649536, |
|
"learning_rate": 5.11823793951719e-05, |
|
"loss": 0.4692, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.645056726094003, |
|
"grad_norm": 2.5211122035980225, |
|
"learning_rate": 5.029473637980926e-05, |
|
"loss": 0.4943, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 6.6774716369529985, |
|
"grad_norm": 2.163224935531616, |
|
"learning_rate": 4.9412264631053216e-05, |
|
"loss": 0.4562, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 6.709886547811994, |
|
"grad_norm": 1.8480360507965088, |
|
"learning_rate": 4.853505596003905e-05, |
|
"loss": 0.4499, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 6.742301458670989, |
|
"grad_norm": 2.506883382797241, |
|
"learning_rate": 4.7663201630338816e-05, |
|
"loss": 0.4378, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 6.774716369529984, |
|
"grad_norm": 3.5621259212493896, |
|
"learning_rate": 4.6796792348466356e-05, |
|
"loss": 0.4866, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 6.807131280388979, |
|
"grad_norm": 2.3629701137542725, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 0.4785, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.839546191247974, |
|
"grad_norm": 2.753488779067993, |
|
"learning_rate": 4.5080668912405855e-05, |
|
"loss": 0.5154, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 6.871961102106969, |
|
"grad_norm": 2.4580469131469727, |
|
"learning_rate": 4.423113330131707e-05, |
|
"loss": 0.4569, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 6.904376012965964, |
|
"grad_norm": 2.252030849456787, |
|
"learning_rate": 4.3387399805679255e-05, |
|
"loss": 0.5148, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 6.93679092382496, |
|
"grad_norm": 2.605240821838379, |
|
"learning_rate": 4.25495562063537e-05, |
|
"loss": 0.4743, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 6.9692058346839545, |
|
"grad_norm": 2.0974061489105225, |
|
"learning_rate": 4.1717689671425e-05, |
|
"loss": 0.4211, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 7.00162074554295, |
|
"grad_norm": 1.8956321477890015, |
|
"learning_rate": 4.089188674713236e-05, |
|
"loss": 0.4759, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 7.034035656401945, |
|
"grad_norm": 2.391812801361084, |
|
"learning_rate": 4.007223334886531e-05, |
|
"loss": 0.4802, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 7.06645056726094, |
|
"grad_norm": 2.2890560626983643, |
|
"learning_rate": 3.9258814752225284e-05, |
|
"loss": 0.4934, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 7.098865478119935, |
|
"grad_norm": 1.9036931991577148, |
|
"learning_rate": 3.845171558415348e-05, |
|
"loss": 0.4442, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 7.13128038897893, |
|
"grad_norm": 2.3117735385894775, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.4885, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.163695299837926, |
|
"grad_norm": 2.3671298027038574, |
|
"learning_rate": 3.6856810745420886e-05, |
|
"loss": 0.4552, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 7.19611021069692, |
|
"grad_norm": 2.2790403366088867, |
|
"learning_rate": 3.606917100644488e-05, |
|
"loss": 0.4461, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 7.228525121555916, |
|
"grad_norm": 3.0118699073791504, |
|
"learning_rate": 3.528818254214329e-05, |
|
"loss": 0.4328, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 7.2609400324149105, |
|
"grad_norm": 2.2282354831695557, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 0.4658, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.293354943273906, |
|
"grad_norm": 2.078871965408325, |
|
"learning_rate": 3.374648374894204e-05, |
|
"loss": 0.3856, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.3257698541329015, |
|
"grad_norm": 2.222506523132324, |
|
"learning_rate": 3.298593381624406e-05, |
|
"loss": 0.4336, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.358184764991896, |
|
"grad_norm": 2.130702257156372, |
|
"learning_rate": 3.223235593393642e-05, |
|
"loss": 0.4888, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 7.390599675850892, |
|
"grad_norm": 1.9949439764022827, |
|
"learning_rate": 3.1485828503215585e-05, |
|
"loss": 0.4748, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.423014586709886, |
|
"grad_norm": 2.212193727493286, |
|
"learning_rate": 3.074642919175883e-05, |
|
"loss": 0.4602, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 7.455429497568882, |
|
"grad_norm": 2.4536526203155518, |
|
"learning_rate": 3.0014234925643837e-05, |
|
"loss": 0.4509, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.487844408427877, |
|
"grad_norm": 1.9900058507919312, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.3902, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.520259319286872, |
|
"grad_norm": 2.0711722373962402, |
|
"learning_rate": 2.8571765477809643e-05, |
|
"loss": 0.4508, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.552674230145867, |
|
"grad_norm": 2.640794515609741, |
|
"learning_rate": 2.7861640368608844e-05, |
|
"loss": 0.4944, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.585089141004862, |
|
"grad_norm": 2.4325428009033203, |
|
"learning_rate": 2.71590204341731e-05, |
|
"loss": 0.4438, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.6175040518638575, |
|
"grad_norm": 2.28466534614563, |
|
"learning_rate": 2.6463978774104758e-05, |
|
"loss": 0.4577, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 7.649918962722852, |
|
"grad_norm": 2.5928237438201904, |
|
"learning_rate": 2.5776587699573006e-05, |
|
"loss": 0.5176, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 7.682333873581848, |
|
"grad_norm": 1.911382794380188, |
|
"learning_rate": 2.509691872579075e-05, |
|
"loss": 0.4663, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 7.714748784440843, |
|
"grad_norm": 2.9931087493896484, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 0.4284, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 7.747163695299838, |
|
"grad_norm": 2.6645960807800293, |
|
"learning_rate": 2.3761029116986178e-05, |
|
"loss": 0.472, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 7.779578606158833, |
|
"grad_norm": 2.0710065364837646, |
|
"learning_rate": 2.3104947466063787e-05, |
|
"loss": 0.4391, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.811993517017828, |
|
"grad_norm": 2.8465991020202637, |
|
"learning_rate": 2.2456865869630972e-05, |
|
"loss": 0.4844, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 7.844408427876823, |
|
"grad_norm": 2.3676860332489014, |
|
"learning_rate": 2.181685175319702e-05, |
|
"loss": 0.4028, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 7.876823338735819, |
|
"grad_norm": 2.0618326663970947, |
|
"learning_rate": 2.118497170294195e-05, |
|
"loss": 0.5056, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 7.9092382495948135, |
|
"grad_norm": 2.6422433853149414, |
|
"learning_rate": 2.0561291458788733e-05, |
|
"loss": 0.4602, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 7.941653160453809, |
|
"grad_norm": 2.1969330310821533, |
|
"learning_rate": 1.994587590756397e-05, |
|
"loss": 0.4256, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 7.974068071312804, |
|
"grad_norm": 2.574162721633911, |
|
"learning_rate": 1.9338789076247e-05, |
|
"loss": 0.4586, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 8.006482982171798, |
|
"grad_norm": 1.6469364166259766, |
|
"learning_rate": 1.874009412530877e-05, |
|
"loss": 0.419, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 8.038897893030795, |
|
"grad_norm": 2.7968332767486572, |
|
"learning_rate": 1.8149853342140645e-05, |
|
"loss": 0.4858, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 8.07131280388979, |
|
"grad_norm": 2.0327930450439453, |
|
"learning_rate": 1.7568128134574113e-05, |
|
"loss": 0.447, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 8.103727714748784, |
|
"grad_norm": 1.877231478691101, |
|
"learning_rate": 1.6994979024491942e-05, |
|
"loss": 0.4012, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 8.13614262560778, |
|
"grad_norm": 2.364046573638916, |
|
"learning_rate": 1.64304656415317e-05, |
|
"loss": 0.419, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 8.168557536466775, |
|
"grad_norm": 2.0480690002441406, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 0.4063, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 8.20097244732577, |
|
"grad_norm": 1.9457817077636719, |
|
"learning_rate": 1.5327580077171587e-05, |
|
"loss": 0.389, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 8.233387358184766, |
|
"grad_norm": 2.2139978408813477, |
|
"learning_rate": 1.4789322638454351e-05, |
|
"loss": 0.4014, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 8.26580226904376, |
|
"grad_norm": 2.7978177070617676, |
|
"learning_rate": 1.4259930400286669e-05, |
|
"loss": 0.4783, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.298217179902755, |
|
"grad_norm": 2.2762577533721924, |
|
"learning_rate": 1.373945843990192e-05, |
|
"loss": 0.4984, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.33063209076175, |
|
"grad_norm": 2.136845827102661, |
|
"learning_rate": 1.322796090648013e-05, |
|
"loss": 0.4487, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 8.363047001620746, |
|
"grad_norm": 2.696369171142578, |
|
"learning_rate": 1.272549101551438e-05, |
|
"loss": 0.4836, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.39546191247974, |
|
"grad_norm": 1.9966158866882324, |
|
"learning_rate": 1.2232101043274436e-05, |
|
"loss": 0.4004, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 8.427876823338735, |
|
"grad_norm": 2.289769411087036, |
|
"learning_rate": 1.1747842321367886e-05, |
|
"loss": 0.4693, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.460291734197732, |
|
"grad_norm": 1.8652838468551636, |
|
"learning_rate": 1.1272765231399685e-05, |
|
"loss": 0.4859, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.492706645056726, |
|
"grad_norm": 2.4249916076660156, |
|
"learning_rate": 1.0806919199730615e-05, |
|
"loss": 0.4189, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.525121555915721, |
|
"grad_norm": 2.178057909011841, |
|
"learning_rate": 1.035035269233493e-05, |
|
"loss": 0.4055, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 8.557536466774716, |
|
"grad_norm": 2.8611860275268555, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.4782, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.589951377633712, |
|
"grad_norm": 1.872373342514038, |
|
"learning_rate": 9.465247282174805e-06, |
|
"loss": 0.4262, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 8.622366288492707, |
|
"grad_norm": 2.744319200515747, |
|
"learning_rate": 9.036800464548157e-06, |
|
"loss": 0.4203, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 8.654781199351701, |
|
"grad_norm": 2.082002639770508, |
|
"learning_rate": 8.617817331890154e-06, |
|
"loss": 0.449, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 8.687196110210698, |
|
"grad_norm": 1.7450381517410278, |
|
"learning_rate": 8.208341474624071e-06, |
|
"loss": 0.4141, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 8.719611021069692, |
|
"grad_norm": 1.9353231191635132, |
|
"learning_rate": 7.808415494049514e-06, |
|
"loss": 0.4344, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 8.752025931928687, |
|
"grad_norm": 2.474977493286133, |
|
"learning_rate": 7.4180809979102036e-06, |
|
"loss": 0.4689, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.784440842787681, |
|
"grad_norm": 2.5193610191345215, |
|
"learning_rate": 7.0373785960650475e-06, |
|
"loss": 0.4791, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 8.816855753646678, |
|
"grad_norm": 2.5343973636627197, |
|
"learning_rate": 6.666347896263325e-06, |
|
"loss": 0.4609, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 8.849270664505672, |
|
"grad_norm": 2.6066677570343018, |
|
"learning_rate": 6.3050275000238414e-06, |
|
"loss": 0.4913, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 8.881685575364667, |
|
"grad_norm": 1.9997366666793823, |
|
"learning_rate": 5.953454998618857e-06, |
|
"loss": 0.4407, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 8.914100486223663, |
|
"grad_norm": 2.1540727615356445, |
|
"learning_rate": 5.611666969163243e-06, |
|
"loss": 0.4356, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 8.946515397082658, |
|
"grad_norm": 2.627286195755005, |
|
"learning_rate": 5.27969897080901e-06, |
|
"loss": 0.3782, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.978930307941653, |
|
"grad_norm": 2.116828680038452, |
|
"learning_rate": 4.957585541045684e-06, |
|
"loss": 0.4608, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 9.011345218800649, |
|
"grad_norm": 2.2772679328918457, |
|
"learning_rate": 4.6453601921072395e-06, |
|
"loss": 0.4667, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 9.043760129659644, |
|
"grad_norm": 1.7504632472991943, |
|
"learning_rate": 4.34305540748543e-06, |
|
"loss": 0.4035, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 9.076175040518638, |
|
"grad_norm": 1.660288691520691, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 0.4033, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 9.108589951377633, |
|
"grad_norm": 2.4989752769470215, |
|
"learning_rate": 3.768332301277866e-06, |
|
"loss": 0.4518, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 9.14100486223663, |
|
"grad_norm": 1.8193674087524414, |
|
"learning_rate": 3.495973773086014e-06, |
|
"loss": 0.3902, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 9.173419773095624, |
|
"grad_norm": 2.869234323501587, |
|
"learning_rate": 3.233655389777801e-06, |
|
"loss": 0.4842, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 9.205834683954619, |
|
"grad_norm": 2.684445381164551, |
|
"learning_rate": 2.9814044425935606e-06, |
|
"loss": 0.4263, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 9.238249594813615, |
|
"grad_norm": 2.5223307609558105, |
|
"learning_rate": 2.739247175371562e-06, |
|
"loss": 0.4493, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 9.27066450567261, |
|
"grad_norm": 2.3767752647399902, |
|
"learning_rate": 2.5072087818176382e-06, |
|
"loss": 0.4369, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 9.303079416531604, |
|
"grad_norm": 2.1580522060394287, |
|
"learning_rate": 2.2853134028840594e-06, |
|
"loss": 0.4093, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 9.335494327390599, |
|
"grad_norm": 1.7261040210723877, |
|
"learning_rate": 2.073584124257899e-06, |
|
"loss": 0.451, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.367909238249595, |
|
"grad_norm": 1.6289855241775513, |
|
"learning_rate": 1.8720429739592982e-06, |
|
"loss": 0.4305, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 9.40032414910859, |
|
"grad_norm": 2.0105481147766113, |
|
"learning_rate": 1.6807109200496995e-06, |
|
"loss": 0.4455, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.432739059967584, |
|
"grad_norm": 2.8889999389648438, |
|
"learning_rate": 1.4996078684503144e-06, |
|
"loss": 0.4451, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.46515397082658, |
|
"grad_norm": 2.474945306777954, |
|
"learning_rate": 1.3287526608711131e-06, |
|
"loss": 0.4945, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.497568881685575, |
|
"grad_norm": 1.9426692724227905, |
|
"learning_rate": 1.1681630728506699e-06, |
|
"loss": 0.4767, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 9.52998379254457, |
|
"grad_norm": 2.104893445968628, |
|
"learning_rate": 1.0178558119067315e-06, |
|
"loss": 0.4088, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 9.562398703403566, |
|
"grad_norm": 2.283007860183716, |
|
"learning_rate": 8.778465157979976e-07, |
|
"loss": 0.3957, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 9.594813614262561, |
|
"grad_norm": 2.111274480819702, |
|
"learning_rate": 7.481497508972312e-07, |
|
"loss": 0.4488, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 9.627228525121556, |
|
"grad_norm": 1.921383023262024, |
|
"learning_rate": 6.287790106757396e-07, |
|
"loss": 0.4685, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 9.65964343598055, |
|
"grad_norm": 2.4498233795166016, |
|
"learning_rate": 5.19746714299596e-07, |
|
"loss": 0.4298, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 9.692058346839547, |
|
"grad_norm": 2.4924449920654297, |
|
"learning_rate": 4.210642053375069e-07, |
|
"loss": 0.4393, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 9.724473257698541, |
|
"grad_norm": 2.780961513519287, |
|
"learning_rate": 3.3274175058067846e-07, |
|
"loss": 0.4444, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.756888168557536, |
|
"grad_norm": 2.037053108215332, |
|
"learning_rate": 2.547885389746485e-07, |
|
"loss": 0.4866, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 9.789303079416532, |
|
"grad_norm": 1.8418095111846924, |
|
"learning_rate": 1.8721268066330676e-07, |
|
"loss": 0.3989, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 9.821717990275527, |
|
"grad_norm": 2.2461559772491455, |
|
"learning_rate": 1.300212061451367e-07, |
|
"loss": 0.4047, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 9.854132901134522, |
|
"grad_norm": 1.7511504888534546, |
|
"learning_rate": 8.322006554171146e-08, |
|
"loss": 0.4171, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 9.886547811993516, |
|
"grad_norm": 2.4576804637908936, |
|
"learning_rate": 4.6814127978722644e-08, |
|
"loss": 0.4155, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 9.918962722852513, |
|
"grad_norm": 2.030487298965454, |
|
"learning_rate": 2.080718107935198e-08, |
|
"loss": 0.4397, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.951377633711507, |
|
"grad_norm": 2.6254327297210693, |
|
"learning_rate": 5.201930570242208e-09, |
|
"loss": 0.4338, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 9.983792544570502, |
|
"grad_norm": 2.197205066680908, |
|
"learning_rate": 0.0, |
|
"loss": 0.4586, |
|
"step": 3080 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3080, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.87794451980288e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|