{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6135,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008149959250203748,
"grad_norm": 0.18579324653121165,
"learning_rate": 0.0007999910290180627,
"loss": 0.6628,
"step": 50
},
{
"epoch": 0.016299918500407497,
"grad_norm": 0.17807681860788138,
"learning_rate": 0.0007997893323384012,
"loss": 0.5165,
"step": 100
},
{
"epoch": 0.02444987775061125,
"grad_norm": 0.19350751669754768,
"learning_rate": 0.0007993223754535443,
"loss": 0.4975,
"step": 150
},
{
"epoch": 0.032599837000814993,
"grad_norm": 0.1628741338405071,
"learning_rate": 0.0007985904681893655,
"loss": 0.4859,
"step": 200
},
{
"epoch": 0.040749796251018745,
"grad_norm": 0.1692654309709468,
"learning_rate": 0.0007975940961663036,
"loss": 0.4698,
"step": 250
},
{
"epoch": 0.0488997555012225,
"grad_norm": 0.16755101172401649,
"learning_rate": 0.0007963339204771541,
"loss": 0.4584,
"step": 300
},
{
"epoch": 0.05704971475142624,
"grad_norm": 0.16981406073926134,
"learning_rate": 0.0007948107772484337,
"loss": 0.4497,
"step": 350
},
{
"epoch": 0.06519967400162999,
"grad_norm": 0.13393142327752477,
"learning_rate": 0.0007930256770856106,
"loss": 0.435,
"step": 400
},
{
"epoch": 0.07334963325183375,
"grad_norm": 0.14862318790293866,
"learning_rate": 0.000790979804402568,
"loss": 0.4379,
"step": 450
},
{
"epoch": 0.08149959250203749,
"grad_norm": 0.14157872785652958,
"learning_rate": 0.0007886745166357449,
"loss": 0.4389,
"step": 500
},
{
"epoch": 0.08964955175224124,
"grad_norm": 0.14597403141832885,
"learning_rate": 0.0007861113433434774,
"loss": 0.4248,
"step": 550
},
{
"epoch": 0.097799511002445,
"grad_norm": 0.14439776167516288,
"learning_rate": 0.0007832919851911376,
"loss": 0.4316,
"step": 600
},
{
"epoch": 0.10594947025264874,
"grad_norm": 0.2043224587005451,
"learning_rate": 0.0007802183128227408,
"loss": 0.4168,
"step": 650
},
{
"epoch": 0.11409942950285248,
"grad_norm": 0.13509782410904447,
"learning_rate": 0.0007768923656197741,
"loss": 0.4198,
"step": 700
},
{
"epoch": 0.12224938875305623,
"grad_norm": 0.21692882701447652,
"learning_rate": 0.0007734641630410531,
"loss": 0.4286,
"step": 750
},
{
"epoch": 0.13039934800325997,
"grad_norm": 0.18392281414572664,
"learning_rate": 0.000769650312579427,
"loss": 0.4466,
"step": 800
},
{
"epoch": 0.13854930725346373,
"grad_norm": 0.17037687429265078,
"learning_rate": 0.000765591199150948,
"loss": 0.4376,
"step": 850
},
{
"epoch": 0.1466992665036675,
"grad_norm": 0.17552399857385595,
"learning_rate": 0.0007612895159772056,
"loss": 0.4264,
"step": 900
},
{
"epoch": 0.15484922575387122,
"grad_norm": 0.1438151353983127,
"learning_rate": 0.0007567481172248043,
"loss": 0.41,
"step": 950
},
{
"epoch": 0.16299918500407498,
"grad_norm": 0.1447417216259079,
"learning_rate": 0.0007519700161116256,
"loss": 0.4119,
"step": 1000
},
{
"epoch": 0.17114914425427874,
"grad_norm": 0.15199420470790867,
"learning_rate": 0.000746958382907557,
"loss": 0.412,
"step": 1050
},
{
"epoch": 0.17929910350448247,
"grad_norm": 0.14727418743789575,
"learning_rate": 0.0007417165428310189,
"loss": 0.3946,
"step": 1100
},
{
"epoch": 0.18744906275468623,
"grad_norm": 0.34678998038165904,
"learning_rate": 0.000736471024768781,
"loss": 0.393,
"step": 1150
},
{
"epoch": 0.19559902200489,
"grad_norm": 0.1482936724234658,
"learning_rate": 0.0007307882077545133,
"loss": 0.3992,
"step": 1200
},
{
"epoch": 0.20374898125509372,
"grad_norm": 0.14346039255661802,
"learning_rate": 0.00072488591277831,
"loss": 0.39,
"step": 1250
},
{
"epoch": 0.21189894050529748,
"grad_norm": 0.14534505858629046,
"learning_rate": 0.0007187680560126396,
"loss": 0.3878,
"step": 1300
},
{
"epoch": 0.2200488997555012,
"grad_norm": 0.1388631218472249,
"learning_rate": 0.0007124386966552088,
"loss": 0.3746,
"step": 1350
},
{
"epoch": 0.22819885900570497,
"grad_norm": 0.15086917327804153,
"learning_rate": 0.0007059020342356855,
"loss": 0.3859,
"step": 1400
},
{
"epoch": 0.23634881825590873,
"grad_norm": 0.15384912931521313,
"learning_rate": 0.0006991624058293096,
"loss": 0.3731,
"step": 1450
},
{
"epoch": 0.24449877750611246,
"grad_norm": 0.15914151929826,
"learning_rate": 0.000692224283179246,
"loss": 0.3754,
"step": 1500
},
{
"epoch": 0.2526487367563162,
"grad_norm": 0.11506887001881892,
"learning_rate": 0.0006850922697295807,
"loss": 0.3666,
"step": 1550
},
{
"epoch": 0.26079869600651995,
"grad_norm": 0.13052208168058513,
"learning_rate": 0.0006777710975709381,
"loss": 0.3766,
"step": 1600
},
{
"epoch": 0.26894865525672373,
"grad_norm": 0.11941691123999496,
"learning_rate": 0.0006702656243007372,
"loss": 0.3602,
"step": 1650
},
{
"epoch": 0.27709861450692747,
"grad_norm": 0.11925894836007636,
"learning_rate": 0.0006625808298001773,
"loss": 0.3612,
"step": 1700
},
{
"epoch": 0.2852485737571312,
"grad_norm": 0.14308144825812302,
"learning_rate": 0.0006547218129300866,
"loss": 0.3609,
"step": 1750
},
{
"epoch": 0.293398533007335,
"grad_norm": 0.14967189995049074,
"learning_rate": 0.0006466937881478278,
"loss": 0.3561,
"step": 1800
},
{
"epoch": 0.3015484922575387,
"grad_norm": 0.13228828753052413,
"learning_rate": 0.0006385020820475062,
"loss": 0.348,
"step": 1850
},
{
"epoch": 0.30969845150774244,
"grad_norm": 0.13322170574643283,
"learning_rate": 0.000630152129825775,
"loss": 0.3697,
"step": 1900
},
{
"epoch": 0.31784841075794623,
"grad_norm": 0.12720122919651028,
"learning_rate": 0.0006216494716755822,
"loss": 0.3448,
"step": 1950
},
{
"epoch": 0.32599837000814996,
"grad_norm": 0.13582416923221627,
"learning_rate": 0.0006129997491102531,
"loss": 0.3567,
"step": 2000
},
{
"epoch": 0.3341483292583537,
"grad_norm": 0.14925459708686192,
"learning_rate": 0.000604208701220346,
"loss": 0.3484,
"step": 2050
},
{
"epoch": 0.3422982885085575,
"grad_norm": 0.1243721342991605,
"learning_rate": 0.000595282160865766,
"loss": 0.3484,
"step": 2100
},
{
"epoch": 0.3504482477587612,
"grad_norm": 0.11953241203626794,
"learning_rate": 0.0005862260508056631,
"loss": 0.3458,
"step": 2150
},
{
"epoch": 0.35859820700896494,
"grad_norm": 0.11268782331878287,
"learning_rate": 0.0005770463797686815,
"loss": 0.339,
"step": 2200
},
{
"epoch": 0.36674816625916873,
"grad_norm": 0.11687380056427817,
"learning_rate": 0.0005677492384661679,
"loss": 0.3337,
"step": 2250
},
{
"epoch": 0.37489812550937246,
"grad_norm": 0.12673741702704472,
"learning_rate": 0.0005583407955509861,
"loss": 0.3346,
"step": 2300
},
{
"epoch": 0.3830480847595762,
"grad_norm": 0.11915610120268014,
"learning_rate": 0.0005488272935246143,
"loss": 0.333,
"step": 2350
},
{
"epoch": 0.39119804400978,
"grad_norm": 0.13402357429565842,
"learning_rate": 0.0005392150445952471,
"loss": 0.3305,
"step": 2400
},
{
"epoch": 0.3993480032599837,
"grad_norm": 0.1160573866470615,
"learning_rate": 0.0005295104264896449,
"loss": 0.34,
"step": 2450
},
{
"epoch": 0.40749796251018744,
"grad_norm": 0.11766125729550878,
"learning_rate": 0.0005197198782215126,
"loss": 0.3282,
"step": 2500
},
{
"epoch": 0.4156479217603912,
"grad_norm": 0.10590606446287697,
"learning_rate": 0.0005098498958192145,
"loss": 0.3299,
"step": 2550
},
{
"epoch": 0.42379788101059496,
"grad_norm": 0.11241439393785868,
"learning_rate": 0.0004999070280156597,
"loss": 0.3298,
"step": 2600
},
{
"epoch": 0.4319478402607987,
"grad_norm": 0.11125810401522715,
"learning_rate": 0.0004898978719032175,
"loss": 0.3215,
"step": 2650
},
{
"epoch": 0.4400977995110024,
"grad_norm": 0.11191318960071489,
"learning_rate": 0.0004798290685565476,
"loss": 0.3249,
"step": 2700
},
{
"epoch": 0.4482477587612062,
"grad_norm": 0.1287365840788498,
"learning_rate": 0.0004697072986262474,
"loss": 0.3161,
"step": 2750
},
{
"epoch": 0.45639771801140994,
"grad_norm": 0.11552326003349313,
"learning_rate": 0.00045953927790623976,
"loss": 0.3124,
"step": 2800
},
{
"epoch": 0.46454767726161367,
"grad_norm": 0.11748617144150195,
"learning_rate": 0.0004493317528778449,
"loss": 0.3218,
"step": 2850
},
{
"epoch": 0.47269763651181745,
"grad_norm": 0.10367103487291035,
"learning_rate": 0.00043909149623349,
"loss": 0.3038,
"step": 2900
},
{
"epoch": 0.4808475957620212,
"grad_norm": 0.13997935598004313,
"learning_rate": 0.00042882530238302793,
"loss": 0.3079,
"step": 2950
},
{
"epoch": 0.4889975550122249,
"grad_norm": 0.11405505781771122,
"learning_rate": 0.000418539982945647,
"loss": 0.3161,
"step": 3000
},
{
"epoch": 0.4971475142624287,
"grad_norm": 0.10244350957803788,
"learning_rate": 0.000408242362230361,
"loss": 0.3121,
"step": 3050
},
{
"epoch": 0.5052974735126324,
"grad_norm": 0.11487103054153286,
"learning_rate": 0.0003979392727080819,
"loss": 0.3048,
"step": 3100
},
{
"epoch": 0.5134474327628362,
"grad_norm": 0.1105131381250625,
"learning_rate": 0.0003876375504782742,
"loss": 0.2951,
"step": 3150
},
{
"epoch": 0.5215973920130399,
"grad_norm": 0.1114339713919424,
"learning_rate": 0.00037734403073320455,
"loss": 0.2978,
"step": 3200
},
{
"epoch": 0.5297473512632437,
"grad_norm": 0.12439423946252776,
"learning_rate": 0.0003670655432227906,
"loss": 0.2977,
"step": 3250
},
{
"epoch": 0.5378973105134475,
"grad_norm": 0.10946864392870362,
"learning_rate": 0.0003568089077230634,
"loss": 0.2966,
"step": 3300
},
{
"epoch": 0.5460472697636511,
"grad_norm": 0.09312964606321276,
"learning_rate": 0.00034658092951124573,
"loss": 0.2877,
"step": 3350
},
{
"epoch": 0.5541972290138549,
"grad_norm": 0.1251440088268339,
"learning_rate": 0.00033638839485045124,
"loss": 0.2953,
"step": 3400
},
{
"epoch": 0.5623471882640587,
"grad_norm": 0.11260490319865255,
"learning_rate": 0.00032623806648699865,
"loss": 0.2836,
"step": 3450
},
{
"epoch": 0.5704971475142624,
"grad_norm": 0.10260196893419725,
"learning_rate": 0.00031613667916333013,
"loss": 0.2883,
"step": 3500
},
{
"epoch": 0.5786471067644662,
"grad_norm": 0.11067247135798225,
"learning_rate": 0.0003060909351495104,
"loss": 0.2919,
"step": 3550
},
{
"epoch": 0.58679706601467,
"grad_norm": 0.13049869044328746,
"learning_rate": 0.00029610749979627,
"loss": 0.2801,
"step": 3600
},
{
"epoch": 0.5949470252648736,
"grad_norm": 0.0994668923515135,
"learning_rate": 0.0002861929971125462,
"loss": 0.2764,
"step": 3650
},
{
"epoch": 0.6030969845150774,
"grad_norm": 0.12787979565602525,
"learning_rate": 0.0002763540053704528,
"loss": 0.2828,
"step": 3700
},
{
"epoch": 0.6112469437652812,
"grad_norm": 0.11702233580318924,
"learning_rate": 0.0002665970527405966,
"loss": 0.275,
"step": 3750
},
{
"epoch": 0.6193969030154849,
"grad_norm": 0.12216378448453033,
"learning_rate": 0.0002569286129606376,
"loss": 0.2781,
"step": 3800
},
{
"epoch": 0.6275468622656887,
"grad_norm": 0.11991224657683039,
"learning_rate": 0.00024735510103996296,
"loss": 0.2779,
"step": 3850
},
{
"epoch": 0.6356968215158925,
"grad_norm": 0.11355699073704052,
"learning_rate": 0.00023788286900332977,
"loss": 0.278,
"step": 3900
},
{
"epoch": 0.6438467807660961,
"grad_norm": 0.10548043571718951,
"learning_rate": 0.00022851820167629582,
"loss": 0.2737,
"step": 3950
},
{
"epoch": 0.6519967400162999,
"grad_norm": 0.10359945452094396,
"learning_rate": 0.0002192673125152389,
"loss": 0.2724,
"step": 4000
},
{
"epoch": 0.6601466992665037,
"grad_norm": 0.11760044543664713,
"learning_rate": 0.0002101363394847284,
"loss": 0.2629,
"step": 4050
},
{
"epoch": 0.6682966585167074,
"grad_norm": 0.1061394336514801,
"learning_rate": 0.00020113134098498586,
"loss": 0.2686,
"step": 4100
},
{
"epoch": 0.6764466177669112,
"grad_norm": 0.12584575464846712,
"learning_rate": 0.00019225829183213756,
"loss": 0.2699,
"step": 4150
},
{
"epoch": 0.684596577017115,
"grad_norm": 0.11274050205692296,
"learning_rate": 0.00018352307929392337,
"loss": 0.26,
"step": 4200
},
{
"epoch": 0.6927465362673186,
"grad_norm": 0.10624178207114862,
"learning_rate": 0.0001749314991834945,
"loss": 0.2676,
"step": 4250
},
{
"epoch": 0.7008964955175224,
"grad_norm": 0.10585200267430328,
"learning_rate": 0.00016648925201389348,
"loss": 0.2699,
"step": 4300
},
{
"epoch": 0.7090464547677262,
"grad_norm": 0.1036684915414757,
"learning_rate": 0.00015820193921576214,
"loss": 0.266,
"step": 4350
},
{
"epoch": 0.7171964140179299,
"grad_norm": 0.11470877255663611,
"learning_rate": 0.00015007505942079362,
"loss": 0.2574,
"step": 4400
},
{
"epoch": 0.7253463732681337,
"grad_norm": 0.11442215173910815,
"learning_rate": 0.00014211400481339013,
"loss": 0.2536,
"step": 4450
},
{
"epoch": 0.7334963325183375,
"grad_norm": 0.10579448760201741,
"learning_rate": 0.00013432405755294893,
"loss": 0.2518,
"step": 4500
},
{
"epoch": 0.7416462917685411,
"grad_norm": 0.11019413291456029,
"learning_rate": 0.0001267103862691497,
"loss": 0.2528,
"step": 4550
},
{
"epoch": 0.7497962510187449,
"grad_norm": 0.1237941013535732,
"learning_rate": 0.00011927804263256903,
"loss": 0.2506,
"step": 4600
},
{
"epoch": 0.7579462102689487,
"grad_norm": 0.11747893253572085,
"learning_rate": 0.0001120319580028975,
"loss": 0.2432,
"step": 4650
},
{
"epoch": 0.7660961695191524,
"grad_norm": 0.13243835140024088,
"learning_rate": 0.00010497694015698214,
"loss": 0.2502,
"step": 4700
},
{
"epoch": 0.7742461287693562,
"grad_norm": 0.12096359907685643,
"learning_rate": 9.811767009886681e-05,
"loss": 0.2515,
"step": 4750
},
{
"epoch": 0.78239608801956,
"grad_norm": 0.1010867422514499,
"learning_rate": 9.145869895394685e-05,
"loss": 0.2471,
"step": 4800
},
{
"epoch": 0.7905460472697636,
"grad_norm": 0.12108612955883465,
"learning_rate": 8.500444494929692e-05,
"loss": 0.2508,
"step": 4850
},
{
"epoch": 0.7986960065199674,
"grad_norm": 0.11827129819048823,
"learning_rate": 7.875919048217753e-05,
"loss": 0.2421,
"step": 4900
},
{
"epoch": 0.8068459657701712,
"grad_norm": 0.10889141696060357,
"learning_rate": 7.272707927866531e-05,
"loss": 0.2444,
"step": 4950
},
{
"epoch": 0.8149959250203749,
"grad_norm": 0.11478478031330713,
"learning_rate": 6.691211364428989e-05,
"loss": 0.239,
"step": 5000
},
{
"epoch": 0.8231458842705787,
"grad_norm": 0.12806490540385113,
"learning_rate": 6.131815180850508e-05,
"loss": 0.2429,
"step": 5050
},
{
"epoch": 0.8312958435207825,
"grad_norm": 0.10357727349126965,
"learning_rate": 5.5948905364753945e-05,
"loss": 0.2467,
"step": 5100
},
{
"epoch": 0.8394458027709861,
"grad_norm": 0.1101292515864723,
"learning_rate": 5.080793680782607e-05,
"loss": 0.2405,
"step": 5150
},
{
"epoch": 0.8475957620211899,
"grad_norm": 0.10144980764898691,
"learning_rate": 4.5898657170142746e-05,
"loss": 0.235,
"step": 5200
},
{
"epoch": 0.8557457212713936,
"grad_norm": 0.11277819168557472,
"learning_rate": 4.1224323758537155e-05,
"loss": 0.2341,
"step": 5250
},
{
"epoch": 0.8638956805215974,
"grad_norm": 0.10800888686256803,
"learning_rate": 3.678803799303134e-05,
"loss": 0.24,
"step": 5300
},
{
"epoch": 0.8720456397718012,
"grad_norm": 0.11901575566402685,
"learning_rate": 3.2592743349044186e-05,
"loss": 0.2341,
"step": 5350
},
{
"epoch": 0.8801955990220048,
"grad_norm": 0.12444346139158177,
"learning_rate": 2.8641223404395524e-05,
"loss": 0.2291,
"step": 5400
},
{
"epoch": 0.8883455582722086,
"grad_norm": 0.10818161109308624,
"learning_rate": 2.4936099992402606e-05,
"loss": 0.2357,
"step": 5450
},
{
"epoch": 0.8964955175224124,
"grad_norm": 0.12545021889499927,
"learning_rate": 2.1479831462293265e-05,
"loss": 0.2349,
"step": 5500
},
{
"epoch": 0.9046454767726161,
"grad_norm": 0.14021615830916254,
"learning_rate": 1.8274711048092084e-05,
"loss": 0.2306,
"step": 5550
},
{
"epoch": 0.9127954360228199,
"grad_norm": 0.10349748867815296,
"learning_rate": 1.5322865347059044e-05,
"loss": 0.2243,
"step": 5600
},
{
"epoch": 0.9209453952730237,
"grad_norm": 0.11963300064608776,
"learning_rate": 1.2626252908692638e-05,
"loss": 0.2333,
"step": 5650
},
{
"epoch": 0.9290953545232273,
"grad_norm": 0.10953396833548994,
"learning_rate": 1.0186662935232384e-05,
"loss": 0.2339,
"step": 5700
},
{
"epoch": 0.9372453137734311,
"grad_norm": 0.1224263724970897,
"learning_rate": 8.00571409452302e-06,
"loss": 0.2313,
"step": 5750
},
{
"epoch": 0.9453952730236349,
"grad_norm": 0.12352858513849092,
"learning_rate": 6.084853446028671e-06,
"loss": 0.2296,
"step": 5800
},
{
"epoch": 0.9535452322738386,
"grad_norm": 0.12633679586576274,
"learning_rate": 4.425355480708859e-06,
"loss": 0.2354,
"step": 5850
},
{
"epoch": 0.9616951915240424,
"grad_norm": 0.10627066652464928,
"learning_rate": 3.028321275393786e-06,
"loss": 0.2332,
"step": 5900
},
{
"epoch": 0.9698451507742462,
"grad_norm": 0.12495472546410136,
"learning_rate": 1.8946777622199652e-06,
"loss": 0.2297,
"step": 5950
},
{
"epoch": 0.9779951100244498,
"grad_norm": 0.11479937267870474,
"learning_rate": 1.0251771136106314e-06,
"loss": 0.2372,
"step": 6000
},
{
"epoch": 0.9861450692746536,
"grad_norm": 0.1274989460799885,
"learning_rate": 4.203962432096642e-07,
"loss": 0.2316,
"step": 6050
},
{
"epoch": 0.9942950285248574,
"grad_norm": 0.10997701182931913,
"learning_rate": 8.073642309907036e-08,
"loss": 0.2289,
"step": 6100
},
{
"epoch": 1.0,
"step": 6135,
"total_flos": 307270109495296.0,
"train_loss": 0.320441021565606,
"train_runtime": 141453.5974,
"train_samples_per_second": 6.245,
"train_steps_per_second": 0.043
}
],
"logging_steps": 50,
"max_steps": 6135,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 307270109495296.0,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}