{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008149959250203748, "grad_norm": 0.18579324653121165, "learning_rate": 0.0007999910290180627, "loss": 0.6628, "step": 50 }, { "epoch": 0.016299918500407497, "grad_norm": 0.17807681860788138, "learning_rate": 0.0007997893323384012, "loss": 0.5165, "step": 100 }, { "epoch": 0.02444987775061125, "grad_norm": 0.19350751669754768, "learning_rate": 0.0007993223754535443, "loss": 0.4975, "step": 150 }, { "epoch": 0.032599837000814993, "grad_norm": 0.1628741338405071, "learning_rate": 0.0007985904681893655, "loss": 0.4859, "step": 200 }, { "epoch": 0.040749796251018745, "grad_norm": 0.1692654309709468, "learning_rate": 0.0007975940961663036, "loss": 0.4698, "step": 250 }, { "epoch": 0.0488997555012225, "grad_norm": 0.16755101172401649, "learning_rate": 0.0007963339204771541, "loss": 0.4584, "step": 300 }, { "epoch": 0.05704971475142624, "grad_norm": 0.16981406073926134, "learning_rate": 0.0007948107772484337, "loss": 0.4497, "step": 350 }, { "epoch": 0.06519967400162999, "grad_norm": 0.13393142327752477, "learning_rate": 0.0007930256770856106, "loss": 0.435, "step": 400 }, { "epoch": 0.07334963325183375, "grad_norm": 0.14862318790293866, "learning_rate": 0.000790979804402568, "loss": 0.4379, "step": 450 }, { "epoch": 0.08149959250203749, "grad_norm": 0.14157872785652958, "learning_rate": 0.0007886745166357449, "loss": 0.4389, "step": 500 }, { "epoch": 0.08964955175224124, "grad_norm": 0.14597403141832885, "learning_rate": 0.0007861113433434774, "loss": 0.4248, "step": 550 }, { "epoch": 0.097799511002445, "grad_norm": 0.14439776167516288, "learning_rate": 0.0007832919851911376, "loss": 0.4316, "step": 600 }, { "epoch": 0.10594947025264874, "grad_norm": 0.2043224587005451, "learning_rate": 0.0007802183128227408, "loss": 0.4168, "step": 650 }, { "epoch": 0.11409942950285248, "grad_norm": 0.13509782410904447, "learning_rate": 0.0007768923656197741, "loss": 0.4198, "step": 700 }, { "epoch": 0.12224938875305623, "grad_norm": 0.21692882701447652, "learning_rate": 0.0007734641630410531, "loss": 0.4286, "step": 750 }, { "epoch": 0.13039934800325997, "grad_norm": 0.18392281414572664, "learning_rate": 0.000769650312579427, "loss": 0.4466, "step": 800 }, { "epoch": 0.13854930725346373, "grad_norm": 0.17037687429265078, "learning_rate": 0.000765591199150948, "loss": 0.4376, "step": 850 }, { "epoch": 0.1466992665036675, "grad_norm": 0.17552399857385595, "learning_rate": 0.0007612895159772056, "loss": 0.4264, "step": 900 }, { "epoch": 0.15484922575387122, "grad_norm": 0.1438151353983127, "learning_rate": 0.0007567481172248043, "loss": 0.41, "step": 950 }, { "epoch": 0.16299918500407498, "grad_norm": 0.1447417216259079, "learning_rate": 0.0007519700161116256, "loss": 0.4119, "step": 1000 }, { "epoch": 0.17114914425427874, "grad_norm": 0.15199420470790867, "learning_rate": 0.000746958382907557, "loss": 0.412, "step": 1050 }, { "epoch": 0.17929910350448247, "grad_norm": 0.14727418743789575, "learning_rate": 0.0007417165428310189, "loss": 0.3946, "step": 1100 }, { "epoch": 0.18744906275468623, "grad_norm": 0.34678998038165904, "learning_rate": 0.000736471024768781, "loss": 0.393, "step": 1150 }, { "epoch": 0.19559902200489, "grad_norm": 0.1482936724234658, "learning_rate": 0.0007307882077545133, "loss": 0.3992, "step": 1200 }, { "epoch": 0.20374898125509372, "grad_norm": 0.14346039255661802, "learning_rate": 0.00072488591277831, "loss": 0.39, "step": 1250 }, { "epoch": 0.21189894050529748, "grad_norm": 0.14534505858629046, "learning_rate": 0.0007187680560126396, "loss": 0.3878, "step": 1300 }, { "epoch": 0.2200488997555012, "grad_norm": 0.1388631218472249, "learning_rate": 0.0007124386966552088, "loss": 0.3746, "step": 1350 }, { "epoch": 0.22819885900570497, "grad_norm": 0.15086917327804153, "learning_rate": 0.0007059020342356855, "loss": 0.3859, "step": 1400 }, { "epoch": 0.23634881825590873, "grad_norm": 0.15384912931521313, "learning_rate": 0.0006991624058293096, "loss": 0.3731, "step": 1450 }, { "epoch": 0.24449877750611246, "grad_norm": 0.15914151929826, "learning_rate": 0.000692224283179246, "loss": 0.3754, "step": 1500 }, { "epoch": 0.2526487367563162, "grad_norm": 0.11506887001881892, "learning_rate": 0.0006850922697295807, "loss": 0.3666, "step": 1550 }, { "epoch": 0.26079869600651995, "grad_norm": 0.13052208168058513, "learning_rate": 0.0006777710975709381, "loss": 0.3766, "step": 1600 }, { "epoch": 0.26894865525672373, "grad_norm": 0.11941691123999496, "learning_rate": 0.0006702656243007372, "loss": 0.3602, "step": 1650 }, { "epoch": 0.27709861450692747, "grad_norm": 0.11925894836007636, "learning_rate": 0.0006625808298001773, "loss": 0.3612, "step": 1700 }, { "epoch": 0.2852485737571312, "grad_norm": 0.14308144825812302, "learning_rate": 0.0006547218129300866, "loss": 0.3609, "step": 1750 }, { "epoch": 0.293398533007335, "grad_norm": 0.14967189995049074, "learning_rate": 0.0006466937881478278, "loss": 0.3561, "step": 1800 }, { "epoch": 0.3015484922575387, "grad_norm": 0.13228828753052413, "learning_rate": 0.0006385020820475062, "loss": 0.348, "step": 1850 }, { "epoch": 0.30969845150774244, "grad_norm": 0.13322170574643283, "learning_rate": 0.000630152129825775, "loss": 0.3697, "step": 1900 }, { "epoch": 0.31784841075794623, "grad_norm": 0.12720122919651028, "learning_rate": 0.0006216494716755822, "loss": 0.3448, "step": 1950 }, { "epoch": 0.32599837000814996, "grad_norm": 0.13582416923221627, "learning_rate": 0.0006129997491102531, "loss": 0.3567, "step": 2000 }, { "epoch": 0.3341483292583537, "grad_norm": 0.14925459708686192, "learning_rate": 0.000604208701220346, "loss": 0.3484, "step": 2050 }, { "epoch": 0.3422982885085575, "grad_norm": 0.1243721342991605, "learning_rate": 0.000595282160865766, "loss": 0.3484, "step": 2100 }, { "epoch": 0.3504482477587612, "grad_norm": 0.11953241203626794, "learning_rate": 0.0005862260508056631, "loss": 0.3458, "step": 2150 }, { "epoch": 0.35859820700896494, "grad_norm": 0.11268782331878287, "learning_rate": 0.0005770463797686815, "loss": 0.339, "step": 2200 }, { "epoch": 0.36674816625916873, "grad_norm": 0.11687380056427817, "learning_rate": 0.0005677492384661679, "loss": 0.3337, "step": 2250 }, { "epoch": 0.37489812550937246, "grad_norm": 0.12673741702704472, "learning_rate": 0.0005583407955509861, "loss": 0.3346, "step": 2300 }, { "epoch": 0.3830480847595762, "grad_norm": 0.11915610120268014, "learning_rate": 0.0005488272935246143, "loss": 0.333, "step": 2350 }, { "epoch": 0.39119804400978, "grad_norm": 0.13402357429565842, "learning_rate": 0.0005392150445952471, "loss": 0.3305, "step": 2400 }, { "epoch": 0.3993480032599837, "grad_norm": 0.1160573866470615, "learning_rate": 0.0005295104264896449, "loss": 0.34, "step": 2450 }, { "epoch": 0.40749796251018744, "grad_norm": 0.11766125729550878, "learning_rate": 0.0005197198782215126, "loss": 0.3282, "step": 2500 }, { "epoch": 0.4156479217603912, "grad_norm": 0.10590606446287697, "learning_rate": 0.0005098498958192145, "loss": 0.3299, "step": 2550 }, { "epoch": 0.42379788101059496, "grad_norm": 0.11241439393785868, "learning_rate": 0.0004999070280156597, "loss": 0.3298, "step": 2600 }, { "epoch": 0.4319478402607987, "grad_norm": 0.11125810401522715, "learning_rate": 0.0004898978719032175, "loss": 0.3215, "step": 2650 }, { "epoch": 0.4400977995110024, "grad_norm": 0.11191318960071489, "learning_rate": 0.0004798290685565476, "loss": 0.3249, "step": 2700 }, { "epoch": 0.4482477587612062, "grad_norm": 0.1287365840788498, "learning_rate": 0.0004697072986262474, "loss": 0.3161, "step": 2750 }, { "epoch": 0.45639771801140994, "grad_norm": 0.11552326003349313, "learning_rate": 0.00045953927790623976, "loss": 0.3124, "step": 2800 }, { "epoch": 0.46454767726161367, "grad_norm": 0.11748617144150195, "learning_rate": 0.0004493317528778449, "loss": 0.3218, "step": 2850 }, { "epoch": 0.47269763651181745, "grad_norm": 0.10367103487291035, "learning_rate": 0.00043909149623349, "loss": 0.3038, "step": 2900 }, { "epoch": 0.4808475957620212, "grad_norm": 0.13997935598004313, "learning_rate": 0.00042882530238302793, "loss": 0.3079, "step": 2950 }, { "epoch": 0.4889975550122249, "grad_norm": 0.11405505781771122, "learning_rate": 0.000418539982945647, "loss": 0.3161, "step": 3000 }, { "epoch": 0.4971475142624287, "grad_norm": 0.10244350957803788, "learning_rate": 0.000408242362230361, "loss": 0.3121, "step": 3050 }, { "epoch": 0.5052974735126324, "grad_norm": 0.11487103054153286, "learning_rate": 0.0003979392727080819, "loss": 0.3048, "step": 3100 }, { "epoch": 0.5134474327628362, "grad_norm": 0.1105131381250625, "learning_rate": 0.0003876375504782742, "loss": 0.2951, "step": 3150 }, { "epoch": 0.5215973920130399, "grad_norm": 0.1114339713919424, "learning_rate": 0.00037734403073320455, "loss": 0.2978, "step": 3200 }, { "epoch": 0.5297473512632437, "grad_norm": 0.12439423946252776, "learning_rate": 0.0003670655432227906, "loss": 0.2977, "step": 3250 }, { "epoch": 0.5378973105134475, "grad_norm": 0.10946864392870362, "learning_rate": 0.0003568089077230634, "loss": 0.2966, "step": 3300 }, { "epoch": 0.5460472697636511, "grad_norm": 0.09312964606321276, "learning_rate": 0.00034658092951124573, "loss": 0.2877, "step": 3350 }, { "epoch": 0.5541972290138549, "grad_norm": 0.1251440088268339, "learning_rate": 0.00033638839485045124, "loss": 0.2953, "step": 3400 }, { "epoch": 0.5623471882640587, "grad_norm": 0.11260490319865255, "learning_rate": 0.00032623806648699865, "loss": 0.2836, "step": 3450 }, { "epoch": 0.5704971475142624, "grad_norm": 0.10260196893419725, "learning_rate": 0.00031613667916333013, "loss": 0.2883, "step": 3500 }, { "epoch": 0.5786471067644662, "grad_norm": 0.11067247135798225, "learning_rate": 0.0003060909351495104, "loss": 0.2919, "step": 3550 }, { "epoch": 0.58679706601467, "grad_norm": 0.13049869044328746, "learning_rate": 0.00029610749979627, "loss": 0.2801, "step": 3600 }, { "epoch": 0.5949470252648736, "grad_norm": 0.0994668923515135, "learning_rate": 0.0002861929971125462, "loss": 0.2764, "step": 3650 }, { "epoch": 0.6030969845150774, "grad_norm": 0.12787979565602525, "learning_rate": 0.0002763540053704528, "loss": 0.2828, "step": 3700 }, { "epoch": 0.6112469437652812, "grad_norm": 0.11702233580318924, "learning_rate": 0.0002665970527405966, "loss": 0.275, "step": 3750 }, { "epoch": 0.6193969030154849, "grad_norm": 0.12216378448453033, "learning_rate": 0.0002569286129606376, "loss": 0.2781, "step": 3800 }, { "epoch": 0.6275468622656887, "grad_norm": 0.11991224657683039, "learning_rate": 0.00024735510103996296, "loss": 0.2779, "step": 3850 }, { "epoch": 0.6356968215158925, "grad_norm": 0.11355699073704052, "learning_rate": 0.00023788286900332977, "loss": 0.278, "step": 3900 }, { "epoch": 0.6438467807660961, "grad_norm": 0.10548043571718951, "learning_rate": 0.00022851820167629582, "loss": 0.2737, "step": 3950 }, { "epoch": 0.6519967400162999, "grad_norm": 0.10359945452094396, "learning_rate": 0.0002192673125152389, "loss": 0.2724, "step": 4000 }, { "epoch": 0.6601466992665037, "grad_norm": 0.11760044543664713, "learning_rate": 0.0002101363394847284, "loss": 0.2629, "step": 4050 }, { "epoch": 0.6682966585167074, "grad_norm": 0.1061394336514801, "learning_rate": 0.00020113134098498586, "loss": 0.2686, "step": 4100 }, { "epoch": 0.6764466177669112, "grad_norm": 0.12584575464846712, "learning_rate": 0.00019225829183213756, "loss": 0.2699, "step": 4150 }, { "epoch": 0.684596577017115, "grad_norm": 0.11274050205692296, "learning_rate": 0.00018352307929392337, "loss": 0.26, "step": 4200 }, { "epoch": 0.6927465362673186, "grad_norm": 0.10624178207114862, "learning_rate": 0.0001749314991834945, "loss": 0.2676, "step": 4250 }, { "epoch": 0.7008964955175224, "grad_norm": 0.10585200267430328, "learning_rate": 0.00016648925201389348, "loss": 0.2699, "step": 4300 }, { "epoch": 0.7090464547677262, "grad_norm": 0.1036684915414757, "learning_rate": 0.00015820193921576214, "loss": 0.266, "step": 4350 }, { "epoch": 0.7171964140179299, "grad_norm": 0.11470877255663611, "learning_rate": 0.00015007505942079362, "loss": 0.2574, "step": 4400 }, { "epoch": 0.7253463732681337, "grad_norm": 0.11442215173910815, "learning_rate": 0.00014211400481339013, "loss": 0.2536, "step": 4450 }, { "epoch": 0.7334963325183375, "grad_norm": 0.10579448760201741, "learning_rate": 0.00013432405755294893, "loss": 0.2518, "step": 4500 }, { "epoch": 0.7416462917685411, "grad_norm": 0.11019413291456029, "learning_rate": 0.0001267103862691497, "loss": 0.2528, "step": 4550 }, { "epoch": 0.7497962510187449, "grad_norm": 0.1237941013535732, "learning_rate": 0.00011927804263256903, "loss": 0.2506, "step": 4600 }, { "epoch": 0.7579462102689487, "grad_norm": 0.11747893253572085, "learning_rate": 0.0001120319580028975, "loss": 0.2432, "step": 4650 }, { "epoch": 0.7660961695191524, "grad_norm": 0.13243835140024088, "learning_rate": 0.00010497694015698214, "loss": 0.2502, "step": 4700 }, { "epoch": 0.7742461287693562, "grad_norm": 0.12096359907685643, "learning_rate": 9.811767009886681e-05, "loss": 0.2515, "step": 4750 }, { "epoch": 0.78239608801956, "grad_norm": 0.1010867422514499, "learning_rate": 9.145869895394685e-05, "loss": 0.2471, "step": 4800 }, { "epoch": 0.7905460472697636, "grad_norm": 0.12108612955883465, "learning_rate": 8.500444494929692e-05, "loss": 0.2508, "step": 4850 }, { "epoch": 0.7986960065199674, "grad_norm": 0.11827129819048823, "learning_rate": 7.875919048217753e-05, "loss": 0.2421, "step": 4900 }, { "epoch": 0.8068459657701712, "grad_norm": 0.10889141696060357, "learning_rate": 7.272707927866531e-05, "loss": 0.2444, "step": 4950 }, { "epoch": 0.8149959250203749, "grad_norm": 0.11478478031330713, "learning_rate": 6.691211364428989e-05, "loss": 0.239, "step": 5000 }, { "epoch": 0.8231458842705787, "grad_norm": 0.12806490540385113, "learning_rate": 6.131815180850508e-05, "loss": 0.2429, "step": 5050 }, { "epoch": 0.8312958435207825, "grad_norm": 0.10357727349126965, "learning_rate": 5.5948905364753945e-05, "loss": 0.2467, "step": 5100 }, { "epoch": 0.8394458027709861, "grad_norm": 0.1101292515864723, "learning_rate": 5.080793680782607e-05, "loss": 0.2405, "step": 5150 }, { "epoch": 0.8475957620211899, "grad_norm": 0.10144980764898691, "learning_rate": 4.5898657170142746e-05, "loss": 0.235, "step": 5200 }, { "epoch": 0.8557457212713936, "grad_norm": 0.11277819168557472, "learning_rate": 4.1224323758537155e-05, "loss": 0.2341, "step": 5250 }, { "epoch": 0.8638956805215974, "grad_norm": 0.10800888686256803, "learning_rate": 3.678803799303134e-05, "loss": 0.24, "step": 5300 }, { "epoch": 0.8720456397718012, "grad_norm": 0.11901575566402685, "learning_rate": 3.2592743349044186e-05, "loss": 0.2341, "step": 5350 }, { "epoch": 0.8801955990220048, "grad_norm": 0.12444346139158177, "learning_rate": 2.8641223404395524e-05, "loss": 0.2291, "step": 5400 }, { "epoch": 0.8883455582722086, "grad_norm": 0.10818161109308624, "learning_rate": 2.4936099992402606e-05, "loss": 0.2357, "step": 5450 }, { "epoch": 0.8964955175224124, "grad_norm": 0.12545021889499927, "learning_rate": 2.1479831462293265e-05, "loss": 0.2349, "step": 5500 }, { "epoch": 0.9046454767726161, "grad_norm": 0.14021615830916254, "learning_rate": 1.8274711048092084e-05, "loss": 0.2306, "step": 5550 }, { "epoch": 0.9127954360228199, "grad_norm": 0.10349748867815296, "learning_rate": 1.5322865347059044e-05, "loss": 0.2243, "step": 5600 }, { "epoch": 0.9209453952730237, "grad_norm": 0.11963300064608776, "learning_rate": 1.2626252908692638e-05, "loss": 0.2333, "step": 5650 }, { "epoch": 0.9290953545232273, "grad_norm": 0.10953396833548994, "learning_rate": 1.0186662935232384e-05, "loss": 0.2339, "step": 5700 }, { "epoch": 0.9372453137734311, "grad_norm": 0.1224263724970897, "learning_rate": 8.00571409452302e-06, "loss": 0.2313, "step": 5750 }, { "epoch": 0.9453952730236349, "grad_norm": 0.12352858513849092, "learning_rate": 6.084853446028671e-06, "loss": 0.2296, "step": 5800 }, { "epoch": 0.9535452322738386, "grad_norm": 0.12633679586576274, "learning_rate": 4.425355480708859e-06, "loss": 0.2354, "step": 5850 }, { "epoch": 0.9616951915240424, "grad_norm": 0.10627066652464928, "learning_rate": 3.028321275393786e-06, "loss": 0.2332, "step": 5900 }, { "epoch": 0.9698451507742462, "grad_norm": 0.12495472546410136, "learning_rate": 1.8946777622199652e-06, "loss": 0.2297, "step": 5950 }, { "epoch": 0.9779951100244498, "grad_norm": 0.11479937267870474, "learning_rate": 1.0251771136106314e-06, "loss": 0.2372, "step": 6000 }, { "epoch": 0.9861450692746536, "grad_norm": 0.1274989460799885, "learning_rate": 4.203962432096642e-07, "loss": 0.2316, "step": 6050 }, { "epoch": 0.9942950285248574, "grad_norm": 0.10997701182931913, "learning_rate": 8.073642309907036e-08, "loss": 0.2289, "step": 6100 }, { "epoch": 1.0, "step": 6135, "total_flos": 307270109495296.0, "train_loss": 0.320441021565606, "train_runtime": 141453.5974, "train_samples_per_second": 6.245, "train_steps_per_second": 0.043 } ], "logging_steps": 50, "max_steps": 6135, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 307270109495296.0, "train_batch_size": 18, "trial_name": null, "trial_params": null }