{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6788866259334692, "eval_steps": 125, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013577732518669382, "eval_loss": 1.933146357536316, "eval_runtime": 7.8521, "eval_samples_per_second": 158.046, "eval_steps_per_second": 19.867, "step": 1 }, { "epoch": 0.006788866259334691, "grad_norm": 0.1681375950574875, "learning_rate": 5e-05, "loss": 1.7982, "step": 5 }, { "epoch": 0.013577732518669382, "grad_norm": 0.18828649818897247, "learning_rate": 0.0001, "loss": 1.7923, "step": 10 }, { "epoch": 0.020366598778004074, "grad_norm": 0.2104337513446808, "learning_rate": 9.99743108100344e-05, "loss": 1.8487, "step": 15 }, { "epoch": 0.027155465037338764, "grad_norm": 0.2100468873977661, "learning_rate": 9.989726963751682e-05, "loss": 1.7487, "step": 20 }, { "epoch": 0.03394433129667346, "grad_norm": 0.21163082122802734, "learning_rate": 9.976895564745991e-05, "loss": 1.8533, "step": 25 }, { "epoch": 0.04073319755600815, "grad_norm": 0.23047874867916107, "learning_rate": 9.95895006911623e-05, "loss": 1.7436, "step": 30 }, { "epoch": 0.04752206381534284, "grad_norm": 0.2215753197669983, "learning_rate": 9.935908917072252e-05, "loss": 1.7594, "step": 35 }, { "epoch": 0.05431093007467753, "grad_norm": 0.25571998953819275, "learning_rate": 9.907795784955327e-05, "loss": 1.6957, "step": 40 }, { "epoch": 0.06109979633401222, "grad_norm": 0.22950182855129242, "learning_rate": 9.874639560909117e-05, "loss": 1.6263, "step": 45 }, { "epoch": 0.06788866259334692, "grad_norm": 0.24856191873550415, "learning_rate": 9.836474315195147e-05, "loss": 1.6548, "step": 50 }, { "epoch": 0.0746775288526816, "grad_norm": 0.23954300582408905, "learning_rate": 9.793339265183303e-05, "loss": 1.6122, "step": 55 }, { "epoch": 0.0814663951120163, "grad_norm": 0.21579493582248688, "learning_rate": 9.745278735053343e-05, "loss": 1.6289, "step": 60 }, { "epoch": 0.08825526137135098, "grad_norm": 0.2218601107597351, "learning_rate": 9.692342110248802e-05, "loss": 1.6328, "step": 65 }, { "epoch": 0.09504412763068568, "grad_norm": 0.23692508041858673, "learning_rate": 9.63458378673011e-05, "loss": 1.5982, "step": 70 }, { "epoch": 0.10183299389002037, "grad_norm": 0.1912090927362442, "learning_rate": 9.572063115079063e-05, "loss": 1.6324, "step": 75 }, { "epoch": 0.10862186014935506, "grad_norm": 0.23047535121440887, "learning_rate": 9.504844339512095e-05, "loss": 1.4911, "step": 80 }, { "epoch": 0.11541072640868975, "grad_norm": 0.20515207946300507, "learning_rate": 9.432996531865002e-05, "loss": 1.5588, "step": 85 }, { "epoch": 0.12219959266802444, "grad_norm": 0.21330514550209045, "learning_rate": 9.356593520616948e-05, "loss": 1.4904, "step": 90 }, { "epoch": 0.12898845892735913, "grad_norm": 0.23363173007965088, "learning_rate": 9.275713815026731e-05, "loss": 1.6034, "step": 95 }, { "epoch": 0.13577732518669383, "grad_norm": 0.17454369366168976, "learning_rate": 9.190440524459203e-05, "loss": 1.5246, "step": 100 }, { "epoch": 0.1425661914460285, "grad_norm": 0.18648678064346313, "learning_rate": 9.10086127298478e-05, "loss": 1.5047, "step": 105 }, { "epoch": 0.1493550577053632, "grad_norm": 0.19136574864387512, "learning_rate": 9.007068109339784e-05, "loss": 1.5417, "step": 110 }, { "epoch": 0.1561439239646979, "grad_norm": 0.22987370193004608, "learning_rate": 8.90915741234015e-05, "loss": 1.5261, "step": 115 }, { "epoch": 0.1629327902240326, "grad_norm": 
0.20558615028858185, "learning_rate": 8.807229791845673e-05, "loss": 1.5108, "step": 120 }, { "epoch": 0.1697216564833673, "grad_norm": 0.22430284321308136, "learning_rate": 8.701389985376578e-05, "loss": 1.533, "step": 125 }, { "epoch": 0.1697216564833673, "eval_loss": 1.5494850873947144, "eval_runtime": 7.7808, "eval_samples_per_second": 159.495, "eval_steps_per_second": 20.049, "step": 125 }, { "epoch": 0.17651052274270196, "grad_norm": 0.20106786489486694, "learning_rate": 8.591746750488639e-05, "loss": 1.538, "step": 130 }, { "epoch": 0.18329938900203666, "grad_norm": 0.21852199733257294, "learning_rate": 8.478412753017433e-05, "loss": 1.5041, "step": 135 }, { "epoch": 0.19008825526137135, "grad_norm": 0.21438615024089813, "learning_rate": 8.361504451306585e-05, "loss": 1.4955, "step": 140 }, { "epoch": 0.19687712152070605, "grad_norm": 0.26983126997947693, "learning_rate": 8.241141976538943e-05, "loss": 1.5475, "step": 145 }, { "epoch": 0.20366598778004075, "grad_norm": 0.21069645881652832, "learning_rate": 8.117449009293668e-05, "loss": 1.5053, "step": 150 }, { "epoch": 0.21045485403937542, "grad_norm": 0.19532069563865662, "learning_rate": 7.990552652456081e-05, "loss": 1.5054, "step": 155 }, { "epoch": 0.2172437202987101, "grad_norm": 0.24983985722064972, "learning_rate": 7.860583300610849e-05, "loss": 1.4672, "step": 160 }, { "epoch": 0.2240325865580448, "grad_norm": 0.23071178793907166, "learning_rate": 7.727674506052743e-05, "loss": 1.4741, "step": 165 }, { "epoch": 0.2308214528173795, "grad_norm": 0.22894631326198578, "learning_rate": 7.591962841552627e-05, "loss": 1.5716, "step": 170 }, { "epoch": 0.23761031907671418, "grad_norm": 0.2204715609550476, "learning_rate": 7.45358776001969e-05, "loss": 1.4912, "step": 175 }, { "epoch": 0.24439918533604887, "grad_norm": 0.20580367743968964, "learning_rate": 7.312691451204178e-05, "loss": 1.5067, "step": 180 }, { "epoch": 0.25118805159538354, "grad_norm": 0.22472696006298065, "learning_rate": 7.169418695587791e-05, "loss": 1.5726, "step": 185 }, { "epoch": 0.25797691785471827, "grad_norm": 0.27027320861816406, "learning_rate": 7.023916715611969e-05, "loss": 1.4581, "step": 190 }, { "epoch": 0.26476578411405294, "grad_norm": 0.23604366183280945, "learning_rate": 6.876335024396872e-05, "loss": 1.471, "step": 195 }, { "epoch": 0.27155465037338766, "grad_norm": 0.2352292388677597, "learning_rate": 6.726825272106538e-05, "loss": 1.4378, "step": 200 }, { "epoch": 0.27834351663272233, "grad_norm": 0.2463415563106537, "learning_rate": 6.575541090118105e-05, "loss": 1.4849, "step": 205 }, { "epoch": 0.285132382892057, "grad_norm": 0.2296827733516693, "learning_rate": 6.422637933155162e-05, "loss": 1.3912, "step": 210 }, { "epoch": 0.2919212491513917, "grad_norm": 0.2640281915664673, "learning_rate": 6.268272919547537e-05, "loss": 1.4078, "step": 215 }, { "epoch": 0.2987101154107264, "grad_norm": 0.31660768389701843, "learning_rate": 6.112604669781572e-05, "loss": 1.4462, "step": 220 }, { "epoch": 0.3054989816700611, "grad_norm": 0.22085396945476532, "learning_rate": 5.955793143506863e-05, "loss": 1.4574, "step": 225 }, { "epoch": 0.3122878479293958, "grad_norm": 0.24354016780853271, "learning_rate": 5.7979994751668964e-05, "loss": 1.5296, "step": 230 }, { "epoch": 0.31907671418873046, "grad_norm": 0.24986490607261658, "learning_rate": 5.6393858084225305e-05, "loss": 1.4725, "step": 235 }, { "epoch": 0.3258655804480652, "grad_norm": 0.2531109154224396, "learning_rate": 5.480115129538409e-05, "loss": 1.4542, "step": 240 }, { "epoch": 
0.33265444670739985, "grad_norm": 0.19950221478939056, "learning_rate": 5.320351099903565e-05, "loss": 1.4582, "step": 245 }, { "epoch": 0.3394433129667346, "grad_norm": 0.2342836707830429, "learning_rate": 5.1602578878582776e-05, "loss": 1.4385, "step": 250 }, { "epoch": 0.3394433129667346, "eval_loss": 1.4789527654647827, "eval_runtime": 7.722, "eval_samples_per_second": 160.71, "eval_steps_per_second": 20.202, "step": 250 }, { "epoch": 0.34623217922606925, "grad_norm": 0.22694501280784607, "learning_rate": 5e-05, "loss": 1.4285, "step": 255 }, { "epoch": 0.3530210454854039, "grad_norm": 0.23925504088401794, "learning_rate": 4.839742112141724e-05, "loss": 1.4588, "step": 260 }, { "epoch": 0.35980991174473864, "grad_norm": 0.2735687494277954, "learning_rate": 4.679648900096436e-05, "loss": 1.4691, "step": 265 }, { "epoch": 0.3665987780040733, "grad_norm": 0.29210373759269714, "learning_rate": 4.5198848704615914e-05, "loss": 1.441, "step": 270 }, { "epoch": 0.37338764426340804, "grad_norm": 0.2414471060037613, "learning_rate": 4.3606141915774693e-05, "loss": 1.4356, "step": 275 }, { "epoch": 0.3801765105227427, "grad_norm": 0.23838582634925842, "learning_rate": 4.2020005248331054e-05, "loss": 1.4412, "step": 280 }, { "epoch": 0.3869653767820774, "grad_norm": 0.22352631390094757, "learning_rate": 4.04420685649314e-05, "loss": 1.459, "step": 285 }, { "epoch": 0.3937542430414121, "grad_norm": 0.25490501523017883, "learning_rate": 3.887395330218429e-05, "loss": 1.4208, "step": 290 }, { "epoch": 0.40054310930074677, "grad_norm": 0.26434803009033203, "learning_rate": 3.731727080452464e-05, "loss": 1.4264, "step": 295 }, { "epoch": 0.4073319755600815, "grad_norm": 0.23617196083068848, "learning_rate": 3.5773620668448384e-05, "loss": 1.3935, "step": 300 }, { "epoch": 0.41412084181941616, "grad_norm": 0.23081578314304352, "learning_rate": 3.424458909881897e-05, "loss": 1.4113, "step": 305 }, { "epoch": 0.42090970807875083, "grad_norm": 0.24254988133907318, "learning_rate": 3.273174727893463e-05, "loss": 1.4908, "step": 310 }, { "epoch": 0.42769857433808556, "grad_norm": 0.21009749174118042, "learning_rate": 3.12366497560313e-05, "loss": 1.4607, "step": 315 }, { "epoch": 0.4344874405974202, "grad_norm": 0.2565441131591797, "learning_rate": 2.976083284388031e-05, "loss": 1.4664, "step": 320 }, { "epoch": 0.4412763068567549, "grad_norm": 0.2448188215494156, "learning_rate": 2.8305813044122097e-05, "loss": 1.4369, "step": 325 }, { "epoch": 0.4480651731160896, "grad_norm": 0.19102215766906738, "learning_rate": 2.687308548795825e-05, "loss": 1.3941, "step": 330 }, { "epoch": 0.4548540393754243, "grad_norm": 0.23744595050811768, "learning_rate": 2.5464122399803125e-05, "loss": 1.4053, "step": 335 }, { "epoch": 0.461642905634759, "grad_norm": 0.25465700030326843, "learning_rate": 2.4080371584473748e-05, "loss": 1.4054, "step": 340 }, { "epoch": 0.4684317718940937, "grad_norm": 0.26104259490966797, "learning_rate": 2.272325493947257e-05, "loss": 1.4511, "step": 345 }, { "epoch": 0.47522063815342835, "grad_norm": 0.22542956471443176, "learning_rate": 2.139416699389153e-05, "loss": 1.4673, "step": 350 }, { "epoch": 0.4820095044127631, "grad_norm": 0.2605678141117096, "learning_rate": 2.0094473475439202e-05, "loss": 1.3932, "step": 355 }, { "epoch": 0.48879837067209775, "grad_norm": 0.24848726391792297, "learning_rate": 1.8825509907063327e-05, "loss": 1.4204, "step": 360 }, { "epoch": 0.4955872369314325, "grad_norm": 0.2906915843486786, "learning_rate": 1.758858023461059e-05, "loss": 1.3734, "step": 365 }, { 
"epoch": 0.5023761031907671, "grad_norm": 0.25691258907318115, "learning_rate": 1.6384955486934156e-05, "loss": 1.4086, "step": 370 }, { "epoch": 0.5091649694501018, "grad_norm": 0.2475077509880066, "learning_rate": 1.5215872469825682e-05, "loss": 1.4393, "step": 375 }, { "epoch": 0.5091649694501018, "eval_loss": 1.4588358402252197, "eval_runtime": 7.7286, "eval_samples_per_second": 160.573, "eval_steps_per_second": 20.185, "step": 375 }, { "epoch": 0.5159538357094365, "grad_norm": 0.3304094076156616, "learning_rate": 1.4082532495113626e-05, "loss": 1.4469, "step": 380 }, { "epoch": 0.5227427019687713, "grad_norm": 0.2766299545764923, "learning_rate": 1.2986100146234232e-05, "loss": 1.357, "step": 385 }, { "epoch": 0.5295315682281059, "grad_norm": 0.29647424817085266, "learning_rate": 1.1927702081543279e-05, "loss": 1.47, "step": 390 }, { "epoch": 0.5363204344874406, "grad_norm": 0.2745124399662018, "learning_rate": 1.090842587659851e-05, "loss": 1.398, "step": 395 }, { "epoch": 0.5431093007467753, "grad_norm": 0.2885884642601013, "learning_rate": 9.929318906602175e-06, "loss": 1.4597, "step": 400 }, { "epoch": 0.5498981670061099, "grad_norm": 0.24886402487754822, "learning_rate": 8.991387270152201e-06, "loss": 1.4669, "step": 405 }, { "epoch": 0.5566870332654447, "grad_norm": 0.2334742695093155, "learning_rate": 8.09559475540797e-06, "loss": 1.4107, "step": 410 }, { "epoch": 0.5634758995247794, "grad_norm": 0.27737849950790405, "learning_rate": 7.242861849732696e-06, "loss": 1.5048, "step": 415 }, { "epoch": 0.570264765784114, "grad_norm": 0.23328500986099243, "learning_rate": 6.43406479383053e-06, "loss": 1.4487, "step": 420 }, { "epoch": 0.5770536320434487, "grad_norm": 0.32235902547836304, "learning_rate": 5.670034681349995e-06, "loss": 1.4791, "step": 425 }, { "epoch": 0.5838424983027835, "grad_norm": 0.3034408688545227, "learning_rate": 4.951556604879048e-06, "loss": 1.4643, "step": 430 }, { "epoch": 0.5906313645621182, "grad_norm": 0.24768030643463135, "learning_rate": 4.279368849209381e-06, "loss": 1.4803, "step": 435 }, { "epoch": 0.5974202308214528, "grad_norm": 0.23609156906604767, "learning_rate": 3.654162132698918e-06, "loss": 1.3877, "step": 440 }, { "epoch": 0.6042090970807875, "grad_norm": 0.25662508606910706, "learning_rate": 3.076578897511978e-06, "loss": 1.4758, "step": 445 }, { "epoch": 0.6109979633401222, "grad_norm": 0.24194221198558807, "learning_rate": 2.547212649466568e-06, "loss": 1.4027, "step": 450 }, { "epoch": 0.6177868295994569, "grad_norm": 0.3545863628387451, "learning_rate": 2.066607348166971e-06, "loss": 1.4078, "step": 455 }, { "epoch": 0.6245756958587916, "grad_norm": 0.2740108072757721, "learning_rate": 1.6352568480485276e-06, "loss": 1.4463, "step": 460 }, { "epoch": 0.6313645621181263, "grad_norm": 0.2730073630809784, "learning_rate": 1.2536043909088191e-06, "loss": 1.4089, "step": 465 }, { "epoch": 0.6381534283774609, "grad_norm": 0.2379709631204605, "learning_rate": 9.220421504467281e-07, "loss": 1.4428, "step": 470 }, { "epoch": 0.6449422946367956, "grad_norm": 0.2579050362110138, "learning_rate": 6.409108292774913e-07, "loss": 1.4112, "step": 475 }, { "epoch": 0.6517311608961304, "grad_norm": 0.2765941917896271, "learning_rate": 4.104993088376974e-07, "loss": 1.4722, "step": 480 }, { "epoch": 0.658520027155465, "grad_norm": 0.27427244186401367, "learning_rate": 2.310443525400885e-07, "loss": 1.3907, "step": 485 }, { "epoch": 0.6653088934147997, "grad_norm": 0.2611384689807892, "learning_rate": 1.0273036248318324e-07, "loss": 1.4292, "step": 490 
}, { "epoch": 0.6720977596741344, "grad_norm": 0.2796425223350525, "learning_rate": 2.568918996560532e-08, "loss": 1.4264, "step": 495 }, { "epoch": 0.6788866259334692, "grad_norm": 0.21907183527946472, "learning_rate": 0.0, "loss": 1.4451, "step": 500 }, { "epoch": 0.6788866259334692, "eval_loss": 1.4559390544891357, "eval_runtime": 7.7312, "eval_samples_per_second": 160.519, "eval_steps_per_second": 20.178, "step": 500 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0682976996163584e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }