{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998682013882787, "eval_steps": 427, "global_step": 4266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030225814954749144, "grad_norm": 8.637994766235352, "learning_rate": 4.777802058246361e-05, "loss": 0.7443, "step": 43 }, { "epoch": 0.06045162990949829, "grad_norm": 8.726279258728027, "learning_rate": 4.77420943412748e-05, "loss": 0.3904, "step": 86 }, { "epoch": 0.09067744486424743, "grad_norm": 18.751718521118164, "learning_rate": 4.768225729865107e-05, "loss": 0.2274, "step": 129 }, { "epoch": 0.12090325981899658, "grad_norm": 11.341946601867676, "learning_rate": 4.759856945148865e-05, "loss": 0.2032, "step": 172 }, { "epoch": 0.1511290747737457, "grad_norm": 6.5374932289123535, "learning_rate": 4.749111471120532e-05, "loss": 0.1546, "step": 215 }, { "epoch": 0.18135488972849487, "grad_norm": 7.3821539878845215, "learning_rate": 4.7360000819604815e-05, "loss": 0.177, "step": 258 }, { "epoch": 0.211580704683244, "grad_norm": 4.396503448486328, "learning_rate": 4.7205359240847186e-05, "loss": 0.1611, "step": 301 }, { "epoch": 0.24180651963799316, "grad_norm": 4.967313766479492, "learning_rate": 4.702734502963346e-05, "loss": 0.1408, "step": 344 }, { "epoch": 0.2720323345927423, "grad_norm": 9.22049331665039, "learning_rate": 4.6826136675736696e-05, "loss": 0.1235, "step": 387 }, { "epoch": 0.3001493717599508, "eval_accuracy": 0.9654081834354615, "eval_loss": 0.12473645806312561, "eval_precision": 0.9140316205533597, "eval_runtime": 42.4722, "eval_samples_per_second": 119.113, "eval_steps_per_second": 29.784, "step": 427 }, { "epoch": 0.3022581495474914, "grad_norm": 1.0973700284957886, "learning_rate": 4.660193592503536e-05, "loss": 0.1789, "step": 430 }, { "epoch": 0.3324839645022406, "grad_norm": 12.724760055541992, "learning_rate": 4.635496757722852e-05, "loss": 0.1526, "step": 473 }, { "epoch": 0.36270977945698973, "grad_norm": 0.692093014717102, "learning_rate": 4.60854792604356e-05, "loss": 0.0998, "step": 516 }, { "epoch": 0.39293559441173886, "grad_norm": 8.747737884521484, "learning_rate": 4.579374118290669e-05, "loss": 0.1417, "step": 559 }, { "epoch": 0.423161409366488, "grad_norm": 3.383861541748047, "learning_rate": 4.548004586209246e-05, "loss": 0.1416, "step": 602 }, { "epoch": 0.4533872243212371, "grad_norm": 2.690746545791626, "learning_rate": 4.514470783134528e-05, "loss": 0.1242, "step": 645 }, { "epoch": 0.4836130392759863, "grad_norm": 4.980698585510254, "learning_rate": 4.4788063324545544e-05, "loss": 0.0938, "step": 688 }, { "epoch": 0.5138388542307354, "grad_norm": 3.457531452178955, "learning_rate": 4.441046993896959e-05, "loss": 0.113, "step": 731 }, { "epoch": 0.5440646691854846, "grad_norm": 0.17746348679065704, "learning_rate": 4.40123062767371e-05, "loss": 0.086, "step": 774 }, { "epoch": 0.5742904841402338, "grad_norm": 1.7960216999053955, "learning_rate": 4.359397156519751e-05, "loss": 0.085, "step": 817 }, { "epoch": 0.6002987435199016, "eval_accuracy": 0.9703498715161099, "eval_loss": 0.190622478723526, "eval_precision": 0.918525703200776, "eval_runtime": 51.3537, "eval_samples_per_second": 98.513, "eval_steps_per_second": 24.633, "step": 854 }, { "epoch": 0.6045162990949828, "grad_norm": 0.05534067377448082, "learning_rate": 4.315588525663622e-05, "loss": 0.1477, "step": 860 }, { "epoch": 0.634742114049732, "grad_norm": 5.788769245147705, "learning_rate": 4.269848660770166e-05, "loss": 0.1854, "step": 903 }, { "epoch": 0.6649679290044812, "grad_norm": 4.581574440002441, "learning_rate": 4.222223423897522e-05, "loss": 0.0987, "step": 946 }, { "epoch": 0.6951937439592303, "grad_norm": 0.24250301718711853, "learning_rate": 4.1727605675125466e-05, "loss": 0.1237, "step": 989 }, { "epoch": 0.7254195589139795, "grad_norm": 6.998917579650879, "learning_rate": 4.1215096866107764e-05, "loss": 0.1304, "step": 1032 }, { "epoch": 0.7556453738687285, "grad_norm": 9.230533599853516, "learning_rate": 4.068522168988941e-05, "loss": 0.1212, "step": 1075 }, { "epoch": 0.7858711888234777, "grad_norm": 0.515550971031189, "learning_rate": 4.013851143719886e-05, "loss": 0.0942, "step": 1118 }, { "epoch": 0.8160970037782269, "grad_norm": 14.719630241394043, "learning_rate": 3.9575514278815625e-05, "loss": 0.1182, "step": 1161 }, { "epoch": 0.846322818732976, "grad_norm": 1.9909157752990723, "learning_rate": 3.899679471593512e-05, "loss": 0.099, "step": 1204 }, { "epoch": 0.8765486336877252, "grad_norm": 1.0096828937530518, "learning_rate": 3.840293301415938e-05, "loss": 0.1158, "step": 1247 }, { "epoch": 0.9004481152798524, "eval_accuracy": 0.9756868946432101, "eval_loss": 0.10856811702251434, "eval_precision": 0.941468253968254, "eval_runtime": 41.486, "eval_samples_per_second": 121.945, "eval_steps_per_second": 30.492, "step": 1281 }, { "epoch": 0.9067744486424743, "grad_norm": 0.669986367225647, "learning_rate": 3.779452462168126e-05, "loss": 0.101, "step": 1290 }, { "epoch": 0.9370002635972234, "grad_norm": 0.9615168571472168, "learning_rate": 3.7172179572245585e-05, "loss": 0.1166, "step": 1333 }, { "epoch": 0.9672260785519726, "grad_norm": 13.886622428894043, "learning_rate": 3.6536521873485673e-05, "loss": 0.1308, "step": 1376 }, { "epoch": 0.9974518935067217, "grad_norm": 0.23971609771251678, "learning_rate": 3.588818888124863e-05, "loss": 0.1266, "step": 1419 }, { "epoch": 1.0276777084614708, "grad_norm": 4.089532375335693, "learning_rate": 3.5227830660536916e-05, "loss": 0.0844, "step": 1462 }, { "epoch": 1.05790352341622, "grad_norm": 6.12522554397583, "learning_rate": 3.455610933370663e-05, "loss": 0.0843, "step": 1505 }, { "epoch": 1.0881293383709691, "grad_norm": 0.24417218565940857, "learning_rate": 3.3873698416576314e-05, "loss": 0.0825, "step": 1548 }, { "epoch": 1.1183551533257183, "grad_norm": 5.748846054077148, "learning_rate": 3.3181282143111886e-05, "loss": 0.0885, "step": 1591 }, { "epoch": 1.1485809682804675, "grad_norm": 0.8179301619529724, "learning_rate": 3.247955477936471e-05, "loss": 0.0882, "step": 1634 }, { "epoch": 1.1788067832352165, "grad_norm": 0.9235413670539856, "learning_rate": 3.176921992735089e-05, "loss": 0.0859, "step": 1677 }, { "epoch": 1.2005974870398033, "eval_accuracy": 0.9756868946432101, "eval_loss": 0.09278497099876404, "eval_precision": 0.9550102249488752, "eval_runtime": 41.4152, "eval_samples_per_second": 122.153, "eval_steps_per_second": 30.544, "step": 1708 }, { "epoch": 1.2090325981899657, "grad_norm": 0.0459032878279686, "learning_rate": 3.1050989819569484e-05, "loss": 0.0603, "step": 1720 }, { "epoch": 1.2392584131447149, "grad_norm": 2.4066104888916016, "learning_rate": 3.0325584604867403e-05, "loss": 0.071, "step": 1763 }, { "epoch": 1.269484228099464, "grad_norm": 9.008522987365723, "learning_rate": 2.9593731626366644e-05, "loss": 0.0678, "step": 1806 }, { "epoch": 1.2997100430542132, "grad_norm": 6.748093605041504, "learning_rate": 2.8856164692178087e-05, "loss": 0.1068, "step": 1849 }, { "epoch": 1.3299358580089624, "grad_norm": 0.8588669300079346, "learning_rate": 2.8113623339633016e-05, "loss": 0.0835, "step": 1892 }, { "epoch": 1.3601616729637114, "grad_norm": 2.512568235397339, "learning_rate": 2.736685209377016e-05, "loss": 0.0368, "step": 1935 }, { "epoch": 1.3903874879184606, "grad_norm": 3.670646905899048, "learning_rate": 2.661659972082166e-05, "loss": 0.0795, "step": 1978 }, { "epoch": 1.4206133028732097, "grad_norm": 0.5933941602706909, "learning_rate": 2.5863618477446485e-05, "loss": 0.0615, "step": 2021 }, { "epoch": 1.450839117827959, "grad_norm": 0.8403681516647339, "learning_rate": 2.5108663356464178e-05, "loss": 0.087, "step": 2064 }, { "epoch": 1.481064932782708, "grad_norm": 2.5098040103912354, "learning_rate": 2.4352491329845075e-05, "loss": 0.0821, "step": 2107 }, { "epoch": 1.500746858799754, "eval_accuracy": 0.9764775647361138, "eval_loss": 0.09056767076253891, "eval_precision": 0.935672514619883, "eval_runtime": 41.6907, "eval_samples_per_second": 121.346, "eval_steps_per_second": 30.342, "step": 2135 }, { "epoch": 1.511290747737457, "grad_norm": 8.705094337463379, "learning_rate": 2.3595860589716064e-05, "loss": 0.0714, "step": 2150 }, { "epoch": 1.5415165626922063, "grad_norm": 0.13956770300865173, "learning_rate": 2.2839529788143032e-05, "loss": 0.0696, "step": 2193 }, { "epoch": 1.5717423776469555, "grad_norm": 5.737203121185303, "learning_rate": 2.208425727645198e-05, "loss": 0.088, "step": 2236 }, { "epoch": 1.6019681926017046, "grad_norm": 2.958165407180786, "learning_rate": 2.1330800344851852e-05, "loss": 0.0797, "step": 2279 }, { "epoch": 1.6321940075564538, "grad_norm": 3.428828477859497, "learning_rate": 2.0579914463121218e-05, "loss": 0.0564, "step": 2322 }, { "epoch": 1.662419822511203, "grad_norm": 1.1754554510116577, "learning_rate": 1.983235252312024e-05, "loss": 0.0611, "step": 2365 }, { "epoch": 1.692645637465952, "grad_norm": 6.087173938751221, "learning_rate": 1.9088864083887505e-05, "loss": 0.0865, "step": 2408 }, { "epoch": 1.7228714524207012, "grad_norm": 3.829490900039673, "learning_rate": 1.835019462007857e-05, "loss": 0.0882, "step": 2451 }, { "epoch": 1.7530972673754504, "grad_norm": 7.614952087402344, "learning_rate": 1.761708477449973e-05, "loss": 0.0445, "step": 2494 }, { "epoch": 1.7833230823301993, "grad_norm": 35.23410415649414, "learning_rate": 1.689026961548663e-05, "loss": 0.0593, "step": 2537 }, { "epoch": 1.800896230559705, "eval_accuracy": 0.9794425775845028, "eval_loss": 0.07280829548835754, "eval_precision": 0.9513406156901688, "eval_runtime": 41.7842, "eval_samples_per_second": 121.075, "eval_steps_per_second": 30.275, "step": 2562 }, { "epoch": 1.8135488972849485, "grad_norm": 0.42250823974609375, "learning_rate": 1.6170477899872236e-05, "loss": 0.0682, "step": 2580 }, { "epoch": 1.8437747122396977, "grad_norm": 3.1888535022735596, "learning_rate": 1.5458431342283072e-05, "loss": 0.1021, "step": 2623 }, { "epoch": 1.8740005271944469, "grad_norm": 0.11506301164627075, "learning_rate": 1.4754843891496589e-05, "loss": 0.0655, "step": 2666 }, { "epoch": 1.904226342149196, "grad_norm": 0.1821933388710022, "learning_rate": 1.4060421014585115e-05, "loss": 0.0718, "step": 2709 }, { "epoch": 1.9344521571039452, "grad_norm": 4.954631805419922, "learning_rate": 1.3375858989564059e-05, "loss": 0.0727, "step": 2752 }, { "epoch": 1.9646779720586944, "grad_norm": 3.8650524616241455, "learning_rate": 1.270184420725387e-05, "loss": 0.0794, "step": 2795 }, { "epoch": 1.9949037870134436, "grad_norm": 1.1344525814056396, "learning_rate": 1.203905248305552e-05, "loss": 0.0832, "step": 2838 }, { "epoch": 2.025129601968193, "grad_norm": 0.8010286688804626, "learning_rate": 1.1388148379329648e-05, "loss": 0.0455, "step": 2881 }, { "epoch": 2.0553554169229415, "grad_norm": 0.04408084228634834, "learning_rate": 1.074978453905885e-05, "loss": 0.0328, "step": 2924 }, { "epoch": 2.0855812318776907, "grad_norm": 0.044715262949466705, "learning_rate": 1.0124601031461207e-05, "loss": 0.0527, "step": 2967 }, { "epoch": 2.1010456023196555, "eval_accuracy": 0.9774659023522435, "eval_loss": 0.08738358318805695, "eval_precision": 0.9481555333998006, "eval_runtime": 41.7468, "eval_samples_per_second": 121.183, "eval_steps_per_second": 30.302, "step": 2989 }, { "epoch": 2.11580704683244, "grad_norm": 0.9349909424781799, "learning_rate": 9.513224710211058e-06, "loss": 0.0401, "step": 3010 }, { "epoch": 2.146032861787189, "grad_norm": 9.170902252197266, "learning_rate": 8.91626858491075e-06, "loss": 0.0347, "step": 3053 }, { "epoch": 2.1762586767419383, "grad_norm": 0.14816665649414062, "learning_rate": 8.334331206443409e-06, "loss": 0.0252, "step": 3096 }, { "epoch": 2.2064844916966875, "grad_norm": 0.42291557788848877, "learning_rate": 7.767996066822973e-06, "loss": 0.054, "step": 3139 }, { "epoch": 2.2367103066514367, "grad_norm": 0.06259565055370331, "learning_rate": 7.2178310141434304e-06, "loss": 0.0534, "step": 3182 }, { "epoch": 2.266936121606186, "grad_norm": 0.24870078265666962, "learning_rate": 6.684387683213754e-06, "loss": 0.0706, "step": 3225 }, { "epoch": 2.297161936560935, "grad_norm": 4.754918098449707, "learning_rate": 6.168200942449347e-06, "loss": 0.0439, "step": 3268 }, { "epoch": 2.3273877515156842, "grad_norm": 0.26877379417419434, "learning_rate": 5.669788357574799e-06, "loss": 0.0586, "step": 3311 }, { "epoch": 2.357613566470433, "grad_norm": 0.19906023144721985, "learning_rate": 5.189649672675441e-06, "loss": 0.0324, "step": 3354 }, { "epoch": 2.387839381425182, "grad_norm": 0.01543935015797615, "learning_rate": 4.728266309118228e-06, "loss": 0.0263, "step": 3397 }, { "epoch": 2.4011949740796066, "eval_accuracy": 0.9794425775845028, "eval_loss": 0.0934256836771965, "eval_precision": 0.954045954045954, "eval_runtime": 41.7334, "eval_samples_per_second": 121.222, "eval_steps_per_second": 30.311, "step": 3416 }, { "epoch": 2.4180651963799313, "grad_norm": 23.245332717895508, "learning_rate": 4.286100882844285e-06, "loss": 0.029, "step": 3440 }, { "epoch": 2.4482910113346805, "grad_norm": 0.19245454668998718, "learning_rate": 3.863596740517041e-06, "loss": 0.0473, "step": 3483 }, { "epoch": 2.4785168262894297, "grad_norm": 0.045882448554039, "learning_rate": 3.461177514991228e-06, "loss": 0.0393, "step": 3526 }, { "epoch": 2.508742641244179, "grad_norm": 0.06130468472838402, "learning_rate": 3.079246700548261e-06, "loss": 0.0425, "step": 3569 }, { "epoch": 2.538968456198928, "grad_norm": 0.34582972526550293, "learning_rate": 2.718187248324039e-06, "loss": 0.0349, "step": 3612 }, { "epoch": 2.5691942711536773, "grad_norm": 1.3404638767242432, "learning_rate": 2.37836118233478e-06, "loss": 0.0362, "step": 3655 }, { "epoch": 2.5994200861084265, "grad_norm": 0.06872310489416122, "learning_rate": 2.060109236485834e-06, "loss": 0.0612, "step": 3698 }, { "epoch": 2.629645901063175, "grad_norm": 2.3359293937683105, "learning_rate": 1.7637505129275284e-06, "loss": 0.048, "step": 3741 }, { "epoch": 2.659871716017925, "grad_norm": 5.89841365814209, "learning_rate": 1.489582162100559e-06, "loss": 0.0357, "step": 3784 }, { "epoch": 2.6900975309726736, "grad_norm": 0.1834953874349594, "learning_rate": 1.2378790847916738e-06, "loss": 0.0566, "step": 3827 }, { "epoch": 2.701344345839557, "eval_accuracy": 0.9788495750148251, "eval_loss": 0.08470374345779419, "eval_precision": 0.9548192771084337, "eval_runtime": 41.7346, "eval_samples_per_second": 121.218, "eval_steps_per_second": 30.311, "step": 3843 }, { "epoch": 2.7203233459274228, "grad_norm": 6.001282215118408, "learning_rate": 1.008893656498535e-06, "loss": 0.0427, "step": 3870 }, { "epoch": 2.750549160882172, "grad_norm": 6.217470169067383, "learning_rate": 8.028554743799881e-07, "loss": 0.0367, "step": 3913 }, { "epoch": 2.780774975836921, "grad_norm": 4.40556526184082, "learning_rate": 6.199711270455366e-07, "loss": 0.0317, "step": 3956 }, { "epoch": 2.8110007907916703, "grad_norm": 0.33150240778923035, "learning_rate": 4.604239874148275e-07, "loss": 0.0429, "step": 3999 }, { "epoch": 2.8412266057464195, "grad_norm": 5.160287857055664, "learning_rate": 3.2437402885488323e-07, "loss": 0.0316, "step": 4042 }, { "epoch": 2.8714524207011687, "grad_norm": 0.10348004102706909, "learning_rate": 2.1195766477933204e-07, "loss": 0.032, "step": 4085 }, { "epoch": 2.901678235655918, "grad_norm": 0.10884183645248413, "learning_rate": 1.2328761187059095e-07, "loss": 0.0151, "step": 4128 }, { "epoch": 2.931904050610667, "grad_norm": 5.601053714752197, "learning_rate": 5.8452777062061706e-08, "loss": 0.0296, "step": 4171 }, { "epoch": 2.962129865565416, "grad_norm": 0.09085728973150253, "learning_rate": 1.7518168393672898e-08, "loss": 0.0384, "step": 4214 }, { "epoch": 2.9923556805201654, "grad_norm": 0.4429589807987213, "learning_rate": 5.248298301814908e-10, "loss": 0.0467, "step": 4257 } ], "logging_steps": 43, "max_steps": 4266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.964461063575552e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }