{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.567398119122257, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001567398119122257, "grad_norm": 0.5552594661712646, "learning_rate": 3.448275862068966e-06, "loss": 3.3921, "step": 1 }, { "epoch": 0.01567398119122257, "grad_norm": 0.660531222820282, "learning_rate": 3.4482758620689657e-05, "loss": 3.5922, "step": 10 }, { "epoch": 0.03134796238244514, "grad_norm": 0.5598097443580627, "learning_rate": 6.896551724137931e-05, "loss": 3.619, "step": 20 }, { "epoch": 0.047021943573667714, "grad_norm": 0.5996779799461365, "learning_rate": 0.00010344827586206898, "loss": 3.4824, "step": 30 }, { "epoch": 0.06269592476489028, "grad_norm": 0.8172075152397156, "learning_rate": 0.00013793103448275863, "loss": 3.3997, "step": 40 }, { "epoch": 0.07836990595611286, "grad_norm": 1.6439019441604614, "learning_rate": 0.00017241379310344826, "loss": 3.2485, "step": 50 }, { "epoch": 0.09404388714733543, "grad_norm": 0.28180772066116333, "learning_rate": 0.00019999942697524717, "loss": 2.9975, "step": 60 }, { "epoch": 0.109717868338558, "grad_norm": 0.3870689868927002, "learning_rate": 0.00019997937179843937, "loss": 3.0446, "step": 70 }, { "epoch": 0.12539184952978055, "grad_norm": 0.575548529624939, "learning_rate": 0.00019993067195079803, "loss": 3.0178, "step": 80 }, { "epoch": 0.14106583072100312, "grad_norm": 0.781446635723114, "learning_rate": 0.00019985334138511237, "loss": 3.0394, "step": 90 }, { "epoch": 0.15673981191222572, "grad_norm": 1.725696325302124, "learning_rate": 0.00019974740225703878, "loss": 3.0751, "step": 100 }, { "epoch": 0.1724137931034483, "grad_norm": 0.2845414876937866, "learning_rate": 0.00019961288491875278, "loss": 2.9291, "step": 110 }, { "epoch": 0.18808777429467086, "grad_norm": 0.36810651421546936, "learning_rate": 0.00019944982791025333, "loss": 2.9491, "step": 120 }, { "epoch": 0.20376175548589343, "grad_norm": 0.5454439520835876, "learning_rate": 0.00019925827794832056, "loss": 3.0337, "step": 130 }, { "epoch": 0.219435736677116, "grad_norm": 0.6503669619560242, "learning_rate": 0.00019903828991313138, "loss": 3.0246, "step": 140 }, { "epoch": 0.23510971786833856, "grad_norm": 1.4392451047897339, "learning_rate": 0.00019878992683253582, "loss": 3.0232, "step": 150 }, { "epoch": 0.2507836990595611, "grad_norm": 0.2859440743923187, "learning_rate": 0.00019851325986399934, "loss": 2.8955, "step": 160 }, { "epoch": 0.2664576802507837, "grad_norm": 0.44268104434013367, "learning_rate": 0.0001982083682742156, "loss": 2.9338, "step": 170 }, { "epoch": 0.28213166144200624, "grad_norm": 0.5128395557403564, "learning_rate": 0.00019787533941639638, "loss": 3.0089, "step": 180 }, { "epoch": 0.29780564263322884, "grad_norm": 0.7328920364379883, "learning_rate": 0.00019751426870524407, "loss": 3.0157, "step": 190 }, { "epoch": 0.31347962382445144, "grad_norm": 1.5265012979507446, "learning_rate": 0.000197125259589615, "loss": 2.9007, "step": 200 }, { "epoch": 0.329153605015674, "grad_norm": 0.2766813635826111, "learning_rate": 0.0001967084235228807, "loss": 2.8275, "step": 210 }, { "epoch": 0.3448275862068966, "grad_norm": 0.36695396900177, "learning_rate": 0.00019626387993099579, "loss": 2.9158, "step": 220 }, { "epoch": 0.3605015673981191, "grad_norm": 0.5359162092208862, "learning_rate": 0.00019579175617828187, "loss": 2.9465, "step": 230 }, { "epoch": 0.3761755485893417, "grad_norm": 0.6529833674430847, "learning_rate": 0.0001952921875309368, "loss": 2.981, "step": 240 }, { "epoch": 0.39184952978056425, "grad_norm": 1.5314627885818481, "learning_rate": 0.00019476531711828027, "loss": 2.9737, "step": 250 }, { "epoch": 0.40752351097178685, "grad_norm": 0.2949506342411041, "learning_rate": 0.00019421129589174618, "loss": 2.8208, "step": 260 }, { "epoch": 0.4231974921630094, "grad_norm": 0.39567869901657104, "learning_rate": 0.00019363028258163447, "loss": 2.8557, "step": 270 }, { "epoch": 0.438871473354232, "grad_norm": 0.5587254166603088, "learning_rate": 0.00019302244365163376, "loss": 2.9494, "step": 280 }, { "epoch": 0.45454545454545453, "grad_norm": 0.7218978404998779, "learning_rate": 0.0001923879532511287, "loss": 2.9742, "step": 290 }, { "epoch": 0.4702194357366771, "grad_norm": 1.4482598304748535, "learning_rate": 0.0001917269931653049, "loss": 2.8646, "step": 300 }, { "epoch": 0.48589341692789967, "grad_norm": 0.2901701033115387, "learning_rate": 0.00019103975276306678, "loss": 2.7788, "step": 310 }, { "epoch": 0.5015673981191222, "grad_norm": 0.4310539960861206, "learning_rate": 0.00019032642894278192, "loss": 2.8655, "step": 320 }, { "epoch": 0.5172413793103449, "grad_norm": 0.5589954853057861, "learning_rate": 0.0001895872260758688, "loss": 2.914, "step": 330 }, { "epoch": 0.5329153605015674, "grad_norm": 0.7243526577949524, "learning_rate": 0.00018882235594824308, "loss": 2.9191, "step": 340 }, { "epoch": 0.54858934169279, "grad_norm": 1.4200222492218018, "learning_rate": 0.00018803203769963967, "loss": 2.8128, "step": 350 }, { "epoch": 0.5642633228840125, "grad_norm": 0.26594147086143494, "learning_rate": 0.000187216497760828, "loss": 2.762, "step": 360 }, { "epoch": 0.5799373040752351, "grad_norm": 0.3894258439540863, "learning_rate": 0.00018637596978873835, "loss": 2.9077, "step": 370 }, { "epoch": 0.5956112852664577, "grad_norm": 0.5348561406135559, "learning_rate": 0.00018551069459951758, "loss": 2.9292, "step": 380 }, { "epoch": 0.6112852664576802, "grad_norm": 0.746507465839386, "learning_rate": 0.00018462092009953408, "loss": 2.8795, "step": 390 }, { "epoch": 0.6269592476489029, "grad_norm": 1.5225753784179688, "learning_rate": 0.0001837069012143511, "loss": 2.8263, "step": 400 }, { "epoch": 0.6426332288401254, "grad_norm": 0.26905450224876404, "learning_rate": 0.00018276889981568906, "loss": 2.7218, "step": 410 }, { "epoch": 0.658307210031348, "grad_norm": 0.3912515342235565, "learning_rate": 0.00018180718464639787, "loss": 2.819, "step": 420 }, { "epoch": 0.6739811912225705, "grad_norm": 0.5661373138427734, "learning_rate": 0.00018082203124346045, "loss": 2.8772, "step": 430 }, { "epoch": 0.6896551724137931, "grad_norm": 0.805776059627533, "learning_rate": 0.0001798137218590498, "loss": 2.9562, "step": 440 }, { "epoch": 0.7053291536050157, "grad_norm": 1.4879732131958008, "learning_rate": 0.00017878254537966216, "loss": 2.7925, "step": 450 }, { "epoch": 0.7210031347962382, "grad_norm": 0.28762391209602356, "learning_rate": 0.00017772879724334937, "loss": 2.8006, "step": 460 }, { "epoch": 0.7366771159874608, "grad_norm": 0.41769474744796753, "learning_rate": 0.00017665277935507398, "loss": 2.8148, "step": 470 }, { "epoch": 0.7523510971786834, "grad_norm": 0.633671760559082, "learning_rate": 0.00017555480000021198, "loss": 2.8461, "step": 480 }, { "epoch": 0.768025078369906, "grad_norm": 0.816681444644928, "learning_rate": 0.00017443517375622704, "loss": 2.8826, "step": 490 }, { "epoch": 0.7836990595611285, "grad_norm": 1.3913438320159912, "learning_rate": 0.00017329422140254235, "loss": 2.7449, "step": 500 }, { "epoch": 0.799373040752351, "grad_norm": 0.30519339442253113, "learning_rate": 0.0001721322698286354, "loss": 2.7933, "step": 510 }, { "epoch": 0.8150470219435737, "grad_norm": 0.3911365270614624, "learning_rate": 0.0001709496519403823, "loss": 2.8433, "step": 520 }, { "epoch": 0.8307210031347962, "grad_norm": 0.5197113752365112, "learning_rate": 0.00016974670656467824, "loss": 2.828, "step": 530 }, { "epoch": 0.8463949843260188, "grad_norm": 0.7463776469230652, "learning_rate": 0.00016852377835236166, "loss": 2.9549, "step": 540 }, { "epoch": 0.8620689655172413, "grad_norm": 1.6616039276123047, "learning_rate": 0.00016728121767946977, "loss": 2.8843, "step": 550 }, { "epoch": 0.877742946708464, "grad_norm": 0.2525625228881836, "learning_rate": 0.00016601938054685385, "loss": 2.7715, "step": 560 }, { "epoch": 0.8934169278996865, "grad_norm": 0.4268622100353241, "learning_rate": 0.00016473862847818277, "loss": 2.8196, "step": 570 }, { "epoch": 0.9090909090909091, "grad_norm": 0.5538944005966187, "learning_rate": 0.00016343932841636456, "loss": 2.8619, "step": 580 }, { "epoch": 0.9247648902821317, "grad_norm": 0.7213597297668457, "learning_rate": 0.00016212185261841499, "loss": 2.9276, "step": 590 }, { "epoch": 0.9404388714733543, "grad_norm": 1.6927486658096313, "learning_rate": 0.00016078657854880376, "loss": 2.7446, "step": 600 }, { "epoch": 0.9561128526645768, "grad_norm": 0.3090650141239166, "learning_rate": 0.000159433888771309, "loss": 2.747, "step": 610 }, { "epoch": 0.9717868338557993, "grad_norm": 0.4454885423183441, "learning_rate": 0.00015806417083941002, "loss": 2.8622, "step": 620 }, { "epoch": 0.987460815047022, "grad_norm": 0.8089830279350281, "learning_rate": 0.00015667781718525157, "loss": 2.8568, "step": 630 }, { "epoch": 1.0031347962382444, "grad_norm": 0.21911662817001343, "learning_rate": 0.00015527522500720934, "loss": 2.7012, "step": 640 }, { "epoch": 1.0188087774294672, "grad_norm": 0.3196061849594116, "learning_rate": 0.00015385679615609042, "loss": 2.7127, "step": 650 }, { "epoch": 1.0344827586206897, "grad_norm": 0.4679659605026245, "learning_rate": 0.00015242293702000086, "loss": 2.7103, "step": 660 }, { "epoch": 1.0501567398119123, "grad_norm": 0.6506277322769165, "learning_rate": 0.00015097405840791276, "loss": 2.6762, "step": 670 }, { "epoch": 1.0658307210031348, "grad_norm": 0.9151834845542908, "learning_rate": 0.00014951057543196566, "loss": 2.6503, "step": 680 }, { "epoch": 1.0815047021943573, "grad_norm": 0.2834145724773407, "learning_rate": 0.00014803290738853395, "loss": 2.5103, "step": 690 }, { "epoch": 1.09717868338558, "grad_norm": 0.3199128210544586, "learning_rate": 0.00014654147763809637, "loss": 2.7147, "step": 700 }, { "epoch": 1.1128526645768024, "grad_norm": 0.5001178979873657, "learning_rate": 0.00014503671348394057, "loss": 2.7124, "step": 710 }, { "epoch": 1.1285266457680252, "grad_norm": 0.6373796463012695, "learning_rate": 0.0001435190460497384, "loss": 2.7012, "step": 720 }, { "epoch": 1.1442006269592477, "grad_norm": 0.9262835383415222, "learning_rate": 0.00014198891015602646, "loss": 2.6493, "step": 730 }, { "epoch": 1.1598746081504703, "grad_norm": 0.2955109775066376, "learning_rate": 0.00014044674419562734, "loss": 2.5023, "step": 740 }, { "epoch": 1.1755485893416928, "grad_norm": 0.34623944759368896, "learning_rate": 0.0001388929900080476, "loss": 2.6849, "step": 750 }, { "epoch": 1.1912225705329154, "grad_norm": 0.43281346559524536, "learning_rate": 0.00013732809275288828, "loss": 2.6655, "step": 760 }, { "epoch": 1.206896551724138, "grad_norm": 0.6403496265411377, "learning_rate": 0.000135752500782304, "loss": 2.6557, "step": 770 }, { "epoch": 1.2225705329153604, "grad_norm": 0.8644494414329529, "learning_rate": 0.00013416666551254748, "loss": 2.6162, "step": 780 }, { "epoch": 1.238244514106583, "grad_norm": 0.2673655152320862, "learning_rate": 0.00013257104129463614, "loss": 2.4991, "step": 790 }, { "epoch": 1.2539184952978055, "grad_norm": 0.3414609432220459, "learning_rate": 0.00013096608528417788, "loss": 2.6638, "step": 800 }, { "epoch": 1.2695924764890283, "grad_norm": 0.47588247060775757, "learning_rate": 0.00012935225731039348, "loss": 2.6465, "step": 810 }, { "epoch": 1.2852664576802508, "grad_norm": 0.6843693852424622, "learning_rate": 0.00012773001974437267, "loss": 2.6697, "step": 820 }, { "epoch": 1.3009404388714734, "grad_norm": 0.9213519096374512, "learning_rate": 0.0001260998373666022, "loss": 2.6691, "step": 830 }, { "epoch": 1.316614420062696, "grad_norm": 0.26475438475608826, "learning_rate": 0.0001244621772338036, "loss": 2.5109, "step": 840 }, { "epoch": 1.3322884012539185, "grad_norm": 0.3754674196243286, "learning_rate": 0.0001228175085451186, "loss": 2.6699, "step": 850 }, { "epoch": 1.347962382445141, "grad_norm": 0.49352237582206726, "learning_rate": 0.00012116630250768097, "loss": 2.6889, "step": 860 }, { "epoch": 1.3636363636363638, "grad_norm": 0.7134298086166382, "learning_rate": 0.00011950903220161285, "loss": 2.6502, "step": 870 }, { "epoch": 1.3793103448275863, "grad_norm": 0.9475263953208923, "learning_rate": 0.00011784617244448451, "loss": 2.6062, "step": 880 }, { "epoch": 1.3949843260188088, "grad_norm": 0.2767506539821625, "learning_rate": 0.0001161781996552765, "loss": 2.4584, "step": 890 }, { "epoch": 1.4106583072100314, "grad_norm": 0.38563597202301025, "learning_rate": 0.00011450559171788269, "loss": 2.6751, "step": 900 }, { "epoch": 1.426332288401254, "grad_norm": 0.466782808303833, "learning_rate": 0.00011282882784419398, "loss": 2.6181, "step": 910 }, { "epoch": 1.4420062695924765, "grad_norm": 0.7600008845329285, "learning_rate": 0.00011114838843680095, "loss": 2.6729, "step": 920 }, { "epoch": 1.457680250783699, "grad_norm": 0.956167459487915, "learning_rate": 0.0001094647549513561, "loss": 2.5486, "step": 930 }, { "epoch": 1.4733542319749215, "grad_norm": 0.257744699716568, "learning_rate": 0.00010777840975863383, "loss": 2.4545, "step": 940 }, { "epoch": 1.489028213166144, "grad_norm": 0.347033828496933, "learning_rate": 0.00010608983600632831, "loss": 2.6313, "step": 950 }, { "epoch": 1.5047021943573666, "grad_norm": 0.5105902552604675, "learning_rate": 0.00010439951748062912, "loss": 2.623, "step": 960 }, { "epoch": 1.5203761755485894, "grad_norm": 0.7116357684135437, "learning_rate": 0.00010270793846761347, "loss": 2.6493, "step": 970 }, { "epoch": 1.536050156739812, "grad_norm": 0.9367708563804626, "learning_rate": 0.00010101558361449552, "loss": 2.6081, "step": 980 }, { "epoch": 1.5517241379310345, "grad_norm": 0.25939515233039856, "learning_rate": 9.932293779077216e-05, "loss": 2.4289, "step": 990 }, { "epoch": 1.567398119122257, "grad_norm": 0.3675819933414459, "learning_rate": 9.763048594930502e-05, "loss": 2.6617, "step": 1000 } ], "logging_steps": 10, "max_steps": 1914, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8760263209474458e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }