{ "best_metric": 1.7984042167663574, "best_model_checkpoint": "./results/checkpoint-1200", "epoch": 2.284626368396002, "eval_steps": 200, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01903855306996668, "grad_norm": 0.17994017899036407, "learning_rate": 5e-05, "loss": 2.1247, "step": 10 }, { "epoch": 0.03807710613993336, "grad_norm": 0.27629706263542175, "learning_rate": 0.0001, "loss": 2.0758, "step": 20 }, { "epoch": 0.05711565920990005, "grad_norm": 0.4726850092411041, "learning_rate": 0.00015, "loss": 2.0858, "step": 30 }, { "epoch": 0.07615421227986673, "grad_norm": 0.5583528876304626, "learning_rate": 0.0002, "loss": 2.0593, "step": 40 }, { "epoch": 0.09519276534983341, "grad_norm": 0.5730186104774475, "learning_rate": 0.00025, "loss": 2.0161, "step": 50 }, { "epoch": 0.1142313184198001, "grad_norm": 0.48230308294296265, "learning_rate": 0.0003, "loss": 1.9764, "step": 60 }, { "epoch": 0.13326987148976677, "grad_norm": 0.44312751293182373, "learning_rate": 0.00035, "loss": 1.9557, "step": 70 }, { "epoch": 0.15230842455973345, "grad_norm": 0.4186476171016693, "learning_rate": 0.0004, "loss": 1.9422, "step": 80 }, { "epoch": 0.17134697762970014, "grad_norm": 0.38540077209472656, "learning_rate": 0.00045000000000000004, "loss": 1.9189, "step": 90 }, { "epoch": 0.19038553069966682, "grad_norm": 0.35501590371131897, "learning_rate": 0.0005, "loss": 1.9254, "step": 100 }, { "epoch": 0.2094240837696335, "grad_norm": 0.40440383553504944, "learning_rate": 0.000498019801980198, "loss": 1.9032, "step": 110 }, { "epoch": 0.2284626368396002, "grad_norm": 0.39570745825767517, "learning_rate": 0.000496039603960396, "loss": 1.9029, "step": 120 }, { "epoch": 0.24750118990956688, "grad_norm": 0.4123484790325165, "learning_rate": 0.0004940594059405941, "loss": 1.8735, "step": 130 }, { "epoch": 0.26653974297953353, "grad_norm": 0.37050503492355347, "learning_rate": 0.0004920792079207921, "loss": 1.8739, "step": 140 }, { "epoch": 0.28557829604950025, "grad_norm": 0.4047178030014038, "learning_rate": 0.0004900990099009901, "loss": 1.8659, "step": 150 }, { "epoch": 0.3046168491194669, "grad_norm": 0.3643397092819214, "learning_rate": 0.0004881188118811881, "loss": 1.8689, "step": 160 }, { "epoch": 0.3236554021894336, "grad_norm": 0.37609240412712097, "learning_rate": 0.00048613861386138615, "loss": 1.8599, "step": 170 }, { "epoch": 0.3426939552594003, "grad_norm": 0.3859333395957947, "learning_rate": 0.00048415841584158414, "loss": 1.8441, "step": 180 }, { "epoch": 0.361732508329367, "grad_norm": 0.3943149447441101, "learning_rate": 0.0004821782178217822, "loss": 1.8366, "step": 190 }, { "epoch": 0.38077106139933364, "grad_norm": 0.41318005323410034, "learning_rate": 0.0004801980198019802, "loss": 1.8381, "step": 200 }, { "epoch": 0.38077106139933364, "eval_loss": 1.8646808862686157, "eval_runtime": 4.2162, "eval_samples_per_second": 23.718, "eval_steps_per_second": 1.66, "step": 200 }, { "epoch": 0.39980961446930036, "grad_norm": 0.3635823726654053, "learning_rate": 0.0004782178217821782, "loss": 1.8292, "step": 210 }, { "epoch": 0.418848167539267, "grad_norm": 0.3529907166957855, "learning_rate": 0.00047623762376237624, "loss": 1.8444, "step": 220 }, { "epoch": 0.43788672060923367, "grad_norm": 0.3581302762031555, "learning_rate": 0.00047425742574257423, "loss": 1.8352, "step": 230 }, { "epoch": 0.4569252736792004, "grad_norm": 0.3584224581718445, "learning_rate": 0.0004722772277227723, "loss": 1.8319, "step": 240 }, { "epoch": 0.47596382674916704, "grad_norm": 0.3439520299434662, "learning_rate": 0.0004702970297029703, "loss": 1.8296, "step": 250 }, { "epoch": 0.49500237981913375, "grad_norm": 0.3635288178920746, "learning_rate": 0.00046831683168316833, "loss": 1.8294, "step": 260 }, { "epoch": 0.5140409328891005, "grad_norm": 0.3621940612792969, "learning_rate": 0.0004663366336633664, "loss": 1.8245, "step": 270 }, { "epoch": 0.5330794859590671, "grad_norm": 0.3562050759792328, "learning_rate": 0.0004643564356435644, "loss": 1.8051, "step": 280 }, { "epoch": 0.5521180390290338, "grad_norm": 0.3374086618423462, "learning_rate": 0.00046237623762376243, "loss": 1.8205, "step": 290 }, { "epoch": 0.5711565920990005, "grad_norm": 0.33458590507507324, "learning_rate": 0.0004603960396039604, "loss": 1.8238, "step": 300 }, { "epoch": 0.5901951451689672, "grad_norm": 0.3511849045753479, "learning_rate": 0.0004584158415841584, "loss": 1.8074, "step": 310 }, { "epoch": 0.6092336982389338, "grad_norm": 0.3680996000766754, "learning_rate": 0.00045643564356435647, "loss": 1.8349, "step": 320 }, { "epoch": 0.6282722513089005, "grad_norm": 0.33489343523979187, "learning_rate": 0.00045445544554455447, "loss": 1.8304, "step": 330 }, { "epoch": 0.6473108043788672, "grad_norm": 0.3262704908847809, "learning_rate": 0.0004524752475247525, "loss": 1.8179, "step": 340 }, { "epoch": 0.6663493574488338, "grad_norm": 0.33311426639556885, "learning_rate": 0.0004504950495049505, "loss": 1.8075, "step": 350 }, { "epoch": 0.6853879105188005, "grad_norm": 0.3391004800796509, "learning_rate": 0.0004485148514851485, "loss": 1.8124, "step": 360 }, { "epoch": 0.7044264635887673, "grad_norm": 0.34050452709198, "learning_rate": 0.00044653465346534656, "loss": 1.8184, "step": 370 }, { "epoch": 0.723465016658734, "grad_norm": 0.320922315120697, "learning_rate": 0.00044455445544554456, "loss": 1.8129, "step": 380 }, { "epoch": 0.7425035697287006, "grad_norm": 0.3578341007232666, "learning_rate": 0.0004425742574257426, "loss": 1.7989, "step": 390 }, { "epoch": 0.7615421227986673, "grad_norm": 0.31143978238105774, "learning_rate": 0.0004405940594059406, "loss": 1.8054, "step": 400 }, { "epoch": 0.7615421227986673, "eval_loss": 1.829106330871582, "eval_runtime": 4.2436, "eval_samples_per_second": 23.565, "eval_steps_per_second": 1.65, "step": 400 }, { "epoch": 0.780580675868634, "grad_norm": 0.3297821581363678, "learning_rate": 0.0004386138613861386, "loss": 1.8165, "step": 410 }, { "epoch": 0.7996192289386007, "grad_norm": 0.33798128366470337, "learning_rate": 0.00043663366336633665, "loss": 1.8001, "step": 420 }, { "epoch": 0.8186577820085673, "grad_norm": 0.3441774547100067, "learning_rate": 0.00043465346534653465, "loss": 1.8057, "step": 430 }, { "epoch": 0.837696335078534, "grad_norm": 0.30104541778564453, "learning_rate": 0.0004326732673267327, "loss": 1.8122, "step": 440 }, { "epoch": 0.8567348881485007, "grad_norm": 0.31903618574142456, "learning_rate": 0.0004306930693069307, "loss": 1.8099, "step": 450 }, { "epoch": 0.8757734412184673, "grad_norm": 0.31247204542160034, "learning_rate": 0.0004287128712871287, "loss": 1.8132, "step": 460 }, { "epoch": 0.894811994288434, "grad_norm": 0.3191291391849518, "learning_rate": 0.00042673267326732674, "loss": 1.8143, "step": 470 }, { "epoch": 0.9138505473584008, "grad_norm": 0.3244192600250244, "learning_rate": 0.00042475247524752474, "loss": 1.7999, "step": 480 }, { "epoch": 0.9328891004283675, "grad_norm": 0.37674182653427124, "learning_rate": 0.0004227722772277228, "loss": 1.8097, "step": 490 }, { "epoch": 0.9519276534983341, "grad_norm": 0.31393611431121826, "learning_rate": 0.0004207920792079208, "loss": 1.802, "step": 500 }, { "epoch": 0.9709662065683008, "grad_norm": 0.3186231255531311, "learning_rate": 0.0004188118811881188, "loss": 1.8043, "step": 510 }, { "epoch": 0.9900047596382675, "grad_norm": 0.2924995422363281, "learning_rate": 0.00041683168316831683, "loss": 1.792, "step": 520 }, { "epoch": 1.009043312708234, "grad_norm": 0.3129435181617737, "learning_rate": 0.00041485148514851483, "loss": 1.8009, "step": 530 }, { "epoch": 1.028081865778201, "grad_norm": 0.2927923798561096, "learning_rate": 0.0004128712871287129, "loss": 1.8011, "step": 540 }, { "epoch": 1.0471204188481675, "grad_norm": 0.2918388545513153, "learning_rate": 0.0004108910891089109, "loss": 1.7946, "step": 550 }, { "epoch": 1.0661589719181341, "grad_norm": 0.2885777950286865, "learning_rate": 0.0004089108910891089, "loss": 1.8075, "step": 560 }, { "epoch": 1.085197524988101, "grad_norm": 0.30024921894073486, "learning_rate": 0.0004069306930693069, "loss": 1.7824, "step": 570 }, { "epoch": 1.1042360780580676, "grad_norm": 0.2903335988521576, "learning_rate": 0.000404950495049505, "loss": 1.7954, "step": 580 }, { "epoch": 1.1232746311280342, "grad_norm": 0.3008085787296295, "learning_rate": 0.000402970297029703, "loss": 1.7969, "step": 590 }, { "epoch": 1.142313184198001, "grad_norm": 0.29621192812919617, "learning_rate": 0.000400990099009901, "loss": 1.7803, "step": 600 }, { "epoch": 1.142313184198001, "eval_loss": 1.8143733739852905, "eval_runtime": 4.1557, "eval_samples_per_second": 24.063, "eval_steps_per_second": 1.684, "step": 600 }, { "epoch": 1.1613517372679676, "grad_norm": 0.30486541986465454, "learning_rate": 0.000399009900990099, "loss": 1.8, "step": 610 }, { "epoch": 1.1803902903379344, "grad_norm": 0.2792316675186157, "learning_rate": 0.00039702970297029707, "loss": 1.7822, "step": 620 }, { "epoch": 1.199428843407901, "grad_norm": 0.2918599545955658, "learning_rate": 0.00039504950495049506, "loss": 1.7808, "step": 630 }, { "epoch": 1.2184673964778676, "grad_norm": 0.2980496883392334, "learning_rate": 0.0003930693069306931, "loss": 1.7952, "step": 640 }, { "epoch": 1.2375059495478344, "grad_norm": 0.31613168120384216, "learning_rate": 0.0003910891089108911, "loss": 1.7996, "step": 650 }, { "epoch": 1.256544502617801, "grad_norm": 0.30946284532546997, "learning_rate": 0.0003891089108910891, "loss": 1.791, "step": 660 }, { "epoch": 1.2755830556877679, "grad_norm": 0.28848570585250854, "learning_rate": 0.00038712871287128716, "loss": 1.782, "step": 670 }, { "epoch": 1.2946216087577345, "grad_norm": 0.2725277543067932, "learning_rate": 0.00038514851485148515, "loss": 1.7847, "step": 680 }, { "epoch": 1.313660161827701, "grad_norm": 0.2864035665988922, "learning_rate": 0.0003831683168316832, "loss": 1.7938, "step": 690 }, { "epoch": 1.332698714897668, "grad_norm": 0.30256739258766174, "learning_rate": 0.0003811881188118812, "loss": 1.7947, "step": 700 }, { "epoch": 1.3517372679676345, "grad_norm": 0.2603744864463806, "learning_rate": 0.0003792079207920792, "loss": 1.8028, "step": 710 }, { "epoch": 1.370775821037601, "grad_norm": 0.3716331124305725, "learning_rate": 0.00037722772277227725, "loss": 1.7722, "step": 720 }, { "epoch": 1.389814374107568, "grad_norm": 0.35902512073516846, "learning_rate": 0.00037524752475247524, "loss": 1.7916, "step": 730 }, { "epoch": 1.4088529271775345, "grad_norm": 0.28538694977760315, "learning_rate": 0.0003732673267326733, "loss": 1.7812, "step": 740 }, { "epoch": 1.4278914802475011, "grad_norm": 0.29331693053245544, "learning_rate": 0.0003712871287128713, "loss": 1.7983, "step": 750 }, { "epoch": 1.446930033317468, "grad_norm": 0.31655997037887573, "learning_rate": 0.0003693069306930693, "loss": 1.7983, "step": 760 }, { "epoch": 1.4659685863874345, "grad_norm": 0.29052191972732544, "learning_rate": 0.00036732673267326734, "loss": 1.8021, "step": 770 }, { "epoch": 1.4850071394574011, "grad_norm": 0.2977640628814697, "learning_rate": 0.00036534653465346533, "loss": 1.7702, "step": 780 }, { "epoch": 1.504045692527368, "grad_norm": 0.27408239245414734, "learning_rate": 0.0003633663366336634, "loss": 1.7836, "step": 790 }, { "epoch": 1.5230842455973346, "grad_norm": 0.29241588711738586, "learning_rate": 0.0003613861386138614, "loss": 1.8005, "step": 800 }, { "epoch": 1.5230842455973346, "eval_loss": 1.805577039718628, "eval_runtime": 4.2437, "eval_samples_per_second": 23.564, "eval_steps_per_second": 1.649, "step": 800 }, { "epoch": 1.5421227986673012, "grad_norm": 0.2775736451148987, "learning_rate": 0.0003594059405940594, "loss": 1.7725, "step": 810 }, { "epoch": 1.561161351737268, "grad_norm": 0.2777954339981079, "learning_rate": 0.00035742574257425743, "loss": 1.7921, "step": 820 }, { "epoch": 1.5801999048072346, "grad_norm": 0.27932244539260864, "learning_rate": 0.0003554455445544554, "loss": 1.7853, "step": 830 }, { "epoch": 1.5992384578772012, "grad_norm": 0.28905799984931946, "learning_rate": 0.0003534653465346535, "loss": 1.785, "step": 840 }, { "epoch": 1.618277010947168, "grad_norm": 0.2713293433189392, "learning_rate": 0.00035148514851485147, "loss": 1.7959, "step": 850 }, { "epoch": 1.6373155640171349, "grad_norm": 0.27542880177497864, "learning_rate": 0.00034950495049504947, "loss": 1.791, "step": 860 }, { "epoch": 1.6563541170871012, "grad_norm": 0.3243546783924103, "learning_rate": 0.0003475247524752475, "loss": 1.7831, "step": 870 }, { "epoch": 1.675392670157068, "grad_norm": 0.2858756184577942, "learning_rate": 0.0003455445544554455, "loss": 1.7829, "step": 880 }, { "epoch": 1.6944312232270349, "grad_norm": 0.28570687770843506, "learning_rate": 0.0003435643564356436, "loss": 1.7793, "step": 890 }, { "epoch": 1.7134697762970015, "grad_norm": 0.2588244080543518, "learning_rate": 0.0003415841584158416, "loss": 1.796, "step": 900 }, { "epoch": 1.732508329366968, "grad_norm": 0.2729063928127289, "learning_rate": 0.0003396039603960396, "loss": 1.7789, "step": 910 }, { "epoch": 1.751546882436935, "grad_norm": 0.2799668312072754, "learning_rate": 0.00033762376237623766, "loss": 1.7859, "step": 920 }, { "epoch": 1.7705854355069015, "grad_norm": 0.2754090428352356, "learning_rate": 0.00033564356435643566, "loss": 1.7879, "step": 930 }, { "epoch": 1.789623988576868, "grad_norm": 0.26798099279403687, "learning_rate": 0.0003336633663366337, "loss": 1.7744, "step": 940 }, { "epoch": 1.808662541646835, "grad_norm": 0.2651982605457306, "learning_rate": 0.0003316831683168317, "loss": 1.7813, "step": 950 }, { "epoch": 1.8277010947168015, "grad_norm": 0.25073009729385376, "learning_rate": 0.0003297029702970297, "loss": 1.7875, "step": 960 }, { "epoch": 1.8467396477867681, "grad_norm": 0.2663566470146179, "learning_rate": 0.00032772277227722775, "loss": 1.7795, "step": 970 }, { "epoch": 1.865778200856735, "grad_norm": 0.25802338123321533, "learning_rate": 0.00032574257425742575, "loss": 1.7772, "step": 980 }, { "epoch": 1.8848167539267016, "grad_norm": 0.2851213216781616, "learning_rate": 0.0003237623762376238, "loss": 1.7836, "step": 990 }, { "epoch": 1.9038553069966682, "grad_norm": 0.27455398440361023, "learning_rate": 0.0003217821782178218, "loss": 1.771, "step": 1000 }, { "epoch": 1.9038553069966682, "eval_loss": 1.8010673522949219, "eval_runtime": 4.1928, "eval_samples_per_second": 23.85, "eval_steps_per_second": 1.67, "step": 1000 }, { "epoch": 1.922893860066635, "grad_norm": 0.27414214611053467, "learning_rate": 0.0003198019801980198, "loss": 1.7763, "step": 1010 }, { "epoch": 1.9419324131366016, "grad_norm": 0.28562483191490173, "learning_rate": 0.00031782178217821784, "loss": 1.8059, "step": 1020 }, { "epoch": 1.9609709662065682, "grad_norm": 0.27301162481307983, "learning_rate": 0.00031584158415841584, "loss": 1.7853, "step": 1030 }, { "epoch": 1.980009519276535, "grad_norm": 0.2673158645629883, "learning_rate": 0.0003138613861386139, "loss": 1.7867, "step": 1040 }, { "epoch": 1.9990480723465016, "grad_norm": 0.2679426074028015, "learning_rate": 0.0003118811881188119, "loss": 1.7871, "step": 1050 }, { "epoch": 2.018086625416468, "grad_norm": 0.28638601303100586, "learning_rate": 0.0003099009900990099, "loss": 1.7884, "step": 1060 }, { "epoch": 2.037125178486435, "grad_norm": 0.26236289739608765, "learning_rate": 0.00030792079207920793, "loss": 1.767, "step": 1070 }, { "epoch": 2.056163731556402, "grad_norm": 0.2774026095867157, "learning_rate": 0.00030594059405940593, "loss": 1.7735, "step": 1080 }, { "epoch": 2.0752022846263682, "grad_norm": 0.28758397698402405, "learning_rate": 0.000303960396039604, "loss": 1.7833, "step": 1090 }, { "epoch": 2.094240837696335, "grad_norm": 0.25563687086105347, "learning_rate": 0.000301980198019802, "loss": 1.7741, "step": 1100 }, { "epoch": 2.113279390766302, "grad_norm": 0.29064470529556274, "learning_rate": 0.0003, "loss": 1.7759, "step": 1110 }, { "epoch": 2.1323179438362683, "grad_norm": 0.26785504817962646, "learning_rate": 0.000298019801980198, "loss": 1.7971, "step": 1120 }, { "epoch": 2.151356496906235, "grad_norm": 0.26074618101119995, "learning_rate": 0.000296039603960396, "loss": 1.7843, "step": 1130 }, { "epoch": 2.170395049976202, "grad_norm": 0.2896900475025177, "learning_rate": 0.00029405940594059407, "loss": 1.7732, "step": 1140 }, { "epoch": 2.1894336030461683, "grad_norm": 0.2741701602935791, "learning_rate": 0.00029207920792079207, "loss": 1.7898, "step": 1150 }, { "epoch": 2.208472156116135, "grad_norm": 0.28687021136283875, "learning_rate": 0.00029009900990099006, "loss": 1.7825, "step": 1160 }, { "epoch": 2.227510709186102, "grad_norm": 0.27220088243484497, "learning_rate": 0.0002881188118811881, "loss": 1.7699, "step": 1170 }, { "epoch": 2.2465492622560683, "grad_norm": 0.2600407898426056, "learning_rate": 0.0002861386138613861, "loss": 1.7923, "step": 1180 }, { "epoch": 2.265587815326035, "grad_norm": 0.25748902559280396, "learning_rate": 0.00028415841584158416, "loss": 1.7768, "step": 1190 }, { "epoch": 2.284626368396002, "grad_norm": 0.2772551476955414, "learning_rate": 0.00028217821782178216, "loss": 1.7792, "step": 1200 }, { "epoch": 2.284626368396002, "eval_loss": 1.7984042167663574, "eval_runtime": 4.2152, "eval_samples_per_second": 23.723, "eval_steps_per_second": 1.661, "step": 1200 } ], "logging_steps": 10, "max_steps": 2625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9776499976503296e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }