{ "best_metric": 1.805577039718628, "best_model_checkpoint": "./results/checkpoint-800", "epoch": 1.5230842455973346, "eval_steps": 200, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01903855306996668, "grad_norm": 0.17994017899036407, "learning_rate": 5e-05, "loss": 2.1247, "step": 10 }, { "epoch": 0.03807710613993336, "grad_norm": 0.27629706263542175, "learning_rate": 0.0001, "loss": 2.0758, "step": 20 }, { "epoch": 0.05711565920990005, "grad_norm": 0.4726850092411041, "learning_rate": 0.00015, "loss": 2.0858, "step": 30 }, { "epoch": 0.07615421227986673, "grad_norm": 0.5583528876304626, "learning_rate": 0.0002, "loss": 2.0593, "step": 40 }, { "epoch": 0.09519276534983341, "grad_norm": 0.5730186104774475, "learning_rate": 0.00025, "loss": 2.0161, "step": 50 }, { "epoch": 0.1142313184198001, "grad_norm": 0.48230308294296265, "learning_rate": 0.0003, "loss": 1.9764, "step": 60 }, { "epoch": 0.13326987148976677, "grad_norm": 0.44312751293182373, "learning_rate": 0.00035, "loss": 1.9557, "step": 70 }, { "epoch": 0.15230842455973345, "grad_norm": 0.4186476171016693, "learning_rate": 0.0004, "loss": 1.9422, "step": 80 }, { "epoch": 0.17134697762970014, "grad_norm": 0.38540077209472656, "learning_rate": 0.00045000000000000004, "loss": 1.9189, "step": 90 }, { "epoch": 0.19038553069966682, "grad_norm": 0.35501590371131897, "learning_rate": 0.0005, "loss": 1.9254, "step": 100 }, { "epoch": 0.2094240837696335, "grad_norm": 0.40440383553504944, "learning_rate": 0.000498019801980198, "loss": 1.9032, "step": 110 }, { "epoch": 0.2284626368396002, "grad_norm": 0.39570745825767517, "learning_rate": 0.000496039603960396, "loss": 1.9029, "step": 120 }, { "epoch": 0.24750118990956688, "grad_norm": 0.4123484790325165, "learning_rate": 0.0004940594059405941, "loss": 1.8735, "step": 130 }, { "epoch": 0.26653974297953353, "grad_norm": 0.37050503492355347, "learning_rate": 0.0004920792079207921, "loss": 1.8739, "step": 140 }, { "epoch": 0.28557829604950025, "grad_norm": 0.4047178030014038, "learning_rate": 0.0004900990099009901, "loss": 1.8659, "step": 150 }, { "epoch": 0.3046168491194669, "grad_norm": 0.3643397092819214, "learning_rate": 0.0004881188118811881, "loss": 1.8689, "step": 160 }, { "epoch": 0.3236554021894336, "grad_norm": 0.37609240412712097, "learning_rate": 0.00048613861386138615, "loss": 1.8599, "step": 170 }, { "epoch": 0.3426939552594003, "grad_norm": 0.3859333395957947, "learning_rate": 0.00048415841584158414, "loss": 1.8441, "step": 180 }, { "epoch": 0.361732508329367, "grad_norm": 0.3943149447441101, "learning_rate": 0.0004821782178217822, "loss": 1.8366, "step": 190 }, { "epoch": 0.38077106139933364, "grad_norm": 0.41318005323410034, "learning_rate": 0.0004801980198019802, "loss": 1.8381, "step": 200 }, { "epoch": 0.38077106139933364, "eval_loss": 1.8646808862686157, "eval_runtime": 4.2162, "eval_samples_per_second": 23.718, "eval_steps_per_second": 1.66, "step": 200 }, { "epoch": 0.39980961446930036, "grad_norm": 0.3635823726654053, "learning_rate": 0.0004782178217821782, "loss": 1.8292, "step": 210 }, { "epoch": 0.418848167539267, "grad_norm": 0.3529907166957855, "learning_rate": 0.00047623762376237624, "loss": 1.8444, "step": 220 }, { "epoch": 0.43788672060923367, "grad_norm": 0.3581302762031555, "learning_rate": 0.00047425742574257423, "loss": 1.8352, "step": 230 }, { "epoch": 0.4569252736792004, "grad_norm": 0.3584224581718445, "learning_rate": 0.0004722772277227723, "loss": 1.8319, 
"step": 240 }, { "epoch": 0.47596382674916704, "grad_norm": 0.3439520299434662, "learning_rate": 0.0004702970297029703, "loss": 1.8296, "step": 250 }, { "epoch": 0.49500237981913375, "grad_norm": 0.3635288178920746, "learning_rate": 0.00046831683168316833, "loss": 1.8294, "step": 260 }, { "epoch": 0.5140409328891005, "grad_norm": 0.3621940612792969, "learning_rate": 0.0004663366336633664, "loss": 1.8245, "step": 270 }, { "epoch": 0.5330794859590671, "grad_norm": 0.3562050759792328, "learning_rate": 0.0004643564356435644, "loss": 1.8051, "step": 280 }, { "epoch": 0.5521180390290338, "grad_norm": 0.3374086618423462, "learning_rate": 0.00046237623762376243, "loss": 1.8205, "step": 290 }, { "epoch": 0.5711565920990005, "grad_norm": 0.33458590507507324, "learning_rate": 0.0004603960396039604, "loss": 1.8238, "step": 300 }, { "epoch": 0.5901951451689672, "grad_norm": 0.3511849045753479, "learning_rate": 0.0004584158415841584, "loss": 1.8074, "step": 310 }, { "epoch": 0.6092336982389338, "grad_norm": 0.3680996000766754, "learning_rate": 0.00045643564356435647, "loss": 1.8349, "step": 320 }, { "epoch": 0.6282722513089005, "grad_norm": 0.33489343523979187, "learning_rate": 0.00045445544554455447, "loss": 1.8304, "step": 330 }, { "epoch": 0.6473108043788672, "grad_norm": 0.3262704908847809, "learning_rate": 0.0004524752475247525, "loss": 1.8179, "step": 340 }, { "epoch": 0.6663493574488338, "grad_norm": 0.33311426639556885, "learning_rate": 0.0004504950495049505, "loss": 1.8075, "step": 350 }, { "epoch": 0.6853879105188005, "grad_norm": 0.3391004800796509, "learning_rate": 0.0004485148514851485, "loss": 1.8124, "step": 360 }, { "epoch": 0.7044264635887673, "grad_norm": 0.34050452709198, "learning_rate": 0.00044653465346534656, "loss": 1.8184, "step": 370 }, { "epoch": 0.723465016658734, "grad_norm": 0.320922315120697, "learning_rate": 0.00044455445544554456, "loss": 1.8129, "step": 380 }, { "epoch": 0.7425035697287006, "grad_norm": 0.3578341007232666, "learning_rate": 0.0004425742574257426, "loss": 1.7989, "step": 390 }, { "epoch": 0.7615421227986673, "grad_norm": 0.31143978238105774, "learning_rate": 0.0004405940594059406, "loss": 1.8054, "step": 400 }, { "epoch": 0.7615421227986673, "eval_loss": 1.829106330871582, "eval_runtime": 4.2436, "eval_samples_per_second": 23.565, "eval_steps_per_second": 1.65, "step": 400 }, { "epoch": 0.780580675868634, "grad_norm": 0.3297821581363678, "learning_rate": 0.0004386138613861386, "loss": 1.8165, "step": 410 }, { "epoch": 0.7996192289386007, "grad_norm": 0.33798128366470337, "learning_rate": 0.00043663366336633665, "loss": 1.8001, "step": 420 }, { "epoch": 0.8186577820085673, "grad_norm": 0.3441774547100067, "learning_rate": 0.00043465346534653465, "loss": 1.8057, "step": 430 }, { "epoch": 0.837696335078534, "grad_norm": 0.30104541778564453, "learning_rate": 0.0004326732673267327, "loss": 1.8122, "step": 440 }, { "epoch": 0.8567348881485007, "grad_norm": 0.31903618574142456, "learning_rate": 0.0004306930693069307, "loss": 1.8099, "step": 450 }, { "epoch": 0.8757734412184673, "grad_norm": 0.31247204542160034, "learning_rate": 0.0004287128712871287, "loss": 1.8132, "step": 460 }, { "epoch": 0.894811994288434, "grad_norm": 0.3191291391849518, "learning_rate": 0.00042673267326732674, "loss": 1.8143, "step": 470 }, { "epoch": 0.9138505473584008, "grad_norm": 0.3244192600250244, "learning_rate": 0.00042475247524752474, "loss": 1.7999, "step": 480 }, { "epoch": 0.9328891004283675, "grad_norm": 0.37674182653427124, "learning_rate": 0.0004227722772277228, "loss": 
1.8097, "step": 490 }, { "epoch": 0.9519276534983341, "grad_norm": 0.31393611431121826, "learning_rate": 0.0004207920792079208, "loss": 1.802, "step": 500 }, { "epoch": 0.9709662065683008, "grad_norm": 0.3186231255531311, "learning_rate": 0.0004188118811881188, "loss": 1.8043, "step": 510 }, { "epoch": 0.9900047596382675, "grad_norm": 0.2924995422363281, "learning_rate": 0.00041683168316831683, "loss": 1.792, "step": 520 }, { "epoch": 1.009043312708234, "grad_norm": 0.3129435181617737, "learning_rate": 0.00041485148514851483, "loss": 1.8009, "step": 530 }, { "epoch": 1.028081865778201, "grad_norm": 0.2927923798561096, "learning_rate": 0.0004128712871287129, "loss": 1.8011, "step": 540 }, { "epoch": 1.0471204188481675, "grad_norm": 0.2918388545513153, "learning_rate": 0.0004108910891089109, "loss": 1.7946, "step": 550 }, { "epoch": 1.0661589719181341, "grad_norm": 0.2885777950286865, "learning_rate": 0.0004089108910891089, "loss": 1.8075, "step": 560 }, { "epoch": 1.085197524988101, "grad_norm": 0.30024921894073486, "learning_rate": 0.0004069306930693069, "loss": 1.7824, "step": 570 }, { "epoch": 1.1042360780580676, "grad_norm": 0.2903335988521576, "learning_rate": 0.000404950495049505, "loss": 1.7954, "step": 580 }, { "epoch": 1.1232746311280342, "grad_norm": 0.3008085787296295, "learning_rate": 0.000402970297029703, "loss": 1.7969, "step": 590 }, { "epoch": 1.142313184198001, "grad_norm": 0.29621192812919617, "learning_rate": 0.000400990099009901, "loss": 1.7803, "step": 600 }, { "epoch": 1.142313184198001, "eval_loss": 1.8143733739852905, "eval_runtime": 4.1557, "eval_samples_per_second": 24.063, "eval_steps_per_second": 1.684, "step": 600 }, { "epoch": 1.1613517372679676, "grad_norm": 0.30486541986465454, "learning_rate": 0.000399009900990099, "loss": 1.8, "step": 610 }, { "epoch": 1.1803902903379344, "grad_norm": 0.2792316675186157, "learning_rate": 0.00039702970297029707, "loss": 1.7822, "step": 620 }, { "epoch": 1.199428843407901, "grad_norm": 0.2918599545955658, "learning_rate": 0.00039504950495049506, "loss": 1.7808, "step": 630 }, { "epoch": 1.2184673964778676, "grad_norm": 0.2980496883392334, "learning_rate": 0.0003930693069306931, "loss": 1.7952, "step": 640 }, { "epoch": 1.2375059495478344, "grad_norm": 0.31613168120384216, "learning_rate": 0.0003910891089108911, "loss": 1.7996, "step": 650 }, { "epoch": 1.256544502617801, "grad_norm": 0.30946284532546997, "learning_rate": 0.0003891089108910891, "loss": 1.791, "step": 660 }, { "epoch": 1.2755830556877679, "grad_norm": 0.28848570585250854, "learning_rate": 0.00038712871287128716, "loss": 1.782, "step": 670 }, { "epoch": 1.2946216087577345, "grad_norm": 0.2725277543067932, "learning_rate": 0.00038514851485148515, "loss": 1.7847, "step": 680 }, { "epoch": 1.313660161827701, "grad_norm": 0.2864035665988922, "learning_rate": 0.0003831683168316832, "loss": 1.7938, "step": 690 }, { "epoch": 1.332698714897668, "grad_norm": 0.30256739258766174, "learning_rate": 0.0003811881188118812, "loss": 1.7947, "step": 700 }, { "epoch": 1.3517372679676345, "grad_norm": 0.2603744864463806, "learning_rate": 0.0003792079207920792, "loss": 1.8028, "step": 710 }, { "epoch": 1.370775821037601, "grad_norm": 0.3716331124305725, "learning_rate": 0.00037722772277227725, "loss": 1.7722, "step": 720 }, { "epoch": 1.389814374107568, "grad_norm": 0.35902512073516846, "learning_rate": 0.00037524752475247524, "loss": 1.7916, "step": 730 }, { "epoch": 1.4088529271775345, "grad_norm": 0.28538694977760315, "learning_rate": 0.0003732673267326733, "loss": 1.7812, 
"step": 740 }, { "epoch": 1.4278914802475011, "grad_norm": 0.29331693053245544, "learning_rate": 0.0003712871287128713, "loss": 1.7983, "step": 750 }, { "epoch": 1.446930033317468, "grad_norm": 0.31655997037887573, "learning_rate": 0.0003693069306930693, "loss": 1.7983, "step": 760 }, { "epoch": 1.4659685863874345, "grad_norm": 0.29052191972732544, "learning_rate": 0.00036732673267326734, "loss": 1.8021, "step": 770 }, { "epoch": 1.4850071394574011, "grad_norm": 0.2977640628814697, "learning_rate": 0.00036534653465346533, "loss": 1.7702, "step": 780 }, { "epoch": 1.504045692527368, "grad_norm": 0.27408239245414734, "learning_rate": 0.0003633663366336634, "loss": 1.7836, "step": 790 }, { "epoch": 1.5230842455973346, "grad_norm": 0.29241588711738586, "learning_rate": 0.0003613861386138614, "loss": 1.8005, "step": 800 }, { "epoch": 1.5230842455973346, "eval_loss": 1.805577039718628, "eval_runtime": 4.2437, "eval_samples_per_second": 23.564, "eval_steps_per_second": 1.649, "step": 800 } ], "logging_steps": 10, "max_steps": 2625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3184333317668864e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }