{
  "best_metric": 1.7984042167663574,
  "best_model_checkpoint": "./results/checkpoint-1200",
  "epoch": 2.284626368396002,
  "eval_steps": 200,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01903855306996668,
      "grad_norm": 0.17994017899036407,
      "learning_rate": 5e-05,
      "loss": 2.1247,
      "step": 10
    },
    {
      "epoch": 0.03807710613993336,
      "grad_norm": 0.27629706263542175,
      "learning_rate": 0.0001,
      "loss": 2.0758,
      "step": 20
    },
    {
      "epoch": 0.05711565920990005,
      "grad_norm": 0.4726850092411041,
      "learning_rate": 0.00015,
      "loss": 2.0858,
      "step": 30
    },
    {
      "epoch": 0.07615421227986673,
      "grad_norm": 0.5583528876304626,
      "learning_rate": 0.0002,
      "loss": 2.0593,
      "step": 40
    },
    {
      "epoch": 0.09519276534983341,
      "grad_norm": 0.5730186104774475,
      "learning_rate": 0.00025,
      "loss": 2.0161,
      "step": 50
    },
    {
      "epoch": 0.1142313184198001,
      "grad_norm": 0.48230308294296265,
      "learning_rate": 0.0003,
      "loss": 1.9764,
      "step": 60
    },
    {
      "epoch": 0.13326987148976677,
      "grad_norm": 0.44312751293182373,
      "learning_rate": 0.00035,
      "loss": 1.9557,
      "step": 70
    },
    {
      "epoch": 0.15230842455973345,
      "grad_norm": 0.4186476171016693,
      "learning_rate": 0.0004,
      "loss": 1.9422,
      "step": 80
    },
    {
      "epoch": 0.17134697762970014,
      "grad_norm": 0.38540077209472656,
      "learning_rate": 0.00045000000000000004,
      "loss": 1.9189,
      "step": 90
    },
    {
      "epoch": 0.19038553069966682,
      "grad_norm": 0.35501590371131897,
      "learning_rate": 0.0005,
      "loss": 1.9254,
      "step": 100
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 0.40440383553504944,
      "learning_rate": 0.000498019801980198,
      "loss": 1.9032,
      "step": 110
    },
    {
      "epoch": 0.2284626368396002,
      "grad_norm": 0.39570745825767517,
      "learning_rate": 0.000496039603960396,
      "loss": 1.9029,
      "step": 120
    },
    {
      "epoch": 0.24750118990956688,
      "grad_norm": 0.4123484790325165,
      "learning_rate": 0.0004940594059405941,
      "loss": 1.8735,
      "step": 130
    },
    {
      "epoch": 0.26653974297953353,
      "grad_norm": 0.37050503492355347,
      "learning_rate": 0.0004920792079207921,
      "loss": 1.8739,
      "step": 140
    },
    {
      "epoch": 0.28557829604950025,
      "grad_norm": 0.4047178030014038,
      "learning_rate": 0.0004900990099009901,
      "loss": 1.8659,
      "step": 150
    },
    {
      "epoch": 0.3046168491194669,
      "grad_norm": 0.3643397092819214,
      "learning_rate": 0.0004881188118811881,
      "loss": 1.8689,
      "step": 160
    },
    {
      "epoch": 0.3236554021894336,
      "grad_norm": 0.37609240412712097,
      "learning_rate": 0.00048613861386138615,
      "loss": 1.8599,
      "step": 170
    },
    {
      "epoch": 0.3426939552594003,
      "grad_norm": 0.3859333395957947,
      "learning_rate": 0.00048415841584158414,
      "loss": 1.8441,
      "step": 180
    },
    {
      "epoch": 0.361732508329367,
      "grad_norm": 0.3943149447441101,
      "learning_rate": 0.0004821782178217822,
      "loss": 1.8366,
      "step": 190
    },
    {
      "epoch": 0.38077106139933364,
      "grad_norm": 0.41318005323410034,
      "learning_rate": 0.0004801980198019802,
      "loss": 1.8381,
      "step": 200
    },
    {
      "epoch": 0.38077106139933364,
      "eval_loss": 1.8646808862686157,
      "eval_runtime": 4.2162,
      "eval_samples_per_second": 23.718,
      "eval_steps_per_second": 1.66,
      "step": 200
    },
    {
      "epoch": 0.39980961446930036,
      "grad_norm": 0.3635823726654053,
      "learning_rate": 0.0004782178217821782,
      "loss": 1.8292,
      "step": 210
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 0.3529907166957855,
      "learning_rate": 0.00047623762376237624,
      "loss": 1.8444,
      "step": 220
    },
    {
      "epoch": 0.43788672060923367,
      "grad_norm": 0.3581302762031555,
      "learning_rate": 0.00047425742574257423,
      "loss": 1.8352,
      "step": 230
    },
    {
      "epoch": 0.4569252736792004,
      "grad_norm": 0.3584224581718445,
      "learning_rate": 0.0004722772277227723,
      "loss": 1.8319,
      "step": 240
    },
    {
      "epoch": 0.47596382674916704,
      "grad_norm": 0.3439520299434662,
      "learning_rate": 0.0004702970297029703,
      "loss": 1.8296,
      "step": 250
    },
    {
      "epoch": 0.49500237981913375,
      "grad_norm": 0.3635288178920746,
      "learning_rate": 0.00046831683168316833,
      "loss": 1.8294,
      "step": 260
    },
    {
      "epoch": 0.5140409328891005,
      "grad_norm": 0.3621940612792969,
      "learning_rate": 0.0004663366336633664,
      "loss": 1.8245,
      "step": 270
    },
    {
      "epoch": 0.5330794859590671,
      "grad_norm": 0.3562050759792328,
      "learning_rate": 0.0004643564356435644,
      "loss": 1.8051,
      "step": 280
    },
    {
      "epoch": 0.5521180390290338,
      "grad_norm": 0.3374086618423462,
      "learning_rate": 0.00046237623762376243,
      "loss": 1.8205,
      "step": 290
    },
    {
      "epoch": 0.5711565920990005,
      "grad_norm": 0.33458590507507324,
      "learning_rate": 0.0004603960396039604,
      "loss": 1.8238,
      "step": 300
    },
    {
      "epoch": 0.5901951451689672,
      "grad_norm": 0.3511849045753479,
      "learning_rate": 0.0004584158415841584,
      "loss": 1.8074,
      "step": 310
    },
    {
      "epoch": 0.6092336982389338,
      "grad_norm": 0.3680996000766754,
      "learning_rate": 0.00045643564356435647,
      "loss": 1.8349,
      "step": 320
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 0.33489343523979187,
      "learning_rate": 0.00045445544554455447,
      "loss": 1.8304,
      "step": 330
    },
    {
      "epoch": 0.6473108043788672,
      "grad_norm": 0.3262704908847809,
      "learning_rate": 0.0004524752475247525,
      "loss": 1.8179,
      "step": 340
    },
    {
      "epoch": 0.6663493574488338,
      "grad_norm": 0.33311426639556885,
      "learning_rate": 0.0004504950495049505,
      "loss": 1.8075,
      "step": 350
    },
    {
      "epoch": 0.6853879105188005,
      "grad_norm": 0.3391004800796509,
      "learning_rate": 0.0004485148514851485,
      "loss": 1.8124,
      "step": 360
    },
    {
      "epoch": 0.7044264635887673,
      "grad_norm": 0.34050452709198,
      "learning_rate": 0.00044653465346534656,
      "loss": 1.8184,
      "step": 370
    },
    {
      "epoch": 0.723465016658734,
      "grad_norm": 0.320922315120697,
      "learning_rate": 0.00044455445544554456,
      "loss": 1.8129,
      "step": 380
    },
    {
      "epoch": 0.7425035697287006,
      "grad_norm": 0.3578341007232666,
      "learning_rate": 0.0004425742574257426,
      "loss": 1.7989,
      "step": 390
    },
    {
      "epoch": 0.7615421227986673,
      "grad_norm": 0.31143978238105774,
      "learning_rate": 0.0004405940594059406,
      "loss": 1.8054,
      "step": 400
    },
    {
      "epoch": 0.7615421227986673,
      "eval_loss": 1.829106330871582,
      "eval_runtime": 4.2436,
      "eval_samples_per_second": 23.565,
      "eval_steps_per_second": 1.65,
      "step": 400
    },
    {
      "epoch": 0.780580675868634,
      "grad_norm": 0.3297821581363678,
      "learning_rate": 0.0004386138613861386,
      "loss": 1.8165,
      "step": 410
    },
    {
      "epoch": 0.7996192289386007,
      "grad_norm": 0.33798128366470337,
      "learning_rate": 0.00043663366336633665,
      "loss": 1.8001,
      "step": 420
    },
    {
      "epoch": 0.8186577820085673,
      "grad_norm": 0.3441774547100067,
      "learning_rate": 0.00043465346534653465,
      "loss": 1.8057,
      "step": 430
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.30104541778564453,
      "learning_rate": 0.0004326732673267327,
      "loss": 1.8122,
      "step": 440
    },
    {
      "epoch": 0.8567348881485007,
      "grad_norm": 0.31903618574142456,
      "learning_rate": 0.0004306930693069307,
      "loss": 1.8099,
      "step": 450
    },
    {
      "epoch": 0.8757734412184673,
      "grad_norm": 0.31247204542160034,
      "learning_rate": 0.0004287128712871287,
      "loss": 1.8132,
      "step": 460
    },
    {
      "epoch": 0.894811994288434,
      "grad_norm": 0.3191291391849518,
      "learning_rate": 0.00042673267326732674,
      "loss": 1.8143,
      "step": 470
    },
    {
      "epoch": 0.9138505473584008,
      "grad_norm": 0.3244192600250244,
      "learning_rate": 0.00042475247524752474,
      "loss": 1.7999,
      "step": 480
    },
    {
      "epoch": 0.9328891004283675,
      "grad_norm": 0.37674182653427124,
      "learning_rate": 0.0004227722772277228,
      "loss": 1.8097,
      "step": 490
    },
    {
      "epoch": 0.9519276534983341,
      "grad_norm": 0.31393611431121826,
      "learning_rate": 0.0004207920792079208,
      "loss": 1.802,
      "step": 500
    },
    {
      "epoch": 0.9709662065683008,
      "grad_norm": 0.3186231255531311,
      "learning_rate": 0.0004188118811881188,
      "loss": 1.8043,
      "step": 510
    },
    {
      "epoch": 0.9900047596382675,
      "grad_norm": 0.2924995422363281,
      "learning_rate": 0.00041683168316831683,
      "loss": 1.792,
      "step": 520
    },
    {
      "epoch": 1.009043312708234,
      "grad_norm": 0.3129435181617737,
      "learning_rate": 0.00041485148514851483,
      "loss": 1.8009,
      "step": 530
    },
    {
      "epoch": 1.028081865778201,
      "grad_norm": 0.2927923798561096,
      "learning_rate": 0.0004128712871287129,
      "loss": 1.8011,
      "step": 540
    },
    {
      "epoch": 1.0471204188481675,
      "grad_norm": 0.2918388545513153,
      "learning_rate": 0.0004108910891089109,
      "loss": 1.7946,
      "step": 550
    },
    {
      "epoch": 1.0661589719181341,
      "grad_norm": 0.2885777950286865,
      "learning_rate": 0.0004089108910891089,
      "loss": 1.8075,
      "step": 560
    },
    {
      "epoch": 1.085197524988101,
      "grad_norm": 0.30024921894073486,
      "learning_rate": 0.0004069306930693069,
      "loss": 1.7824,
      "step": 570
    },
    {
      "epoch": 1.1042360780580676,
      "grad_norm": 0.2903335988521576,
      "learning_rate": 0.000404950495049505,
      "loss": 1.7954,
      "step": 580
    },
    {
      "epoch": 1.1232746311280342,
      "grad_norm": 0.3008085787296295,
      "learning_rate": 0.000402970297029703,
      "loss": 1.7969,
      "step": 590
    },
    {
      "epoch": 1.142313184198001,
      "grad_norm": 0.29621192812919617,
      "learning_rate": 0.000400990099009901,
      "loss": 1.7803,
      "step": 600
    },
    {
      "epoch": 1.142313184198001,
      "eval_loss": 1.8143733739852905,
      "eval_runtime": 4.1557,
      "eval_samples_per_second": 24.063,
      "eval_steps_per_second": 1.684,
      "step": 600
    },
    {
      "epoch": 1.1613517372679676,
      "grad_norm": 0.30486541986465454,
      "learning_rate": 0.000399009900990099,
      "loss": 1.8,
      "step": 610
    },
    {
      "epoch": 1.1803902903379344,
      "grad_norm": 0.2792316675186157,
      "learning_rate": 0.00039702970297029707,
      "loss": 1.7822,
      "step": 620
    },
    {
      "epoch": 1.199428843407901,
      "grad_norm": 0.2918599545955658,
      "learning_rate": 0.00039504950495049506,
      "loss": 1.7808,
      "step": 630
    },
    {
      "epoch": 1.2184673964778676,
      "grad_norm": 0.2980496883392334,
      "learning_rate": 0.0003930693069306931,
      "loss": 1.7952,
      "step": 640
    },
    {
      "epoch": 1.2375059495478344,
      "grad_norm": 0.31613168120384216,
      "learning_rate": 0.0003910891089108911,
      "loss": 1.7996,
      "step": 650
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 0.30946284532546997,
      "learning_rate": 0.0003891089108910891,
      "loss": 1.791,
      "step": 660
    },
    {
      "epoch": 1.2755830556877679,
      "grad_norm": 0.28848570585250854,
      "learning_rate": 0.00038712871287128716,
      "loss": 1.782,
      "step": 670
    },
    {
      "epoch": 1.2946216087577345,
      "grad_norm": 0.2725277543067932,
      "learning_rate": 0.00038514851485148515,
      "loss": 1.7847,
      "step": 680
    },
    {
      "epoch": 1.313660161827701,
      "grad_norm": 0.2864035665988922,
      "learning_rate": 0.0003831683168316832,
      "loss": 1.7938,
      "step": 690
    },
    {
      "epoch": 1.332698714897668,
      "grad_norm": 0.30256739258766174,
      "learning_rate": 0.0003811881188118812,
      "loss": 1.7947,
      "step": 700
    },
    {
      "epoch": 1.3517372679676345,
      "grad_norm": 0.2603744864463806,
      "learning_rate": 0.0003792079207920792,
      "loss": 1.8028,
      "step": 710
    },
    {
      "epoch": 1.370775821037601,
      "grad_norm": 0.3716331124305725,
      "learning_rate": 0.00037722772277227725,
      "loss": 1.7722,
      "step": 720
    },
    {
      "epoch": 1.389814374107568,
      "grad_norm": 0.35902512073516846,
      "learning_rate": 0.00037524752475247524,
      "loss": 1.7916,
      "step": 730
    },
    {
      "epoch": 1.4088529271775345,
      "grad_norm": 0.28538694977760315,
      "learning_rate": 0.0003732673267326733,
      "loss": 1.7812,
      "step": 740
    },
    {
      "epoch": 1.4278914802475011,
      "grad_norm": 0.29331693053245544,
      "learning_rate": 0.0003712871287128713,
      "loss": 1.7983,
      "step": 750
    },
    {
      "epoch": 1.446930033317468,
      "grad_norm": 0.31655997037887573,
      "learning_rate": 0.0003693069306930693,
      "loss": 1.7983,
      "step": 760
    },
    {
      "epoch": 1.4659685863874345,
      "grad_norm": 0.29052191972732544,
      "learning_rate": 0.00036732673267326734,
      "loss": 1.8021,
      "step": 770
    },
    {
      "epoch": 1.4850071394574011,
      "grad_norm": 0.2977640628814697,
      "learning_rate": 0.00036534653465346533,
      "loss": 1.7702,
      "step": 780
    },
    {
      "epoch": 1.504045692527368,
      "grad_norm": 0.27408239245414734,
      "learning_rate": 0.0003633663366336634,
      "loss": 1.7836,
      "step": 790
    },
    {
      "epoch": 1.5230842455973346,
      "grad_norm": 0.29241588711738586,
      "learning_rate": 0.0003613861386138614,
      "loss": 1.8005,
      "step": 800
    },
    {
      "epoch": 1.5230842455973346,
      "eval_loss": 1.805577039718628,
      "eval_runtime": 4.2437,
      "eval_samples_per_second": 23.564,
      "eval_steps_per_second": 1.649,
      "step": 800
    },
    {
      "epoch": 1.5421227986673012,
      "grad_norm": 0.2775736451148987,
      "learning_rate": 0.0003594059405940594,
      "loss": 1.7725,
      "step": 810
    },
    {
      "epoch": 1.561161351737268,
      "grad_norm": 0.2777954339981079,
      "learning_rate": 0.00035742574257425743,
      "loss": 1.7921,
      "step": 820
    },
    {
      "epoch": 1.5801999048072346,
      "grad_norm": 0.27932244539260864,
      "learning_rate": 0.0003554455445544554,
      "loss": 1.7853,
      "step": 830
    },
    {
      "epoch": 1.5992384578772012,
      "grad_norm": 0.28905799984931946,
      "learning_rate": 0.0003534653465346535,
      "loss": 1.785,
      "step": 840
    },
    {
      "epoch": 1.618277010947168,
      "grad_norm": 0.2713293433189392,
      "learning_rate": 0.00035148514851485147,
      "loss": 1.7959,
      "step": 850
    },
    {
      "epoch": 1.6373155640171349,
      "grad_norm": 0.27542880177497864,
      "learning_rate": 0.00034950495049504947,
      "loss": 1.791,
      "step": 860
    },
    {
      "epoch": 1.6563541170871012,
      "grad_norm": 0.3243546783924103,
      "learning_rate": 0.0003475247524752475,
      "loss": 1.7831,
      "step": 870
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 0.2858756184577942,
      "learning_rate": 0.0003455445544554455,
      "loss": 1.7829,
      "step": 880
    },
    {
      "epoch": 1.6944312232270349,
      "grad_norm": 0.28570687770843506,
      "learning_rate": 0.0003435643564356436,
      "loss": 1.7793,
      "step": 890
    },
    {
      "epoch": 1.7134697762970015,
      "grad_norm": 0.2588244080543518,
      "learning_rate": 0.0003415841584158416,
      "loss": 1.796,
      "step": 900
    },
    {
      "epoch": 1.732508329366968,
      "grad_norm": 0.2729063928127289,
      "learning_rate": 0.0003396039603960396,
      "loss": 1.7789,
      "step": 910
    },
    {
      "epoch": 1.751546882436935,
      "grad_norm": 0.2799668312072754,
      "learning_rate": 0.00033762376237623766,
      "loss": 1.7859,
      "step": 920
    },
    {
      "epoch": 1.7705854355069015,
      "grad_norm": 0.2754090428352356,
      "learning_rate": 0.00033564356435643566,
      "loss": 1.7879,
      "step": 930
    },
    {
      "epoch": 1.789623988576868,
      "grad_norm": 0.26798099279403687,
      "learning_rate": 0.0003336633663366337,
      "loss": 1.7744,
      "step": 940
    },
    {
      "epoch": 1.808662541646835,
      "grad_norm": 0.2651982605457306,
      "learning_rate": 0.0003316831683168317,
      "loss": 1.7813,
      "step": 950
    },
    {
      "epoch": 1.8277010947168015,
      "grad_norm": 0.25073009729385376,
      "learning_rate": 0.0003297029702970297,
      "loss": 1.7875,
      "step": 960
    },
    {
      "epoch": 1.8467396477867681,
      "grad_norm": 0.2663566470146179,
      "learning_rate": 0.00032772277227722775,
      "loss": 1.7795,
      "step": 970
    },
    {
      "epoch": 1.865778200856735,
      "grad_norm": 0.25802338123321533,
      "learning_rate": 0.00032574257425742575,
      "loss": 1.7772,
      "step": 980
    },
    {
      "epoch": 1.8848167539267016,
      "grad_norm": 0.2851213216781616,
      "learning_rate": 0.0003237623762376238,
      "loss": 1.7836,
      "step": 990
    },
    {
      "epoch": 1.9038553069966682,
      "grad_norm": 0.27455398440361023,
      "learning_rate": 0.0003217821782178218,
      "loss": 1.771,
      "step": 1000
    },
    {
      "epoch": 1.9038553069966682,
      "eval_loss": 1.8010673522949219,
      "eval_runtime": 4.1928,
      "eval_samples_per_second": 23.85,
      "eval_steps_per_second": 1.67,
      "step": 1000
    },
    {
      "epoch": 1.922893860066635,
      "grad_norm": 0.27414214611053467,
      "learning_rate": 0.0003198019801980198,
      "loss": 1.7763,
      "step": 1010
    },
    {
      "epoch": 1.9419324131366016,
      "grad_norm": 0.28562483191490173,
      "learning_rate": 0.00031782178217821784,
      "loss": 1.8059,
      "step": 1020
    },
    {
      "epoch": 1.9609709662065682,
      "grad_norm": 0.27301162481307983,
      "learning_rate": 0.00031584158415841584,
      "loss": 1.7853,
      "step": 1030
    },
    {
      "epoch": 1.980009519276535,
      "grad_norm": 0.2673158645629883,
      "learning_rate": 0.0003138613861386139,
      "loss": 1.7867,
      "step": 1040
    },
    {
      "epoch": 1.9990480723465016,
      "grad_norm": 0.2679426074028015,
      "learning_rate": 0.0003118811881188119,
      "loss": 1.7871,
      "step": 1050
    },
    {
      "epoch": 2.018086625416468,
      "grad_norm": 0.28638601303100586,
      "learning_rate": 0.0003099009900990099,
      "loss": 1.7884,
      "step": 1060
    },
    {
      "epoch": 2.037125178486435,
      "grad_norm": 0.26236289739608765,
      "learning_rate": 0.00030792079207920793,
      "loss": 1.767,
      "step": 1070
    },
    {
      "epoch": 2.056163731556402,
      "grad_norm": 0.2774026095867157,
      "learning_rate": 0.00030594059405940593,
      "loss": 1.7735,
      "step": 1080
    },
    {
      "epoch": 2.0752022846263682,
      "grad_norm": 0.28758397698402405,
      "learning_rate": 0.000303960396039604,
      "loss": 1.7833,
      "step": 1090
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 0.25563687086105347,
      "learning_rate": 0.000301980198019802,
      "loss": 1.7741,
      "step": 1100
    },
    {
      "epoch": 2.113279390766302,
      "grad_norm": 0.29064470529556274,
      "learning_rate": 0.0003,
      "loss": 1.7759,
      "step": 1110
    },
    {
      "epoch": 2.1323179438362683,
      "grad_norm": 0.26785504817962646,
      "learning_rate": 0.000298019801980198,
      "loss": 1.7971,
      "step": 1120
    },
    {
      "epoch": 2.151356496906235,
      "grad_norm": 0.26074618101119995,
      "learning_rate": 0.000296039603960396,
      "loss": 1.7843,
      "step": 1130
    },
    {
      "epoch": 2.170395049976202,
      "grad_norm": 0.2896900475025177,
      "learning_rate": 0.00029405940594059407,
      "loss": 1.7732,
      "step": 1140
    },
    {
      "epoch": 2.1894336030461683,
      "grad_norm": 0.2741701602935791,
      "learning_rate": 0.00029207920792079207,
      "loss": 1.7898,
      "step": 1150
    },
    {
      "epoch": 2.208472156116135,
      "grad_norm": 0.28687021136283875,
      "learning_rate": 0.00029009900990099006,
      "loss": 1.7825,
      "step": 1160
    },
    {
      "epoch": 2.227510709186102,
      "grad_norm": 0.27220088243484497,
      "learning_rate": 0.0002881188118811881,
      "loss": 1.7699,
      "step": 1170
    },
    {
      "epoch": 2.2465492622560683,
      "grad_norm": 0.2600407898426056,
      "learning_rate": 0.0002861386138613861,
      "loss": 1.7923,
      "step": 1180
    },
    {
      "epoch": 2.265587815326035,
      "grad_norm": 0.25748902559280396,
      "learning_rate": 0.00028415841584158416,
      "loss": 1.7768,
      "step": 1190
    },
    {
      "epoch": 2.284626368396002,
      "grad_norm": 0.2772551476955414,
      "learning_rate": 0.00028217821782178216,
      "loss": 1.7792,
      "step": 1200
    },
    {
      "epoch": 2.284626368396002,
      "eval_loss": 1.7984042167663574,
      "eval_runtime": 4.2152,
      "eval_samples_per_second": 23.723,
      "eval_steps_per_second": 1.661,
      "step": 1200
    }
  ],
  "logging_steps": 10,
  "max_steps": 2625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9776499976503296e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}