final2 / trainer_state.json
{
"best_metric": 1.7984042167663574,
"best_model_checkpoint": "./results/checkpoint-1200",
"epoch": 2.284626368396002,
"eval_steps": 200,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01903855306996668,
"grad_norm": 0.17994017899036407,
"learning_rate": 5e-05,
"loss": 2.1247,
"step": 10
},
{
"epoch": 0.03807710613993336,
"grad_norm": 0.27629706263542175,
"learning_rate": 0.0001,
"loss": 2.0758,
"step": 20
},
{
"epoch": 0.05711565920990005,
"grad_norm": 0.4726850092411041,
"learning_rate": 0.00015,
"loss": 2.0858,
"step": 30
},
{
"epoch": 0.07615421227986673,
"grad_norm": 0.5583528876304626,
"learning_rate": 0.0002,
"loss": 2.0593,
"step": 40
},
{
"epoch": 0.09519276534983341,
"grad_norm": 0.5730186104774475,
"learning_rate": 0.00025,
"loss": 2.0161,
"step": 50
},
{
"epoch": 0.1142313184198001,
"grad_norm": 0.48230308294296265,
"learning_rate": 0.0003,
"loss": 1.9764,
"step": 60
},
{
"epoch": 0.13326987148976677,
"grad_norm": 0.44312751293182373,
"learning_rate": 0.00035,
"loss": 1.9557,
"step": 70
},
{
"epoch": 0.15230842455973345,
"grad_norm": 0.4186476171016693,
"learning_rate": 0.0004,
"loss": 1.9422,
"step": 80
},
{
"epoch": 0.17134697762970014,
"grad_norm": 0.38540077209472656,
"learning_rate": 0.00045000000000000004,
"loss": 1.9189,
"step": 90
},
{
"epoch": 0.19038553069966682,
"grad_norm": 0.35501590371131897,
"learning_rate": 0.0005,
"loss": 1.9254,
"step": 100
},
{
"epoch": 0.2094240837696335,
"grad_norm": 0.40440383553504944,
"learning_rate": 0.000498019801980198,
"loss": 1.9032,
"step": 110
},
{
"epoch": 0.2284626368396002,
"grad_norm": 0.39570745825767517,
"learning_rate": 0.000496039603960396,
"loss": 1.9029,
"step": 120
},
{
"epoch": 0.24750118990956688,
"grad_norm": 0.4123484790325165,
"learning_rate": 0.0004940594059405941,
"loss": 1.8735,
"step": 130
},
{
"epoch": 0.26653974297953353,
"grad_norm": 0.37050503492355347,
"learning_rate": 0.0004920792079207921,
"loss": 1.8739,
"step": 140
},
{
"epoch": 0.28557829604950025,
"grad_norm": 0.4047178030014038,
"learning_rate": 0.0004900990099009901,
"loss": 1.8659,
"step": 150
},
{
"epoch": 0.3046168491194669,
"grad_norm": 0.3643397092819214,
"learning_rate": 0.0004881188118811881,
"loss": 1.8689,
"step": 160
},
{
"epoch": 0.3236554021894336,
"grad_norm": 0.37609240412712097,
"learning_rate": 0.00048613861386138615,
"loss": 1.8599,
"step": 170
},
{
"epoch": 0.3426939552594003,
"grad_norm": 0.3859333395957947,
"learning_rate": 0.00048415841584158414,
"loss": 1.8441,
"step": 180
},
{
"epoch": 0.361732508329367,
"grad_norm": 0.3943149447441101,
"learning_rate": 0.0004821782178217822,
"loss": 1.8366,
"step": 190
},
{
"epoch": 0.38077106139933364,
"grad_norm": 0.41318005323410034,
"learning_rate": 0.0004801980198019802,
"loss": 1.8381,
"step": 200
},
{
"epoch": 0.38077106139933364,
"eval_loss": 1.8646808862686157,
"eval_runtime": 4.2162,
"eval_samples_per_second": 23.718,
"eval_steps_per_second": 1.66,
"step": 200
},
{
"epoch": 0.39980961446930036,
"grad_norm": 0.3635823726654053,
"learning_rate": 0.0004782178217821782,
"loss": 1.8292,
"step": 210
},
{
"epoch": 0.418848167539267,
"grad_norm": 0.3529907166957855,
"learning_rate": 0.00047623762376237624,
"loss": 1.8444,
"step": 220
},
{
"epoch": 0.43788672060923367,
"grad_norm": 0.3581302762031555,
"learning_rate": 0.00047425742574257423,
"loss": 1.8352,
"step": 230
},
{
"epoch": 0.4569252736792004,
"grad_norm": 0.3584224581718445,
"learning_rate": 0.0004722772277227723,
"loss": 1.8319,
"step": 240
},
{
"epoch": 0.47596382674916704,
"grad_norm": 0.3439520299434662,
"learning_rate": 0.0004702970297029703,
"loss": 1.8296,
"step": 250
},
{
"epoch": 0.49500237981913375,
"grad_norm": 0.3635288178920746,
"learning_rate": 0.00046831683168316833,
"loss": 1.8294,
"step": 260
},
{
"epoch": 0.5140409328891005,
"grad_norm": 0.3621940612792969,
"learning_rate": 0.0004663366336633664,
"loss": 1.8245,
"step": 270
},
{
"epoch": 0.5330794859590671,
"grad_norm": 0.3562050759792328,
"learning_rate": 0.0004643564356435644,
"loss": 1.8051,
"step": 280
},
{
"epoch": 0.5521180390290338,
"grad_norm": 0.3374086618423462,
"learning_rate": 0.00046237623762376243,
"loss": 1.8205,
"step": 290
},
{
"epoch": 0.5711565920990005,
"grad_norm": 0.33458590507507324,
"learning_rate": 0.0004603960396039604,
"loss": 1.8238,
"step": 300
},
{
"epoch": 0.5901951451689672,
"grad_norm": 0.3511849045753479,
"learning_rate": 0.0004584158415841584,
"loss": 1.8074,
"step": 310
},
{
"epoch": 0.6092336982389338,
"grad_norm": 0.3680996000766754,
"learning_rate": 0.00045643564356435647,
"loss": 1.8349,
"step": 320
},
{
"epoch": 0.6282722513089005,
"grad_norm": 0.33489343523979187,
"learning_rate": 0.00045445544554455447,
"loss": 1.8304,
"step": 330
},
{
"epoch": 0.6473108043788672,
"grad_norm": 0.3262704908847809,
"learning_rate": 0.0004524752475247525,
"loss": 1.8179,
"step": 340
},
{
"epoch": 0.6663493574488338,
"grad_norm": 0.33311426639556885,
"learning_rate": 0.0004504950495049505,
"loss": 1.8075,
"step": 350
},
{
"epoch": 0.6853879105188005,
"grad_norm": 0.3391004800796509,
"learning_rate": 0.0004485148514851485,
"loss": 1.8124,
"step": 360
},
{
"epoch": 0.7044264635887673,
"grad_norm": 0.34050452709198,
"learning_rate": 0.00044653465346534656,
"loss": 1.8184,
"step": 370
},
{
"epoch": 0.723465016658734,
"grad_norm": 0.320922315120697,
"learning_rate": 0.00044455445544554456,
"loss": 1.8129,
"step": 380
},
{
"epoch": 0.7425035697287006,
"grad_norm": 0.3578341007232666,
"learning_rate": 0.0004425742574257426,
"loss": 1.7989,
"step": 390
},
{
"epoch": 0.7615421227986673,
"grad_norm": 0.31143978238105774,
"learning_rate": 0.0004405940594059406,
"loss": 1.8054,
"step": 400
},
{
"epoch": 0.7615421227986673,
"eval_loss": 1.829106330871582,
"eval_runtime": 4.2436,
"eval_samples_per_second": 23.565,
"eval_steps_per_second": 1.65,
"step": 400
},
{
"epoch": 0.780580675868634,
"grad_norm": 0.3297821581363678,
"learning_rate": 0.0004386138613861386,
"loss": 1.8165,
"step": 410
},
{
"epoch": 0.7996192289386007,
"grad_norm": 0.33798128366470337,
"learning_rate": 0.00043663366336633665,
"loss": 1.8001,
"step": 420
},
{
"epoch": 0.8186577820085673,
"grad_norm": 0.3441774547100067,
"learning_rate": 0.00043465346534653465,
"loss": 1.8057,
"step": 430
},
{
"epoch": 0.837696335078534,
"grad_norm": 0.30104541778564453,
"learning_rate": 0.0004326732673267327,
"loss": 1.8122,
"step": 440
},
{
"epoch": 0.8567348881485007,
"grad_norm": 0.31903618574142456,
"learning_rate": 0.0004306930693069307,
"loss": 1.8099,
"step": 450
},
{
"epoch": 0.8757734412184673,
"grad_norm": 0.31247204542160034,
"learning_rate": 0.0004287128712871287,
"loss": 1.8132,
"step": 460
},
{
"epoch": 0.894811994288434,
"grad_norm": 0.3191291391849518,
"learning_rate": 0.00042673267326732674,
"loss": 1.8143,
"step": 470
},
{
"epoch": 0.9138505473584008,
"grad_norm": 0.3244192600250244,
"learning_rate": 0.00042475247524752474,
"loss": 1.7999,
"step": 480
},
{
"epoch": 0.9328891004283675,
"grad_norm": 0.37674182653427124,
"learning_rate": 0.0004227722772277228,
"loss": 1.8097,
"step": 490
},
{
"epoch": 0.9519276534983341,
"grad_norm": 0.31393611431121826,
"learning_rate": 0.0004207920792079208,
"loss": 1.802,
"step": 500
},
{
"epoch": 0.9709662065683008,
"grad_norm": 0.3186231255531311,
"learning_rate": 0.0004188118811881188,
"loss": 1.8043,
"step": 510
},
{
"epoch": 0.9900047596382675,
"grad_norm": 0.2924995422363281,
"learning_rate": 0.00041683168316831683,
"loss": 1.792,
"step": 520
},
{
"epoch": 1.009043312708234,
"grad_norm": 0.3129435181617737,
"learning_rate": 0.00041485148514851483,
"loss": 1.8009,
"step": 530
},
{
"epoch": 1.028081865778201,
"grad_norm": 0.2927923798561096,
"learning_rate": 0.0004128712871287129,
"loss": 1.8011,
"step": 540
},
{
"epoch": 1.0471204188481675,
"grad_norm": 0.2918388545513153,
"learning_rate": 0.0004108910891089109,
"loss": 1.7946,
"step": 550
},
{
"epoch": 1.0661589719181341,
"grad_norm": 0.2885777950286865,
"learning_rate": 0.0004089108910891089,
"loss": 1.8075,
"step": 560
},
{
"epoch": 1.085197524988101,
"grad_norm": 0.30024921894073486,
"learning_rate": 0.0004069306930693069,
"loss": 1.7824,
"step": 570
},
{
"epoch": 1.1042360780580676,
"grad_norm": 0.2903335988521576,
"learning_rate": 0.000404950495049505,
"loss": 1.7954,
"step": 580
},
{
"epoch": 1.1232746311280342,
"grad_norm": 0.3008085787296295,
"learning_rate": 0.000402970297029703,
"loss": 1.7969,
"step": 590
},
{
"epoch": 1.142313184198001,
"grad_norm": 0.29621192812919617,
"learning_rate": 0.000400990099009901,
"loss": 1.7803,
"step": 600
},
{
"epoch": 1.142313184198001,
"eval_loss": 1.8143733739852905,
"eval_runtime": 4.1557,
"eval_samples_per_second": 24.063,
"eval_steps_per_second": 1.684,
"step": 600
},
{
"epoch": 1.1613517372679676,
"grad_norm": 0.30486541986465454,
"learning_rate": 0.000399009900990099,
"loss": 1.8,
"step": 610
},
{
"epoch": 1.1803902903379344,
"grad_norm": 0.2792316675186157,
"learning_rate": 0.00039702970297029707,
"loss": 1.7822,
"step": 620
},
{
"epoch": 1.199428843407901,
"grad_norm": 0.2918599545955658,
"learning_rate": 0.00039504950495049506,
"loss": 1.7808,
"step": 630
},
{
"epoch": 1.2184673964778676,
"grad_norm": 0.2980496883392334,
"learning_rate": 0.0003930693069306931,
"loss": 1.7952,
"step": 640
},
{
"epoch": 1.2375059495478344,
"grad_norm": 0.31613168120384216,
"learning_rate": 0.0003910891089108911,
"loss": 1.7996,
"step": 650
},
{
"epoch": 1.256544502617801,
"grad_norm": 0.30946284532546997,
"learning_rate": 0.0003891089108910891,
"loss": 1.791,
"step": 660
},
{
"epoch": 1.2755830556877679,
"grad_norm": 0.28848570585250854,
"learning_rate": 0.00038712871287128716,
"loss": 1.782,
"step": 670
},
{
"epoch": 1.2946216087577345,
"grad_norm": 0.2725277543067932,
"learning_rate": 0.00038514851485148515,
"loss": 1.7847,
"step": 680
},
{
"epoch": 1.313660161827701,
"grad_norm": 0.2864035665988922,
"learning_rate": 0.0003831683168316832,
"loss": 1.7938,
"step": 690
},
{
"epoch": 1.332698714897668,
"grad_norm": 0.30256739258766174,
"learning_rate": 0.0003811881188118812,
"loss": 1.7947,
"step": 700
},
{
"epoch": 1.3517372679676345,
"grad_norm": 0.2603744864463806,
"learning_rate": 0.0003792079207920792,
"loss": 1.8028,
"step": 710
},
{
"epoch": 1.370775821037601,
"grad_norm": 0.3716331124305725,
"learning_rate": 0.00037722772277227725,
"loss": 1.7722,
"step": 720
},
{
"epoch": 1.389814374107568,
"grad_norm": 0.35902512073516846,
"learning_rate": 0.00037524752475247524,
"loss": 1.7916,
"step": 730
},
{
"epoch": 1.4088529271775345,
"grad_norm": 0.28538694977760315,
"learning_rate": 0.0003732673267326733,
"loss": 1.7812,
"step": 740
},
{
"epoch": 1.4278914802475011,
"grad_norm": 0.29331693053245544,
"learning_rate": 0.0003712871287128713,
"loss": 1.7983,
"step": 750
},
{
"epoch": 1.446930033317468,
"grad_norm": 0.31655997037887573,
"learning_rate": 0.0003693069306930693,
"loss": 1.7983,
"step": 760
},
{
"epoch": 1.4659685863874345,
"grad_norm": 0.29052191972732544,
"learning_rate": 0.00036732673267326734,
"loss": 1.8021,
"step": 770
},
{
"epoch": 1.4850071394574011,
"grad_norm": 0.2977640628814697,
"learning_rate": 0.00036534653465346533,
"loss": 1.7702,
"step": 780
},
{
"epoch": 1.504045692527368,
"grad_norm": 0.27408239245414734,
"learning_rate": 0.0003633663366336634,
"loss": 1.7836,
"step": 790
},
{
"epoch": 1.5230842455973346,
"grad_norm": 0.29241588711738586,
"learning_rate": 0.0003613861386138614,
"loss": 1.8005,
"step": 800
},
{
"epoch": 1.5230842455973346,
"eval_loss": 1.805577039718628,
"eval_runtime": 4.2437,
"eval_samples_per_second": 23.564,
"eval_steps_per_second": 1.649,
"step": 800
},
{
"epoch": 1.5421227986673012,
"grad_norm": 0.2775736451148987,
"learning_rate": 0.0003594059405940594,
"loss": 1.7725,
"step": 810
},
{
"epoch": 1.561161351737268,
"grad_norm": 0.2777954339981079,
"learning_rate": 0.00035742574257425743,
"loss": 1.7921,
"step": 820
},
{
"epoch": 1.5801999048072346,
"grad_norm": 0.27932244539260864,
"learning_rate": 0.0003554455445544554,
"loss": 1.7853,
"step": 830
},
{
"epoch": 1.5992384578772012,
"grad_norm": 0.28905799984931946,
"learning_rate": 0.0003534653465346535,
"loss": 1.785,
"step": 840
},
{
"epoch": 1.618277010947168,
"grad_norm": 0.2713293433189392,
"learning_rate": 0.00035148514851485147,
"loss": 1.7959,
"step": 850
},
{
"epoch": 1.6373155640171349,
"grad_norm": 0.27542880177497864,
"learning_rate": 0.00034950495049504947,
"loss": 1.791,
"step": 860
},
{
"epoch": 1.6563541170871012,
"grad_norm": 0.3243546783924103,
"learning_rate": 0.0003475247524752475,
"loss": 1.7831,
"step": 870
},
{
"epoch": 1.675392670157068,
"grad_norm": 0.2858756184577942,
"learning_rate": 0.0003455445544554455,
"loss": 1.7829,
"step": 880
},
{
"epoch": 1.6944312232270349,
"grad_norm": 0.28570687770843506,
"learning_rate": 0.0003435643564356436,
"loss": 1.7793,
"step": 890
},
{
"epoch": 1.7134697762970015,
"grad_norm": 0.2588244080543518,
"learning_rate": 0.0003415841584158416,
"loss": 1.796,
"step": 900
},
{
"epoch": 1.732508329366968,
"grad_norm": 0.2729063928127289,
"learning_rate": 0.0003396039603960396,
"loss": 1.7789,
"step": 910
},
{
"epoch": 1.751546882436935,
"grad_norm": 0.2799668312072754,
"learning_rate": 0.00033762376237623766,
"loss": 1.7859,
"step": 920
},
{
"epoch": 1.7705854355069015,
"grad_norm": 0.2754090428352356,
"learning_rate": 0.00033564356435643566,
"loss": 1.7879,
"step": 930
},
{
"epoch": 1.789623988576868,
"grad_norm": 0.26798099279403687,
"learning_rate": 0.0003336633663366337,
"loss": 1.7744,
"step": 940
},
{
"epoch": 1.808662541646835,
"grad_norm": 0.2651982605457306,
"learning_rate": 0.0003316831683168317,
"loss": 1.7813,
"step": 950
},
{
"epoch": 1.8277010947168015,
"grad_norm": 0.25073009729385376,
"learning_rate": 0.0003297029702970297,
"loss": 1.7875,
"step": 960
},
{
"epoch": 1.8467396477867681,
"grad_norm": 0.2663566470146179,
"learning_rate": 0.00032772277227722775,
"loss": 1.7795,
"step": 970
},
{
"epoch": 1.865778200856735,
"grad_norm": 0.25802338123321533,
"learning_rate": 0.00032574257425742575,
"loss": 1.7772,
"step": 980
},
{
"epoch": 1.8848167539267016,
"grad_norm": 0.2851213216781616,
"learning_rate": 0.0003237623762376238,
"loss": 1.7836,
"step": 990
},
{
"epoch": 1.9038553069966682,
"grad_norm": 0.27455398440361023,
"learning_rate": 0.0003217821782178218,
"loss": 1.771,
"step": 1000
},
{
"epoch": 1.9038553069966682,
"eval_loss": 1.8010673522949219,
"eval_runtime": 4.1928,
"eval_samples_per_second": 23.85,
"eval_steps_per_second": 1.67,
"step": 1000
},
{
"epoch": 1.922893860066635,
"grad_norm": 0.27414214611053467,
"learning_rate": 0.0003198019801980198,
"loss": 1.7763,
"step": 1010
},
{
"epoch": 1.9419324131366016,
"grad_norm": 0.28562483191490173,
"learning_rate": 0.00031782178217821784,
"loss": 1.8059,
"step": 1020
},
{
"epoch": 1.9609709662065682,
"grad_norm": 0.27301162481307983,
"learning_rate": 0.00031584158415841584,
"loss": 1.7853,
"step": 1030
},
{
"epoch": 1.980009519276535,
"grad_norm": 0.2673158645629883,
"learning_rate": 0.0003138613861386139,
"loss": 1.7867,
"step": 1040
},
{
"epoch": 1.9990480723465016,
"grad_norm": 0.2679426074028015,
"learning_rate": 0.0003118811881188119,
"loss": 1.7871,
"step": 1050
},
{
"epoch": 2.018086625416468,
"grad_norm": 0.28638601303100586,
"learning_rate": 0.0003099009900990099,
"loss": 1.7884,
"step": 1060
},
{
"epoch": 2.037125178486435,
"grad_norm": 0.26236289739608765,
"learning_rate": 0.00030792079207920793,
"loss": 1.767,
"step": 1070
},
{
"epoch": 2.056163731556402,
"grad_norm": 0.2774026095867157,
"learning_rate": 0.00030594059405940593,
"loss": 1.7735,
"step": 1080
},
{
"epoch": 2.0752022846263682,
"grad_norm": 0.28758397698402405,
"learning_rate": 0.000303960396039604,
"loss": 1.7833,
"step": 1090
},
{
"epoch": 2.094240837696335,
"grad_norm": 0.25563687086105347,
"learning_rate": 0.000301980198019802,
"loss": 1.7741,
"step": 1100
},
{
"epoch": 2.113279390766302,
"grad_norm": 0.29064470529556274,
"learning_rate": 0.0003,
"loss": 1.7759,
"step": 1110
},
{
"epoch": 2.1323179438362683,
"grad_norm": 0.26785504817962646,
"learning_rate": 0.000298019801980198,
"loss": 1.7971,
"step": 1120
},
{
"epoch": 2.151356496906235,
"grad_norm": 0.26074618101119995,
"learning_rate": 0.000296039603960396,
"loss": 1.7843,
"step": 1130
},
{
"epoch": 2.170395049976202,
"grad_norm": 0.2896900475025177,
"learning_rate": 0.00029405940594059407,
"loss": 1.7732,
"step": 1140
},
{
"epoch": 2.1894336030461683,
"grad_norm": 0.2741701602935791,
"learning_rate": 0.00029207920792079207,
"loss": 1.7898,
"step": 1150
},
{
"epoch": 2.208472156116135,
"grad_norm": 0.28687021136283875,
"learning_rate": 0.00029009900990099006,
"loss": 1.7825,
"step": 1160
},
{
"epoch": 2.227510709186102,
"grad_norm": 0.27220088243484497,
"learning_rate": 0.0002881188118811881,
"loss": 1.7699,
"step": 1170
},
{
"epoch": 2.2465492622560683,
"grad_norm": 0.2600407898426056,
"learning_rate": 0.0002861386138613861,
"loss": 1.7923,
"step": 1180
},
{
"epoch": 2.265587815326035,
"grad_norm": 0.25748902559280396,
"learning_rate": 0.00028415841584158416,
"loss": 1.7768,
"step": 1190
},
{
"epoch": 2.284626368396002,
"grad_norm": 0.2772551476955414,
"learning_rate": 0.00028217821782178216,
"loss": 1.7792,
"step": 1200
},
{
"epoch": 2.284626368396002,
"eval_loss": 1.7984042167663574,
"eval_runtime": 4.2152,
"eval_samples_per_second": 23.723,
"eval_steps_per_second": 1.661,
"step": 1200
}
],
"logging_steps": 10,
"max_steps": 2625,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9776499976503296e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}