smollm-360M-aliases-5-plus / trainer_state.json
PereLluis13's picture
Model save
913f6f3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996828417380272,
"eval_steps": 500,
"global_step": 1576,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006343165239454488,
"grad_norm": 4.072216572643748,
"learning_rate": 6.329113924050633e-06,
"loss": 3.2618,
"step": 1
},
{
"epoch": 0.003171582619727244,
"grad_norm": 3.6569951400637444,
"learning_rate": 3.1645569620253167e-05,
"loss": 3.3026,
"step": 5
},
{
"epoch": 0.006343165239454488,
"grad_norm": 2.7101736291338634,
"learning_rate": 6.329113924050633e-05,
"loss": 3.1879,
"step": 10
},
{
"epoch": 0.009514747859181731,
"grad_norm": 0.8292602783930105,
"learning_rate": 9.49367088607595e-05,
"loss": 2.9431,
"step": 15
},
{
"epoch": 0.012686330478908976,
"grad_norm": 0.4014812685185947,
"learning_rate": 0.00012658227848101267,
"loss": 2.7946,
"step": 20
},
{
"epoch": 0.01585791309863622,
"grad_norm": 0.3318705203128404,
"learning_rate": 0.00015822784810126583,
"loss": 2.6345,
"step": 25
},
{
"epoch": 0.019029495718363463,
"grad_norm": 0.4987722149702972,
"learning_rate": 0.000189873417721519,
"loss": 2.547,
"step": 30
},
{
"epoch": 0.022201078338090707,
"grad_norm": 0.3868479554032603,
"learning_rate": 0.00022151898734177215,
"loss": 2.462,
"step": 35
},
{
"epoch": 0.02537266095781795,
"grad_norm": 0.3440357488186946,
"learning_rate": 0.00025316455696202533,
"loss": 2.4119,
"step": 40
},
{
"epoch": 0.028544243577545196,
"grad_norm": 0.2542091158949379,
"learning_rate": 0.0002848101265822785,
"loss": 2.3337,
"step": 45
},
{
"epoch": 0.03171582619727244,
"grad_norm": 0.5028112177822042,
"learning_rate": 0.00031645569620253165,
"loss": 2.3079,
"step": 50
},
{
"epoch": 0.034887408816999685,
"grad_norm": 0.3647071858886548,
"learning_rate": 0.00034810126582278487,
"loss": 2.2772,
"step": 55
},
{
"epoch": 0.038058991436726926,
"grad_norm": 0.3365279113242041,
"learning_rate": 0.000379746835443038,
"loss": 2.2643,
"step": 60
},
{
"epoch": 0.041230574056454174,
"grad_norm": 0.3437920074360608,
"learning_rate": 0.0004113924050632912,
"loss": 2.2467,
"step": 65
},
{
"epoch": 0.044402156676181415,
"grad_norm": 0.2180989181512006,
"learning_rate": 0.0004430379746835443,
"loss": 2.2192,
"step": 70
},
{
"epoch": 0.047573739295908656,
"grad_norm": 0.3740832044447792,
"learning_rate": 0.00047468354430379745,
"loss": 2.2202,
"step": 75
},
{
"epoch": 0.0507453219156359,
"grad_norm": 0.4042428788012064,
"learning_rate": 0.0005063291139240507,
"loss": 2.206,
"step": 80
},
{
"epoch": 0.053916904535363144,
"grad_norm": 0.5502810429404877,
"learning_rate": 0.0005379746835443038,
"loss": 2.1792,
"step": 85
},
{
"epoch": 0.05708848715509039,
"grad_norm": 0.9725610364599878,
"learning_rate": 0.000569620253164557,
"loss": 2.1717,
"step": 90
},
{
"epoch": 0.06026006977481763,
"grad_norm": 0.4008236462318082,
"learning_rate": 0.0006012658227848101,
"loss": 2.1581,
"step": 95
},
{
"epoch": 0.06343165239454487,
"grad_norm": 1.1094243654374898,
"learning_rate": 0.0006329113924050633,
"loss": 2.1497,
"step": 100
},
{
"epoch": 0.06660323501427212,
"grad_norm": 0.257873202758346,
"learning_rate": 0.0006645569620253165,
"loss": 2.1357,
"step": 105
},
{
"epoch": 0.06977481763399937,
"grad_norm": 0.4530053303085577,
"learning_rate": 0.0006962025316455697,
"loss": 2.1319,
"step": 110
},
{
"epoch": 0.0729464002537266,
"grad_norm": 0.237617906265262,
"learning_rate": 0.0007278481012658228,
"loss": 2.114,
"step": 115
},
{
"epoch": 0.07611798287345385,
"grad_norm": 1.309988153168323,
"learning_rate": 0.000759493670886076,
"loss": 2.101,
"step": 120
},
{
"epoch": 0.0792895654931811,
"grad_norm": 0.33147972360135136,
"learning_rate": 0.0007911392405063291,
"loss": 2.0983,
"step": 125
},
{
"epoch": 0.08246114811290835,
"grad_norm": 0.6923953247791184,
"learning_rate": 0.0008227848101265824,
"loss": 2.0775,
"step": 130
},
{
"epoch": 0.08563273073263558,
"grad_norm": 0.6108397955993198,
"learning_rate": 0.0008544303797468354,
"loss": 2.076,
"step": 135
},
{
"epoch": 0.08880431335236283,
"grad_norm": 0.25451367249745316,
"learning_rate": 0.0008860759493670886,
"loss": 2.0613,
"step": 140
},
{
"epoch": 0.09197589597209008,
"grad_norm": 0.5696237036165042,
"learning_rate": 0.0009177215189873418,
"loss": 2.0571,
"step": 145
},
{
"epoch": 0.09514747859181731,
"grad_norm": 0.47600183429795934,
"learning_rate": 0.0009493670886075949,
"loss": 2.0597,
"step": 150
},
{
"epoch": 0.09831906121154456,
"grad_norm": 0.49360558045014563,
"learning_rate": 0.0009810126582278482,
"loss": 2.0276,
"step": 155
},
{
"epoch": 0.1014906438312718,
"grad_norm": 0.2619769061768294,
"learning_rate": 0.0009999950915251159,
"loss": 2.049,
"step": 160
},
{
"epoch": 0.10466222645099905,
"grad_norm": 0.25933552218187667,
"learning_rate": 0.0009999398722894419,
"loss": 2.0304,
"step": 165
},
{
"epoch": 0.10783380907072629,
"grad_norm": 0.334897031747453,
"learning_rate": 0.0009998233050230736,
"loss": 2.0144,
"step": 170
},
{
"epoch": 0.11100539169045354,
"grad_norm": 0.31275160877263003,
"learning_rate": 0.0009996454040300758,
"loss": 1.9773,
"step": 175
},
{
"epoch": 0.11417697431018078,
"grad_norm": 0.5307747748580993,
"learning_rate": 0.0009994061911408245,
"loss": 1.9863,
"step": 180
},
{
"epoch": 0.11734855692990802,
"grad_norm": 0.4066591897491239,
"learning_rate": 0.0009991056957093295,
"loss": 1.9812,
"step": 185
},
{
"epoch": 0.12052013954963527,
"grad_norm": 0.40898061678871916,
"learning_rate": 0.0009987439546096308,
"loss": 1.9983,
"step": 190
},
{
"epoch": 0.12369172216936251,
"grad_norm": 0.6281156992876311,
"learning_rate": 0.0009983210122312745,
"loss": 1.9663,
"step": 195
},
{
"epoch": 0.12686330478908975,
"grad_norm": 0.418957878268899,
"learning_rate": 0.000997836920473866,
"loss": 1.9443,
"step": 200
},
{
"epoch": 0.130034887408817,
"grad_norm": 0.2967784254205414,
"learning_rate": 0.000997291738740701,
"loss": 1.9496,
"step": 205
},
{
"epoch": 0.13320647002854424,
"grad_norm": 0.32659030796363414,
"learning_rate": 0.0009966855339314756,
"loss": 1.9394,
"step": 210
},
{
"epoch": 0.1363780526482715,
"grad_norm": 0.44336971944165454,
"learning_rate": 0.0009960183804340781,
"loss": 1.9274,
"step": 215
},
{
"epoch": 0.13954963526799874,
"grad_norm": 0.7513841493295751,
"learning_rate": 0.0009952903601154596,
"loss": 1.937,
"step": 220
},
{
"epoch": 0.142721217887726,
"grad_norm": 0.8840919710762163,
"learning_rate": 0.0009945015623115897,
"loss": 1.9222,
"step": 225
},
{
"epoch": 0.1458928005074532,
"grad_norm": 0.3088103069995084,
"learning_rate": 0.000993652083816491,
"loss": 1.9272,
"step": 230
},
{
"epoch": 0.14906438312718046,
"grad_norm": 0.3205509173648325,
"learning_rate": 0.0009927420288703658,
"loss": 1.9282,
"step": 235
},
{
"epoch": 0.1522359657469077,
"grad_norm": 0.5725328480987164,
"learning_rate": 0.0009917715091467998,
"loss": 1.9092,
"step": 240
},
{
"epoch": 0.15540754836663495,
"grad_norm": 0.6881811737021382,
"learning_rate": 0.000990740643739063,
"loss": 1.9257,
"step": 245
},
{
"epoch": 0.1585791309863622,
"grad_norm": 0.3928160747689014,
"learning_rate": 0.000989649559145493,
"loss": 1.9075,
"step": 250
},
{
"epoch": 0.16175071360608945,
"grad_norm": 0.3457762887590973,
"learning_rate": 0.000988498389253972,
"loss": 1.8954,
"step": 255
},
{
"epoch": 0.1649222962258167,
"grad_norm": 0.6130106535523941,
"learning_rate": 0.0009872872753254995,
"loss": 1.8869,
"step": 260
},
{
"epoch": 0.16809387884554391,
"grad_norm": 0.2006935276789736,
"learning_rate": 0.0009860163659768566,
"loss": 1.8764,
"step": 265
},
{
"epoch": 0.17126546146527116,
"grad_norm": 0.2519600708036085,
"learning_rate": 0.0009846858171623687,
"loss": 1.8592,
"step": 270
},
{
"epoch": 0.1744370440849984,
"grad_norm": 0.21430434382675823,
"learning_rate": 0.0009832957921547696,
"loss": 1.8588,
"step": 275
},
{
"epoch": 0.17760862670472566,
"grad_norm": 0.7316176065198735,
"learning_rate": 0.000981846461525165,
"loss": 1.8442,
"step": 280
},
{
"epoch": 0.1807802093244529,
"grad_norm": 0.5438158046657656,
"learning_rate": 0.0009803380031221018,
"loss": 1.8681,
"step": 285
},
{
"epoch": 0.18395179194418015,
"grad_norm": 0.22290789589006946,
"learning_rate": 0.000978770602049745,
"loss": 1.8342,
"step": 290
},
{
"epoch": 0.1871233745639074,
"grad_norm": 0.2561355818352734,
"learning_rate": 0.0009771444506451621,
"loss": 1.8408,
"step": 295
},
{
"epoch": 0.19029495718363462,
"grad_norm": 0.3381776052738623,
"learning_rate": 0.0009754597484547223,
"loss": 1.829,
"step": 300
},
{
"epoch": 0.19346653980336187,
"grad_norm": 0.2267569653346989,
"learning_rate": 0.0009737167022096094,
"loss": 1.8283,
"step": 305
},
{
"epoch": 0.19663812242308912,
"grad_norm": 0.23165580428938548,
"learning_rate": 0.0009719155258004541,
"loss": 1.8071,
"step": 310
},
{
"epoch": 0.19980970504281637,
"grad_norm": 0.25586494282771377,
"learning_rate": 0.0009700564402510871,
"loss": 1.8145,
"step": 315
},
{
"epoch": 0.2029812876625436,
"grad_norm": 0.2540371949506308,
"learning_rate": 0.0009681396736914168,
"loss": 1.8015,
"step": 320
},
{
"epoch": 0.20615287028227086,
"grad_norm": 0.6388348558478815,
"learning_rate": 0.0009661654613294355,
"loss": 1.8127,
"step": 325
},
{
"epoch": 0.2093244529019981,
"grad_norm": 0.3864015903258655,
"learning_rate": 0.0009641340454223575,
"loss": 1.7935,
"step": 330
},
{
"epoch": 0.21249603552172533,
"grad_norm": 0.2751489116810319,
"learning_rate": 0.0009620456752468903,
"loss": 1.8058,
"step": 335
},
{
"epoch": 0.21566761814145258,
"grad_norm": 0.7422837235361598,
"learning_rate": 0.0009599006070686467,
"loss": 1.7927,
"step": 340
},
{
"epoch": 0.21883920076117983,
"grad_norm": 0.4534291854575538,
"learning_rate": 0.0009576991041106973,
"loss": 1.7927,
"step": 345
},
{
"epoch": 0.22201078338090707,
"grad_norm": 0.3583139746684532,
"learning_rate": 0.0009554414365212709,
"loss": 1.7883,
"step": 350
},
{
"epoch": 0.22518236600063432,
"grad_norm": 0.20634161577455162,
"learning_rate": 0.0009531278813406046,
"loss": 1.7637,
"step": 355
},
{
"epoch": 0.22835394862036157,
"grad_norm": 0.5462716024749192,
"learning_rate": 0.000950758722466947,
"loss": 1.7823,
"step": 360
},
{
"epoch": 0.23152553124008882,
"grad_norm": 0.20847302955466993,
"learning_rate": 0.0009483342506217214,
"loss": 1.7736,
"step": 365
},
{
"epoch": 0.23469711385981604,
"grad_norm": 0.21809684764751344,
"learning_rate": 0.0009458547633138515,
"loss": 1.7636,
"step": 370
},
{
"epoch": 0.23786869647954328,
"grad_norm": 0.19220401784144317,
"learning_rate": 0.0009433205648032528,
"loss": 1.7509,
"step": 375
},
{
"epoch": 0.24104027909927053,
"grad_norm": 0.273271874095809,
"learning_rate": 0.0009407319660634979,
"loss": 1.7488,
"step": 380
},
{
"epoch": 0.24421186171899778,
"grad_norm": 0.31458786826276625,
"learning_rate": 0.0009380892847436555,
"loss": 1.7342,
"step": 385
},
{
"epoch": 0.24738344433872503,
"grad_norm": 0.19789392284188642,
"learning_rate": 0.0009353928451293121,
"loss": 1.743,
"step": 390
},
{
"epoch": 0.2505550269584523,
"grad_norm": 0.24499888411428472,
"learning_rate": 0.0009326429781027789,
"loss": 1.7193,
"step": 395
},
{
"epoch": 0.2537266095781795,
"grad_norm": 0.33173879702411524,
"learning_rate": 0.0009298400211024877,
"loss": 1.729,
"step": 400
},
{
"epoch": 0.25689819219790677,
"grad_norm": 0.34879621136110506,
"learning_rate": 0.0009269843180815853,
"loss": 1.7241,
"step": 405
},
{
"epoch": 0.260069774817634,
"grad_norm": 0.20572899222991778,
"learning_rate": 0.0009240762194657253,
"loss": 1.7229,
"step": 410
},
{
"epoch": 0.26324135743736127,
"grad_norm": 0.21561079654661294,
"learning_rate": 0.0009211160821100679,
"loss": 1.7155,
"step": 415
},
{
"epoch": 0.2664129400570885,
"grad_norm": 0.443393553505543,
"learning_rate": 0.0009181042692554893,
"loss": 1.7111,
"step": 420
},
{
"epoch": 0.2695845226768157,
"grad_norm": 0.2266683853424827,
"learning_rate": 0.0009150411504840086,
"loss": 1.7009,
"step": 425
},
{
"epoch": 0.272756105296543,
"grad_norm": 0.3582735361142464,
"learning_rate": 0.000911927101673436,
"loss": 1.7016,
"step": 430
},
{
"epoch": 0.2759276879162702,
"grad_norm": 0.43342116834945776,
"learning_rate": 0.0009087625049512488,
"loss": 1.7037,
"step": 435
},
{
"epoch": 0.2790992705359975,
"grad_norm": 0.3295875571024751,
"learning_rate": 0.0009055477486476991,
"loss": 1.682,
"step": 440
},
{
"epoch": 0.2822708531557247,
"grad_norm": 0.1891978276034803,
"learning_rate": 0.0009022832272481627,
"loss": 1.6899,
"step": 445
},
{
"epoch": 0.285442435775452,
"grad_norm": 0.26615608448970285,
"learning_rate": 0.000898969341344731,
"loss": 1.6909,
"step": 450
},
{
"epoch": 0.2886140183951792,
"grad_norm": 0.26554406802462666,
"learning_rate": 0.0008956064975870544,
"loss": 1.6764,
"step": 455
},
{
"epoch": 0.2917856010149064,
"grad_norm": 0.20008546513645153,
"learning_rate": 0.0008921951086324411,
"loss": 1.6571,
"step": 460
},
{
"epoch": 0.2949571836346337,
"grad_norm": 0.25575390463894654,
"learning_rate": 0.0008887355930952202,
"loss": 1.6636,
"step": 465
},
{
"epoch": 0.2981287662543609,
"grad_norm": 0.3501161922386378,
"learning_rate": 0.0008852283754953732,
"loss": 1.657,
"step": 470
},
{
"epoch": 0.3013003488740882,
"grad_norm": 0.20707308621635875,
"learning_rate": 0.0008816738862064412,
"loss": 1.6659,
"step": 475
},
{
"epoch": 0.3044719314938154,
"grad_norm": 0.2572060719794171,
"learning_rate": 0.0008780725614027123,
"loss": 1.6521,
"step": 480
},
{
"epoch": 0.3076435141135427,
"grad_norm": 0.2773641851176988,
"learning_rate": 0.000874424843005699,
"loss": 1.6545,
"step": 485
},
{
"epoch": 0.3108150967332699,
"grad_norm": 0.5151669199508683,
"learning_rate": 0.0008707311786299099,
"loss": 1.6512,
"step": 490
},
{
"epoch": 0.3139866793529971,
"grad_norm": 0.35976330322294225,
"learning_rate": 0.0008669920215279222,
"loss": 1.6489,
"step": 495
},
{
"epoch": 0.3171582619727244,
"grad_norm": 0.18626964833018503,
"learning_rate": 0.0008632078305347623,
"loss": 1.6292,
"step": 500
},
{
"epoch": 0.3203298445924516,
"grad_norm": 0.26834931489718644,
"learning_rate": 0.0008593790700116029,
"loss": 1.6244,
"step": 505
},
{
"epoch": 0.3235014272121789,
"grad_norm": 0.24263398553664595,
"learning_rate": 0.0008555062097887796,
"loss": 1.6173,
"step": 510
},
{
"epoch": 0.3266730098319061,
"grad_norm": 0.20406088273168801,
"learning_rate": 0.0008515897251081384,
"loss": 1.6273,
"step": 515
},
{
"epoch": 0.3298445924516334,
"grad_norm": 0.18611145873310309,
"learning_rate": 0.0008476300965647186,
"loss": 1.5954,
"step": 520
},
{
"epoch": 0.3330161750713606,
"grad_norm": 0.2455194958653317,
"learning_rate": 0.0008436278100477775,
"loss": 1.6284,
"step": 525
},
{
"epoch": 0.33618775769108783,
"grad_norm": 0.3089123130431731,
"learning_rate": 0.0008395833566811676,
"loss": 1.6043,
"step": 530
},
{
"epoch": 0.3393593403108151,
"grad_norm": 0.278226103442385,
"learning_rate": 0.0008354972327630705,
"loss": 1.5991,
"step": 535
},
{
"epoch": 0.3425309229305423,
"grad_norm": 0.40283564646452896,
"learning_rate": 0.000831369939705094,
"loss": 1.5942,
"step": 540
},
{
"epoch": 0.3457025055502696,
"grad_norm": 0.667879925495927,
"learning_rate": 0.0008272019839707461,
"loss": 1.5968,
"step": 545
},
{
"epoch": 0.3488740881699968,
"grad_norm": 0.34787949832129605,
"learning_rate": 0.0008229938770132843,
"loss": 1.5815,
"step": 550
},
{
"epoch": 0.3520456707897241,
"grad_norm": 0.2584482438633063,
"learning_rate": 0.0008187461352129555,
"loss": 1.5884,
"step": 555
},
{
"epoch": 0.3552172534094513,
"grad_norm": 0.22742176340374293,
"learning_rate": 0.0008144592798136309,
"loss": 1.5919,
"step": 560
},
{
"epoch": 0.35838883602917854,
"grad_norm": 0.42255520990843093,
"learning_rate": 0.0008101338368588436,
"loss": 1.5913,
"step": 565
},
{
"epoch": 0.3615604186489058,
"grad_norm": 0.3824113474293145,
"learning_rate": 0.0008057703371272366,
"loss": 1.5611,
"step": 570
},
{
"epoch": 0.36473200126863303,
"grad_norm": 0.2091017723989841,
"learning_rate": 0.0008013693160674316,
"loss": 1.5626,
"step": 575
},
{
"epoch": 0.3679035838883603,
"grad_norm": 0.23814734243563393,
"learning_rate": 0.0007969313137323229,
"loss": 1.5656,
"step": 580
},
{
"epoch": 0.37107516650808753,
"grad_norm": 0.2597004458168679,
"learning_rate": 0.0007924568747128076,
"loss": 1.5624,
"step": 585
},
{
"epoch": 0.3742467491278148,
"grad_norm": 0.2949069544402481,
"learning_rate": 0.0007879465480709576,
"loss": 1.5516,
"step": 590
},
{
"epoch": 0.377418331747542,
"grad_norm": 0.21263382790898516,
"learning_rate": 0.0007834008872726453,
"loss": 1.5409,
"step": 595
},
{
"epoch": 0.38058991436726924,
"grad_norm": 0.27681275720229476,
"learning_rate": 0.0007788204501196254,
"loss": 1.5507,
"step": 600
},
{
"epoch": 0.3837614969869965,
"grad_norm": 0.5196324383707882,
"learning_rate": 0.000774205798681088,
"loss": 1.5435,
"step": 605
},
{
"epoch": 0.38693307960672374,
"grad_norm": 0.3397151418636398,
"learning_rate": 0.000769557499224686,
"loss": 1.5292,
"step": 610
},
{
"epoch": 0.390104662226451,
"grad_norm": 0.21757261564984298,
"learning_rate": 0.0007648761221470481,
"loss": 1.5342,
"step": 615
},
{
"epoch": 0.39327624484617824,
"grad_norm": 0.23799713493080946,
"learning_rate": 0.000760162241903785,
"loss": 1.5314,
"step": 620
},
{
"epoch": 0.3964478274659055,
"grad_norm": 0.20955913102047505,
"learning_rate": 0.0007554164369389975,
"loss": 1.5149,
"step": 625
},
{
"epoch": 0.39961941008563273,
"grad_norm": 0.19465193626198848,
"learning_rate": 0.0007506392896142951,
"loss": 1.514,
"step": 630
},
{
"epoch": 0.40279099270535995,
"grad_norm": 0.37370015455345407,
"learning_rate": 0.0007458313861373336,
"loss": 1.5138,
"step": 635
},
{
"epoch": 0.4059625753250872,
"grad_norm": 0.2112845859224254,
"learning_rate": 0.0007409933164898818,
"loss": 1.5024,
"step": 640
},
{
"epoch": 0.40913415794481445,
"grad_norm": 0.24626397881644146,
"learning_rate": 0.0007361256743554241,
"loss": 1.519,
"step": 645
},
{
"epoch": 0.4123057405645417,
"grad_norm": 0.3216374157044185,
"learning_rate": 0.0007312290570463083,
"loss": 1.5039,
"step": 650
},
{
"epoch": 0.41547732318426894,
"grad_norm": 0.22302629969432056,
"learning_rate": 0.0007263040654304502,
"loss": 1.494,
"step": 655
},
{
"epoch": 0.4186489058039962,
"grad_norm": 0.2675317557830398,
"learning_rate": 0.0007213513038575998,
"loss": 1.4884,
"step": 660
},
{
"epoch": 0.42182048842372344,
"grad_norm": 0.2905992631967741,
"learning_rate": 0.0007163713800851811,
"loss": 1.4851,
"step": 665
},
{
"epoch": 0.42499207104345066,
"grad_norm": 0.20033257450058217,
"learning_rate": 0.0007113649052037139,
"loss": 1.475,
"step": 670
},
{
"epoch": 0.42816365366317793,
"grad_norm": 0.24204591478150614,
"learning_rate": 0.0007063324935618264,
"loss": 1.4854,
"step": 675
},
{
"epoch": 0.43133523628290515,
"grad_norm": 0.2121430223132248,
"learning_rate": 0.0007012747626908679,
"loss": 1.4867,
"step": 680
},
{
"epoch": 0.43450681890263243,
"grad_norm": 0.22539040426730952,
"learning_rate": 0.0006961923332291309,
"loss": 1.467,
"step": 685
},
{
"epoch": 0.43767840152235965,
"grad_norm": 0.22695514383581045,
"learning_rate": 0.0006910858288456921,
"loss": 1.4657,
"step": 690
},
{
"epoch": 0.4408499841420869,
"grad_norm": 0.22184474318285244,
"learning_rate": 0.0006859558761638819,
"loss": 1.4423,
"step": 695
},
{
"epoch": 0.44402156676181415,
"grad_norm": 0.29575466467956124,
"learning_rate": 0.0006808031046843901,
"loss": 1.4485,
"step": 700
},
{
"epoch": 0.44719314938154137,
"grad_norm": 0.21488007270980458,
"learning_rate": 0.0006756281467080205,
"loss": 1.4508,
"step": 705
},
{
"epoch": 0.45036473200126864,
"grad_norm": 0.38667469007287536,
"learning_rate": 0.0006704316372580989,
"loss": 1.4459,
"step": 710
},
{
"epoch": 0.45353631462099586,
"grad_norm": 0.5234661249173684,
"learning_rate": 0.0006652142140025517,
"loss": 1.435,
"step": 715
},
{
"epoch": 0.45670789724072314,
"grad_norm": 0.37462414488518325,
"learning_rate": 0.0006599765171756538,
"loss": 1.4379,
"step": 720
},
{
"epoch": 0.45987947986045036,
"grad_norm": 0.3040640640559466,
"learning_rate": 0.0006547191894994679,
"loss": 1.4341,
"step": 725
},
{
"epoch": 0.46305106248017763,
"grad_norm": 0.28687145037107376,
"learning_rate": 0.0006494428761049736,
"loss": 1.4297,
"step": 730
},
{
"epoch": 0.46622264509990485,
"grad_norm": 0.20728566940658133,
"learning_rate": 0.0006441482244529037,
"loss": 1.4124,
"step": 735
},
{
"epoch": 0.4693942277196321,
"grad_norm": 0.21956352378213645,
"learning_rate": 0.0006388358842542938,
"loss": 1.4162,
"step": 740
},
{
"epoch": 0.47256581033935935,
"grad_norm": 0.20961137895482168,
"learning_rate": 0.0006335065073907551,
"loss": 1.4055,
"step": 745
},
{
"epoch": 0.47573739295908657,
"grad_norm": 0.21979161995613117,
"learning_rate": 0.0006281607478344823,
"loss": 1.4112,
"step": 750
},
{
"epoch": 0.47890897557881384,
"grad_norm": 0.2707420648256881,
"learning_rate": 0.0006227992615680033,
"loss": 1.4127,
"step": 755
},
{
"epoch": 0.48208055819854106,
"grad_norm": 0.2829526420993808,
"learning_rate": 0.000617422706503684,
"loss": 1.3905,
"step": 760
},
{
"epoch": 0.48525214081826834,
"grad_norm": 0.2988909342739172,
"learning_rate": 0.0006120317424029943,
"loss": 1.3941,
"step": 765
},
{
"epoch": 0.48842372343799556,
"grad_norm": 0.2787477024270155,
"learning_rate": 0.0006066270307955492,
"loss": 1.404,
"step": 770
},
{
"epoch": 0.4915953060577228,
"grad_norm": 0.22142539860110755,
"learning_rate": 0.000601209234897931,
"loss": 1.3886,
"step": 775
},
{
"epoch": 0.49476688867745006,
"grad_norm": 0.2777507592841434,
"learning_rate": 0.0005957790195323064,
"loss": 1.3896,
"step": 780
},
{
"epoch": 0.4979384712971773,
"grad_norm": 0.2552429702914411,
"learning_rate": 0.0005903370510448447,
"loss": 1.3779,
"step": 785
},
{
"epoch": 0.5011100539169046,
"grad_norm": 0.268362490404369,
"learning_rate": 0.0005848839972239511,
"loss": 1.3732,
"step": 790
},
{
"epoch": 0.5042816365366318,
"grad_norm": 0.24326700144489616,
"learning_rate": 0.0005794205272183205,
"loss": 1.3748,
"step": 795
},
{
"epoch": 0.507453219156359,
"grad_norm": 0.36345823855975784,
"learning_rate": 0.0005739473114548266,
"loss": 1.3755,
"step": 800
},
{
"epoch": 0.5106248017760863,
"grad_norm": 0.4843512449452226,
"learning_rate": 0.000568465021556253,
"loss": 1.3638,
"step": 805
},
{
"epoch": 0.5137963843958135,
"grad_norm": 0.28627979209509896,
"learning_rate": 0.0005629743302588779,
"loss": 1.3514,
"step": 810
},
{
"epoch": 0.5169679670155407,
"grad_norm": 0.25100043970170133,
"learning_rate": 0.0005574759113299217,
"loss": 1.341,
"step": 815
},
{
"epoch": 0.520139549635268,
"grad_norm": 0.294501664896367,
"learning_rate": 0.0005519704394848692,
"loss": 1.3323,
"step": 820
},
{
"epoch": 0.5233111322549953,
"grad_norm": 0.2382026692611784,
"learning_rate": 0.0005464585903046744,
"loss": 1.3483,
"step": 825
},
{
"epoch": 0.5264827148747225,
"grad_norm": 0.3233277006590301,
"learning_rate": 0.0005409410401528587,
"loss": 1.3275,
"step": 830
},
{
"epoch": 0.5296542974944497,
"grad_norm": 0.27343633383605254,
"learning_rate": 0.0005354184660925148,
"loss": 1.3379,
"step": 835
},
{
"epoch": 0.532825880114177,
"grad_norm": 0.23005655087591242,
"learning_rate": 0.0005298915458032233,
"loss": 1.3213,
"step": 840
},
{
"epoch": 0.5359974627339043,
"grad_norm": 0.22247227894179622,
"learning_rate": 0.0005243609574978941,
"loss": 1.3295,
"step": 845
},
{
"epoch": 0.5391690453536314,
"grad_norm": 0.30014284045451645,
"learning_rate": 0.0005188273798395424,
"loss": 1.3214,
"step": 850
},
{
"epoch": 0.5423406279733587,
"grad_norm": 0.3132385853038301,
"learning_rate": 0.0005132914918580093,
"loss": 1.3172,
"step": 855
},
{
"epoch": 0.545512210593086,
"grad_norm": 0.33728113255378034,
"learning_rate": 0.0005077539728666374,
"loss": 1.3218,
"step": 860
},
{
"epoch": 0.5486837932128132,
"grad_norm": 0.25874007616270794,
"learning_rate": 0.0005022155023789121,
"loss": 1.2957,
"step": 865
},
{
"epoch": 0.5518553758325404,
"grad_norm": 0.24203527103405384,
"learning_rate": 0.0004966767600250775,
"loss": 1.3035,
"step": 870
},
{
"epoch": 0.5550269584522677,
"grad_norm": 0.21381303917505012,
"learning_rate": 0.0004911384254687388,
"loss": 1.2995,
"step": 875
},
{
"epoch": 0.558198541071995,
"grad_norm": 0.24304896972645837,
"learning_rate": 0.00048560117832345984,
"loss": 1.2824,
"step": 880
},
{
"epoch": 0.5613701236917221,
"grad_norm": 0.3080366389357483,
"learning_rate": 0.0004800656980693674,
"loss": 1.2898,
"step": 885
},
{
"epoch": 0.5645417063114494,
"grad_norm": 0.262557897437375,
"learning_rate": 0.00047453266396977174,
"loss": 1.2779,
"step": 890
},
{
"epoch": 0.5677132889311767,
"grad_norm": 0.31083455423034645,
"learning_rate": 0.00046900275498781347,
"loss": 1.2806,
"step": 895
},
{
"epoch": 0.570884871550904,
"grad_norm": 0.21597926838814396,
"learning_rate": 0.00046347664970314723,
"loss": 1.274,
"step": 900
},
{
"epoch": 0.5740564541706311,
"grad_norm": 0.22596235970597578,
"learning_rate": 0.0004579550262286731,
"loss": 1.2666,
"step": 905
},
{
"epoch": 0.5772280367903584,
"grad_norm": 0.22827094422158484,
"learning_rate": 0.0004524385621273246,
"loss": 1.2583,
"step": 910
},
{
"epoch": 0.5803996194100857,
"grad_norm": 0.24853325436526866,
"learning_rate": 0.00044692793432892387,
"loss": 1.2693,
"step": 915
},
{
"epoch": 0.5835712020298128,
"grad_norm": 0.2765479869012326,
"learning_rate": 0.00044142381904711624,
"loss": 1.26,
"step": 920
},
{
"epoch": 0.5867427846495401,
"grad_norm": 0.27285996236330706,
"learning_rate": 0.00043592689169639034,
"loss": 1.246,
"step": 925
},
{
"epoch": 0.5899143672692674,
"grad_norm": 0.28781941328826144,
"learning_rate": 0.0004304378268091982,
"loss": 1.249,
"step": 930
},
{
"epoch": 0.5930859498889947,
"grad_norm": 0.240504157977766,
"learning_rate": 0.0004249572979531822,
"loss": 1.2534,
"step": 935
},
{
"epoch": 0.5962575325087218,
"grad_norm": 0.341483100362183,
"learning_rate": 0.0004194859776485216,
"loss": 1.2376,
"step": 940
},
{
"epoch": 0.5994291151284491,
"grad_norm": 0.27130765824409686,
"learning_rate": 0.0004140245372854065,
"loss": 1.2426,
"step": 945
},
{
"epoch": 0.6026006977481764,
"grad_norm": 0.28496801994375115,
"learning_rate": 0.0004085736470416516,
"loss": 1.2347,
"step": 950
},
{
"epoch": 0.6057722803679035,
"grad_norm": 0.33820479660283104,
"learning_rate": 0.00040313397580045765,
"loss": 1.2397,
"step": 955
},
{
"epoch": 0.6089438629876308,
"grad_norm": 0.2537502852561033,
"learning_rate": 0.0003977061910683325,
"loss": 1.2319,
"step": 960
},
{
"epoch": 0.6121154456073581,
"grad_norm": 0.2543562572422921,
"learning_rate": 0.0003922909588931808,
"loss": 1.2221,
"step": 965
},
{
"epoch": 0.6152870282270854,
"grad_norm": 0.28194628415561285,
"learning_rate": 0.0003868889437825724,
"loss": 1.2213,
"step": 970
},
{
"epoch": 0.6184586108468125,
"grad_norm": 0.26751445743912233,
"learning_rate": 0.0003815008086222007,
"loss": 1.211,
"step": 975
},
{
"epoch": 0.6216301934665398,
"grad_norm": 0.22966413613029274,
"learning_rate": 0.0003761272145945388,
"loss": 1.2058,
"step": 980
},
{
"epoch": 0.6248017760862671,
"grad_norm": 0.24668142278345986,
"learning_rate": 0.0003707688210977055,
"loss": 1.2223,
"step": 985
},
{
"epoch": 0.6279733587059942,
"grad_norm": 0.23811743937781157,
"learning_rate": 0.00036542628566455025,
"loss": 1.2024,
"step": 990
},
{
"epoch": 0.6311449413257215,
"grad_norm": 0.2901121774163334,
"learning_rate": 0.0003601002638819665,
"loss": 1.2036,
"step": 995
},
{
"epoch": 0.6343165239454488,
"grad_norm": 0.2600410825499236,
"learning_rate": 0.0003547914093104439,
"loss": 1.2012,
"step": 1000
},
{
"epoch": 0.6374881065651761,
"grad_norm": 0.352563938838776,
"learning_rate": 0.0003495003734038697,
"loss": 1.1751,
"step": 1005
},
{
"epoch": 0.6406596891849032,
"grad_norm": 0.26125000772161344,
"learning_rate": 0.00034422780542958827,
"loss": 1.1919,
"step": 1010
},
{
"epoch": 0.6438312718046305,
"grad_norm": 0.2640437019043301,
"learning_rate": 0.00033897435238872874,
"loss": 1.1781,
"step": 1015
},
{
"epoch": 0.6470028544243578,
"grad_norm": 0.2782272386225361,
"learning_rate": 0.00033374065893681127,
"loss": 1.1821,
"step": 1020
},
{
"epoch": 0.650174437044085,
"grad_norm": 0.24555576527657738,
"learning_rate": 0.0003285273673046409,
"loss": 1.1721,
"step": 1025
},
{
"epoch": 0.6533460196638122,
"grad_norm": 0.40556770599075914,
"learning_rate": 0.00032333511721949817,
"loss": 1.1679,
"step": 1030
},
{
"epoch": 0.6565176022835395,
"grad_norm": 0.25906084663363754,
"learning_rate": 0.00031816454582663856,
"loss": 1.1567,
"step": 1035
},
{
"epoch": 0.6596891849032668,
"grad_norm": 0.27183164743159954,
"learning_rate": 0.0003130162876111074,
"loss": 1.1596,
"step": 1040
},
{
"epoch": 0.6628607675229939,
"grad_norm": 0.24394077297020256,
"learning_rate": 0.0003078909743198817,
"loss": 1.1487,
"step": 1045
},
{
"epoch": 0.6660323501427212,
"grad_norm": 0.23339037702881532,
"learning_rate": 0.000302789234884348,
"loss": 1.1636,
"step": 1050
},
{
"epoch": 0.6692039327624485,
"grad_norm": 0.2651227300122355,
"learning_rate": 0.00029771169534312583,
"loss": 1.1475,
"step": 1055
},
{
"epoch": 0.6723755153821757,
"grad_norm": 0.23719809453094406,
"learning_rate": 0.000292658978765246,
"loss": 1.1496,
"step": 1060
},
{
"epoch": 0.6755470980019029,
"grad_norm": 0.31466276172538943,
"learning_rate": 0.000287631705173693,
"loss": 1.1404,
"step": 1065
},
{
"epoch": 0.6787186806216302,
"grad_norm": 0.2657362830496313,
"learning_rate": 0.00028263049146932153,
"loss": 1.156,
"step": 1070
},
{
"epoch": 0.6818902632413575,
"grad_norm": 0.2353420135393821,
"learning_rate": 0.00027765595135515673,
"loss": 1.1382,
"step": 1075
},
{
"epoch": 0.6850618458610847,
"grad_norm": 0.29180017450918116,
"learning_rate": 0.00027270869526108506,
"loss": 1.1403,
"step": 1080
},
{
"epoch": 0.6882334284808119,
"grad_norm": 0.28381426741820764,
"learning_rate": 0.000267789330268949,
"loss": 1.1351,
"step": 1085
},
{
"epoch": 0.6914050111005392,
"grad_norm": 0.2368326399732858,
"learning_rate": 0.00026289846003805075,
"loss": 1.1264,
"step": 1090
},
{
"epoch": 0.6945765937202664,
"grad_norm": 0.24260754741892487,
"learning_rate": 0.0002580366847310774,
"loss": 1.1318,
"step": 1095
},
{
"epoch": 0.6977481763399936,
"grad_norm": 0.33032504483698477,
"learning_rate": 0.0002532046009404537,
"loss": 1.123,
"step": 1100
},
{
"epoch": 0.7009197589597209,
"grad_norm": 0.2626626593890248,
"learning_rate": 0.00024840280161513446,
"loss": 1.1147,
"step": 1105
},
{
"epoch": 0.7040913415794482,
"grad_norm": 0.24734490639888912,
"learning_rate": 0.0002436318759878432,
"loss": 1.1141,
"step": 1110
},
{
"epoch": 0.7072629241991754,
"grad_norm": 0.25777344330608626,
"learning_rate": 0.00023889240950276602,
"loss": 1.1069,
"step": 1115
},
{
"epoch": 0.7104345068189026,
"grad_norm": 0.24965316567346824,
"learning_rate": 0.00023418498374371268,
"loss": 1.0961,
"step": 1120
},
{
"epoch": 0.7136060894386299,
"grad_norm": 0.2588175173420704,
"learning_rate": 0.0002295101763627483,
"loss": 1.1062,
"step": 1125
},
{
"epoch": 0.7167776720583571,
"grad_norm": 0.2617691894820057,
"learning_rate": 0.00022486856100931146,
"loss": 1.0949,
"step": 1130
},
{
"epoch": 0.7199492546780843,
"grad_norm": 0.24640261475787326,
"learning_rate": 0.00022026070725981867,
"loss": 1.1024,
"step": 1135
},
{
"epoch": 0.7231208372978116,
"grad_norm": 0.25512789636052857,
"learning_rate": 0.0002156871805477732,
"loss": 1.0981,
"step": 1140
},
{
"epoch": 0.7262924199175389,
"grad_norm": 0.2380744076277497,
"learning_rate": 0.00021114854209437889,
"loss": 1.0803,
"step": 1145
},
{
"epoch": 0.7294640025372661,
"grad_norm": 0.26280691701304987,
"learning_rate": 0.00020664534883967311,
"loss": 1.0851,
"step": 1150
},
{
"epoch": 0.7326355851569933,
"grad_norm": 0.2565430586115982,
"learning_rate": 0.00020217815337418427,
"loss": 1.076,
"step": 1155
},
{
"epoch": 0.7358071677767206,
"grad_norm": 0.2533752282987412,
"learning_rate": 0.00019774750387112174,
"loss": 1.0826,
"step": 1160
},
{
"epoch": 0.7389787503964478,
"grad_norm": 0.28126577459530283,
"learning_rate": 0.00019335394401911082,
"loss": 1.0719,
"step": 1165
},
{
"epoch": 0.7421503330161751,
"grad_norm": 0.2650849332335025,
"learning_rate": 0.00018899801295547476,
"loss": 1.0742,
"step": 1170
},
{
"epoch": 0.7453219156359023,
"grad_norm": 0.2603829852111257,
"learning_rate": 0.00018468024520007764,
"loss": 1.0772,
"step": 1175
},
{
"epoch": 0.7484934982556296,
"grad_norm": 0.2527087543783394,
"learning_rate": 0.00018040117058973316,
"loss": 1.0595,
"step": 1180
},
{
"epoch": 0.7516650808753568,
"grad_norm": 0.24678722431639855,
"learning_rate": 0.0001761613142131867,
"loss": 1.0469,
"step": 1185
},
{
"epoch": 0.754836663495084,
"grad_norm": 0.25910814410326344,
"learning_rate": 0.00017196119634668293,
"loss": 1.0627,
"step": 1190
},
{
"epoch": 0.7580082461148113,
"grad_norm": 0.26173306347429054,
"learning_rate": 0.00016780133239012075,
"loss": 1.0607,
"step": 1195
},
{
"epoch": 0.7611798287345385,
"grad_norm": 0.24651016867032868,
"learning_rate": 0.0001636822328038095,
"loss": 1.0546,
"step": 1200
},
{
"epoch": 0.7643514113542658,
"grad_norm": 0.28020957707447064,
"learning_rate": 0.00015960440304582858,
"loss": 1.0579,
"step": 1205
},
{
"epoch": 0.767522993973993,
"grad_norm": 0.26371476098524943,
"learning_rate": 0.00015556834351000354,
"loss": 1.0537,
"step": 1210
},
{
"epoch": 0.7706945765937203,
"grad_norm": 0.24623457163199874,
"learning_rate": 0.0001515745494645019,
"loss": 1.045,
"step": 1215
},
{
"epoch": 0.7738661592134475,
"grad_norm": 0.3155558400199454,
"learning_rate": 0.0001476235109910576,
"loss": 1.0405,
"step": 1220
},
{
"epoch": 0.7770377418331748,
"grad_norm": 0.2789622570986826,
"learning_rate": 0.00014371571292483393,
"loss": 1.0381,
"step": 1225
},
{
"epoch": 0.780209324452902,
"grad_norm": 0.2409114498230053,
"learning_rate": 0.0001398516347949284,
"loss": 1.0394,
"step": 1230
},
{
"epoch": 0.7833809070726292,
"grad_norm": 0.267235707838601,
"learning_rate": 0.0001360317507655293,
"loss": 1.0278,
"step": 1235
},
{
"epoch": 0.7865524896923565,
"grad_norm": 0.28458374786381546,
"learning_rate": 0.00013225652957773044,
"loss": 1.0326,
"step": 1240
},
{
"epoch": 0.7897240723120837,
"grad_norm": 0.25695712786415686,
"learning_rate": 0.00012852643449201212,
"loss": 1.023,
"step": 1245
},
{
"epoch": 0.792895654931811,
"grad_norm": 0.2590457354954553,
"learning_rate": 0.0001248419232313938,
"loss": 1.0232,
"step": 1250
},
{
"epoch": 0.7960672375515382,
"grad_norm": 0.2715843775728456,
"learning_rate": 0.000121203447925266,
"loss": 1.0287,
"step": 1255
},
{
"epoch": 0.7992388201712655,
"grad_norm": 0.2398511137856279,
"learning_rate": 0.00011761145505391024,
"loss": 1.0186,
"step": 1260
},
{
"epoch": 0.8024104027909927,
"grad_norm": 0.27281371167245233,
"learning_rate": 0.00011406638539370979,
"loss": 1.0224,
"step": 1265
},
{
"epoch": 0.8055819854107199,
"grad_norm": 0.3188098317762095,
"learning_rate": 0.00011056867396306292,
"loss": 1.0092,
"step": 1270
},
{
"epoch": 0.8087535680304472,
"grad_norm": 0.3265540754130617,
"learning_rate": 0.00010711874996900023,
"loss": 1.0104,
"step": 1275
},
{
"epoch": 0.8119251506501745,
"grad_norm": 0.2607401452606644,
"learning_rate": 0.00010371703675451733,
"loss": 1.0114,
"step": 1280
},
{
"epoch": 0.8150967332699017,
"grad_norm": 0.2883090335939891,
"learning_rate": 0.0001003639517466256,
"loss": 1.0093,
"step": 1285
},
{
"epoch": 0.8182683158896289,
"grad_norm": 0.25562259108760305,
"learning_rate": 9.705990640512907e-05,
"loss": 0.9938,
"step": 1290
},
{
"epoch": 0.8214398985093562,
"grad_norm": 0.2753573095600564,
"learning_rate": 9.380530617213456e-05,
"loss": 1.0114,
"step": 1295
},
{
"epoch": 0.8246114811290834,
"grad_norm": 0.23998112779723507,
"learning_rate": 9.060055042229881e-05,
"loss": 1.0089,
"step": 1300
},
{
"epoch": 0.8277830637488106,
"grad_norm": 0.2524204007518801,
"learning_rate": 8.74460324138216e-05,
"loss": 1.007,
"step": 1305
},
{
"epoch": 0.8309546463685379,
"grad_norm": 0.2526736715480949,
"learning_rate": 8.434213924018835e-05,
"loss": 1.0,
"step": 1310
},
{
"epoch": 0.8341262289882652,
"grad_norm": 0.2503643121035892,
"learning_rate": 8.128925178266927e-05,
"loss": 0.9965,
"step": 1315
},
{
"epoch": 0.8372978116079924,
"grad_norm": 0.23244697445873563,
"learning_rate": 7.828774466358179e-05,
"loss": 0.9988,
"step": 1320
},
{
"epoch": 0.8404693942277196,
"grad_norm": 0.2560633353876119,
"learning_rate": 7.53379862003195e-05,
"loss": 1.0048,
"step": 1325
},
{
"epoch": 0.8436409768474469,
"grad_norm": 0.2421969390657872,
"learning_rate": 7.244033836015695e-05,
"loss": 0.9844,
"step": 1330
},
{
"epoch": 0.8468125594671742,
"grad_norm": 0.2607793031238756,
"learning_rate": 6.95951567158305e-05,
"loss": 0.9778,
"step": 1335
},
{
"epoch": 0.8499841420869013,
"grad_norm": 0.2698408656759126,
"learning_rate": 6.680279040190746e-05,
"loss": 0.9828,
"step": 1340
},
{
"epoch": 0.8531557247066286,
"grad_norm": 0.23841421968692497,
"learning_rate": 6.406358207194224e-05,
"loss": 0.9991,
"step": 1345
},
{
"epoch": 0.8563273073263559,
"grad_norm": 0.28084531889088754,
"learning_rate": 6.137786785642985e-05,
"loss": 0.9855,
"step": 1350
},
{
"epoch": 0.8594988899460831,
"grad_norm": 0.24806562901660065,
"learning_rate": 5.8745977321558786e-05,
"loss": 0.9747,
"step": 1355
},
{
"epoch": 0.8626704725658103,
"grad_norm": 0.2511271472454086,
"learning_rate": 5.616823342876931e-05,
"loss": 0.9758,
"step": 1360
},
{
"epoch": 0.8658420551855376,
"grad_norm": 0.24241834259427655,
"learning_rate": 5.364495249512336e-05,
"loss": 0.9765,
"step": 1365
},
{
"epoch": 0.8690136378052649,
"grad_norm": 0.23883983545121304,
"learning_rate": 5.11764441544883e-05,
"loss": 0.9808,
"step": 1370
},
{
"epoch": 0.872185220424992,
"grad_norm": 0.24435079120648112,
"learning_rate": 4.8763011319542025e-05,
"loss": 0.9777,
"step": 1375
},
{
"epoch": 0.8753568030447193,
"grad_norm": 0.2629272614210174,
"learning_rate": 4.6404950144602e-05,
"loss": 0.9819,
"step": 1380
},
{
"epoch": 0.8785283856644466,
"grad_norm": 0.2663926565440222,
"learning_rate": 4.4102549989283756e-05,
"loss": 0.9675,
"step": 1385
},
{
"epoch": 0.8816999682841739,
"grad_norm": 0.2431315966754426,
"learning_rate": 4.1856093382994067e-05,
"loss": 0.9617,
"step": 1390
},
{
"epoch": 0.884871550903901,
"grad_norm": 0.2615985135082817,
"learning_rate": 3.966585599026051e-05,
"loss": 0.9705,
"step": 1395
},
{
"epoch": 0.8880431335236283,
"grad_norm": 0.2437571495377507,
"learning_rate": 3.753210657690537e-05,
"loss": 0.9637,
"step": 1400
},
{
"epoch": 0.8912147161433556,
"grad_norm": 0.2867683320851404,
"learning_rate": 3.5455106977064555e-05,
"loss": 0.9813,
"step": 1405
},
{
"epoch": 0.8943862987630827,
"grad_norm": 0.23596873266775822,
"learning_rate": 3.343511206105804e-05,
"loss": 0.9654,
"step": 1410
},
{
"epoch": 0.89755788138281,
"grad_norm": 0.2691994741352151,
"learning_rate": 3.147236970411449e-05,
"loss": 0.9559,
"step": 1415
},
{
"epoch": 0.9007294640025373,
"grad_norm": 0.25848494724563204,
"learning_rate": 2.9567120755953858e-05,
"loss": 0.9631,
"step": 1420
},
{
"epoch": 0.9039010466222646,
"grad_norm": 0.2329147999513356,
"learning_rate": 2.7719599011233333e-05,
"loss": 0.9654,
"step": 1425
},
{
"epoch": 0.9070726292419917,
"grad_norm": 0.24897956262619586,
"learning_rate": 2.593003118085746e-05,
"loss": 0.9686,
"step": 1430
},
{
"epoch": 0.910244211861719,
"grad_norm": 0.24140747398754628,
"learning_rate": 2.4198636864158684e-05,
"loss": 0.9709,
"step": 1435
},
{
"epoch": 0.9134157944814463,
"grad_norm": 0.23859470642150962,
"learning_rate": 2.2525628521949837e-05,
"loss": 0.9723,
"step": 1440
},
{
"epoch": 0.9165873771011734,
"grad_norm": 0.23614657553272203,
"learning_rate": 2.091121145045327e-05,
"loss": 0.96,
"step": 1445
},
{
"epoch": 0.9197589597209007,
"grad_norm": 0.2305582486421353,
"learning_rate": 1.9355583756108407e-05,
"loss": 0.9622,
"step": 1450
},
{
"epoch": 0.922930542340628,
"grad_norm": 0.23818404259591164,
"learning_rate": 1.7858936331262122e-05,
"loss": 0.9612,
"step": 1455
},
{
"epoch": 0.9261021249603553,
"grad_norm": 0.2378653986328734,
"learning_rate": 1.6421452830744365e-05,
"loss": 0.9716,
"step": 1460
},
{
"epoch": 0.9292737075800824,
"grad_norm": 0.2329586634857264,
"learning_rate": 1.5043309649331205e-05,
"loss": 0.9558,
"step": 1465
},
{
"epoch": 0.9324452901998097,
"grad_norm": 0.4063284757979328,
"learning_rate": 1.3724675900099959e-05,
"loss": 0.9654,
"step": 1470
},
{
"epoch": 0.935616872819537,
"grad_norm": 0.24883789905797735,
"learning_rate": 1.246571339367658e-05,
"loss": 0.9603,
"step": 1475
},
{
"epoch": 0.9387884554392641,
"grad_norm": 0.2484198570399255,
"learning_rate": 1.1266576618380097e-05,
"loss": 0.9579,
"step": 1480
},
{
"epoch": 0.9419600380589914,
"grad_norm": 0.24729636213521072,
"learning_rate": 1.0127412721265218e-05,
"loss": 0.9675,
"step": 1485
},
{
"epoch": 0.9451316206787187,
"grad_norm": 0.2505398676306425,
"learning_rate": 9.048361490065548e-06,
"loss": 0.9526,
"step": 1490
},
{
"epoch": 0.948303203298446,
"grad_norm": 0.26466516239326676,
"learning_rate": 8.029555336040383e-06,
"loss": 0.9661,
"step": 1495
},
{
"epoch": 0.9514747859181731,
"grad_norm": 0.24164934386910944,
"learning_rate": 7.071119277726301e-06,
"loss": 0.9577,
"step": 1500
},
{
"epoch": 0.9546463685379004,
"grad_norm": 0.23732953651039135,
"learning_rate": 6.17317092559605e-06,
"loss": 0.9562,
"step": 1505
},
{
"epoch": 0.9578179511576277,
"grad_norm": 0.23388339171880254,
"learning_rate": 5.335820467626485e-06,
"loss": 0.973,
"step": 1510
},
{
"epoch": 0.9609895337773549,
"grad_norm": 0.2318159543809297,
"learning_rate": 4.559170655777267e-06,
"loss": 0.9478,
"step": 1515
},
{
"epoch": 0.9641611163970821,
"grad_norm": 0.24710746850544488,
"learning_rate": 3.843316793382123e-06,
"loss": 0.9707,
"step": 1520
},
{
"epoch": 0.9673326990168094,
"grad_norm": 0.2862292656525582,
"learning_rate": 3.188346723454083e-06,
"loss": 0.9643,
"step": 1525
},
{
"epoch": 0.9705042816365367,
"grad_norm": 0.2380821089662019,
"learning_rate": 2.594340817906271e-06,
"loss": 0.9624,
"step": 1530
},
{
"epoch": 0.9736758642562638,
"grad_norm": 0.23933390384190942,
"learning_rate": 2.0613719676891853e-06,
"loss": 0.9599,
"step": 1535
},
{
"epoch": 0.9768474468759911,
"grad_norm": 0.24854365220185978,
"learning_rate": 1.5895055738465169e-06,
"loss": 0.9592,
"step": 1540
},
{
"epoch": 0.9800190294957184,
"grad_norm": 0.24336558825284596,
"learning_rate": 1.1787995394893502e-06,
"loss": 0.962,
"step": 1545
},
{
"epoch": 0.9831906121154456,
"grad_norm": 0.22749162744166387,
"learning_rate": 8.293042626912328e-07,
"loss": 0.9573,
"step": 1550
},
{
"epoch": 0.9863621947351728,
"grad_norm": 0.2390082283281969,
"learning_rate": 5.410626303034017e-07,
"loss": 0.9556,
"step": 1555
},
{
"epoch": 0.9895337773549001,
"grad_norm": 0.23360184137608606,
"learning_rate": 3.141100126923813e-07,
"loss": 0.9571,
"step": 1560
},
{
"epoch": 0.9927053599746274,
"grad_norm": 0.2367998742996814,
"learning_rate": 1.4847425939956693e-07,
"loss": 0.9495,
"step": 1565
},
{
"epoch": 0.9958769425943546,
"grad_norm": 0.2269345672235601,
"learning_rate": 4.417569572368052e-08,
"loss": 0.9499,
"step": 1570
},
{
"epoch": 0.9990485252140818,
"grad_norm": 0.2484799432231265,
"learning_rate": 1.2271202268210324e-09,
"loss": 0.956,
"step": 1575
},
{
"epoch": 0.9996828417380272,
"eval_loss": 2.245941638946533,
"eval_runtime": 8.442,
"eval_samples_per_second": 46.316,
"eval_steps_per_second": 5.804,
"step": 1576
},
{
"epoch": 0.9996828417380272,
"step": 1576,
"total_flos": 38663670988800.0,
"train_loss": 1.4483577938854393,
"train_runtime": 3409.163,
"train_samples_per_second": 14.797,
"train_steps_per_second": 0.462
}
],
"logging_steps": 5,
"max_steps": 1576,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 38663670988800.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}