{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996830427892235,
"eval_steps": 200,
"global_step": 1577,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006339144215530904,
"grad_norm": 0.1844951284226507,
"learning_rate": 1.2658227848101265e-06,
"loss": 0.2445,
"step": 1
},
{
"epoch": 0.003169572107765452,
"grad_norm": 0.4228474323435137,
"learning_rate": 6.329113924050633e-06,
"loss": 0.5751,
"step": 5
},
{
"epoch": 0.006339144215530904,
"grad_norm": 0.44943947868261674,
"learning_rate": 1.2658227848101267e-05,
"loss": 0.6355,
"step": 10
},
{
"epoch": 0.009508716323296355,
"grad_norm": 0.4388677072263688,
"learning_rate": 1.89873417721519e-05,
"loss": 0.5644,
"step": 15
},
{
"epoch": 0.012678288431061807,
"grad_norm": 0.42271873762000256,
"learning_rate": 2.5316455696202533e-05,
"loss": 0.6074,
"step": 20
},
{
"epoch": 0.01584786053882726,
"grad_norm": 0.394168038759961,
"learning_rate": 3.1645569620253167e-05,
"loss": 0.5618,
"step": 25
},
{
"epoch": 0.01901743264659271,
"grad_norm": 0.3572253194331872,
"learning_rate": 3.79746835443038e-05,
"loss": 0.3782,
"step": 30
},
{
"epoch": 0.022187004754358162,
"grad_norm": 0.412555556410598,
"learning_rate": 4.430379746835443e-05,
"loss": 0.5004,
"step": 35
},
{
"epoch": 0.025356576862123614,
"grad_norm": 0.3794217410787016,
"learning_rate": 5.0632911392405066e-05,
"loss": 0.4466,
"step": 40
},
{
"epoch": 0.028526148969889066,
"grad_norm": 0.35048440162730615,
"learning_rate": 5.69620253164557e-05,
"loss": 0.4673,
"step": 45
},
{
"epoch": 0.03169572107765452,
"grad_norm": 0.5944470916433041,
"learning_rate": 6.329113924050633e-05,
"loss": 0.4321,
"step": 50
},
{
"epoch": 0.03486529318541997,
"grad_norm": 0.42565747209046223,
"learning_rate": 6.962025316455697e-05,
"loss": 0.3458,
"step": 55
},
{
"epoch": 0.03803486529318542,
"grad_norm": 0.4086754614850085,
"learning_rate": 7.59493670886076e-05,
"loss": 0.3853,
"step": 60
},
{
"epoch": 0.04120443740095087,
"grad_norm": 0.4100402315211435,
"learning_rate": 8.227848101265824e-05,
"loss": 0.3248,
"step": 65
},
{
"epoch": 0.044374009508716325,
"grad_norm": 0.439691902474589,
"learning_rate": 8.860759493670887e-05,
"loss": 0.3902,
"step": 70
},
{
"epoch": 0.04754358161648178,
"grad_norm": 0.4789983229510449,
"learning_rate": 9.493670886075949e-05,
"loss": 0.4375,
"step": 75
},
{
"epoch": 0.05071315372424723,
"grad_norm": 0.4326295423185557,
"learning_rate": 0.00010126582278481013,
"loss": 0.344,
"step": 80
},
{
"epoch": 0.05388272583201268,
"grad_norm": 0.4082745764246507,
"learning_rate": 0.00010759493670886076,
"loss": 0.373,
"step": 85
},
{
"epoch": 0.05705229793977813,
"grad_norm": 0.44917869809554406,
"learning_rate": 0.0001139240506329114,
"loss": 0.3366,
"step": 90
},
{
"epoch": 0.060221870047543584,
"grad_norm": 0.4099898854278625,
"learning_rate": 0.00012025316455696203,
"loss": 0.3827,
"step": 95
},
{
"epoch": 0.06339144215530904,
"grad_norm": 0.5173612120396457,
"learning_rate": 0.00012658227848101267,
"loss": 0.3913,
"step": 100
},
{
"epoch": 0.06656101426307448,
"grad_norm": 0.4695908910723305,
"learning_rate": 0.0001329113924050633,
"loss": 0.3285,
"step": 105
},
{
"epoch": 0.06973058637083994,
"grad_norm": 0.34610029250066504,
"learning_rate": 0.00013924050632911395,
"loss": 0.3542,
"step": 110
},
{
"epoch": 0.07290015847860538,
"grad_norm": 0.3833563232036365,
"learning_rate": 0.00014556962025316457,
"loss": 0.3442,
"step": 115
},
{
"epoch": 0.07606973058637084,
"grad_norm": 0.38597736664868315,
"learning_rate": 0.0001518987341772152,
"loss": 0.3499,
"step": 120
},
{
"epoch": 0.07923930269413629,
"grad_norm": 0.4555824320300245,
"learning_rate": 0.00015822784810126583,
"loss": 0.3843,
"step": 125
},
{
"epoch": 0.08240887480190175,
"grad_norm": 0.44058959604469733,
"learning_rate": 0.00016455696202531648,
"loss": 0.3321,
"step": 130
},
{
"epoch": 0.08557844690966719,
"grad_norm": 0.37513672150754146,
"learning_rate": 0.0001708860759493671,
"loss": 0.3409,
"step": 135
},
{
"epoch": 0.08874801901743265,
"grad_norm": 0.3532888739409051,
"learning_rate": 0.00017721518987341773,
"loss": 0.3388,
"step": 140
},
{
"epoch": 0.0919175911251981,
"grad_norm": 0.31398944959900404,
"learning_rate": 0.00018354430379746836,
"loss": 0.3407,
"step": 145
},
{
"epoch": 0.09508716323296355,
"grad_norm": 0.48473648286443866,
"learning_rate": 0.00018987341772151899,
"loss": 0.4109,
"step": 150
},
{
"epoch": 0.098256735340729,
"grad_norm": 0.3832743712760423,
"learning_rate": 0.00019620253164556964,
"loss": 0.2894,
"step": 155
},
{
"epoch": 0.10142630744849446,
"grad_norm": 0.3576599310136604,
"learning_rate": 0.00019999901968817678,
"loss": 0.3685,
"step": 160
},
{
"epoch": 0.1045958795562599,
"grad_norm": 0.4041268184733326,
"learning_rate": 0.0001999879914008964,
"loss": 0.3103,
"step": 165
},
{
"epoch": 0.10776545166402536,
"grad_norm": 0.348710082889974,
"learning_rate": 0.00019996471079244477,
"loss": 0.3686,
"step": 170
},
{
"epoch": 0.1109350237717908,
"grad_norm": 0.3641139077278622,
"learning_rate": 0.0001999291807155794,
"loss": 0.3672,
"step": 175
},
{
"epoch": 0.11410459587955626,
"grad_norm": 0.34875291735749603,
"learning_rate": 0.0001998814055240823,
"loss": 0.3289,
"step": 180
},
{
"epoch": 0.11727416798732171,
"grad_norm": 0.35868082118594846,
"learning_rate": 0.00019982139107222632,
"loss": 0.3843,
"step": 185
},
{
"epoch": 0.12044374009508717,
"grad_norm": 0.2975053354861811,
"learning_rate": 0.000199749144714058,
"loss": 0.3187,
"step": 190
},
{
"epoch": 0.12361331220285261,
"grad_norm": 0.3926097041806586,
"learning_rate": 0.00019966467530249627,
"loss": 0.3711,
"step": 195
},
{
"epoch": 0.12678288431061807,
"grad_norm": 0.39235636818547276,
"learning_rate": 0.00019956799318824776,
"loss": 0.3599,
"step": 200
},
{
"epoch": 0.12678288431061807,
"eval_loss": 0.31717613339424133,
"eval_runtime": 878.4135,
"eval_samples_per_second": 4.554,
"eval_steps_per_second": 0.569,
"step": 200
},
{
"epoch": 0.12995245641838352,
"grad_norm": 0.32366959300654363,
"learning_rate": 0.00019945911021853818,
"loss": 0.2671,
"step": 205
},
{
"epoch": 0.13312202852614896,
"grad_norm": 0.34183927553766114,
"learning_rate": 0.00019933803973566102,
"loss": 0.3491,
"step": 210
},
{
"epoch": 0.13629160063391443,
"grad_norm": 0.355629049879592,
"learning_rate": 0.0001992047965753422,
"loss": 0.2778,
"step": 215
},
{
"epoch": 0.13946117274167988,
"grad_norm": 0.31194706241410036,
"learning_rate": 0.00019905939706492238,
"loss": 0.3278,
"step": 220
},
{
"epoch": 0.14263074484944532,
"grad_norm": 0.37190501088914274,
"learning_rate": 0.0001989018590213561,
"loss": 0.3757,
"step": 225
},
{
"epoch": 0.14580031695721077,
"grad_norm": 0.30859177154159206,
"learning_rate": 0.00019873220174902858,
"loss": 0.2952,
"step": 230
},
{
"epoch": 0.14896988906497624,
"grad_norm": 0.4072493051692793,
"learning_rate": 0.0001985504460373903,
"loss": 0.3576,
"step": 235
},
{
"epoch": 0.15213946117274169,
"grad_norm": 0.3117614582623609,
"learning_rate": 0.00019835661415840928,
"loss": 0.3127,
"step": 240
},
{
"epoch": 0.15530903328050713,
"grad_norm": 0.3433870206019631,
"learning_rate": 0.00019815072986384218,
"loss": 0.3424,
"step": 245
},
{
"epoch": 0.15847860538827258,
"grad_norm": 0.3252374107324197,
"learning_rate": 0.0001979328183823236,
"loss": 0.3509,
"step": 250
},
{
"epoch": 0.16164817749603805,
"grad_norm": 0.32574757253252834,
"learning_rate": 0.00019770290641627468,
"loss": 0.2913,
"step": 255
},
{
"epoch": 0.1648177496038035,
"grad_norm": 0.37343408069668577,
"learning_rate": 0.00019746102213863114,
"loss": 0.3524,
"step": 260
},
{
"epoch": 0.16798732171156894,
"grad_norm": 0.30197216412790706,
"learning_rate": 0.00019720719518939083,
"loss": 0.295,
"step": 265
},
{
"epoch": 0.17115689381933438,
"grad_norm": 0.37750434171669517,
"learning_rate": 0.00019694145667198195,
"loss": 0.3215,
"step": 270
},
{
"epoch": 0.17432646592709986,
"grad_norm": 0.3368196048030473,
"learning_rate": 0.0001966638391494514,
"loss": 0.35,
"step": 275
},
{
"epoch": 0.1774960380348653,
"grad_norm": 0.3232595651729065,
"learning_rate": 0.0001963743766404749,
"loss": 0.2637,
"step": 280
},
{
"epoch": 0.18066561014263074,
"grad_norm": 0.32199548202560035,
"learning_rate": 0.00019607310461518818,
"loss": 0.3262,
"step": 285
},
{
"epoch": 0.1838351822503962,
"grad_norm": 0.29117926540088634,
"learning_rate": 0.0001957600599908406,
"loss": 0.3129,
"step": 290
},
{
"epoch": 0.18700475435816163,
"grad_norm": 0.2836794081153409,
"learning_rate": 0.00019543528112727146,
"loss": 0.3207,
"step": 295
},
{
"epoch": 0.1901743264659271,
"grad_norm": 0.37478385305484463,
"learning_rate": 0.0001950988078222093,
"loss": 0.3503,
"step": 300
},
{
"epoch": 0.19334389857369255,
"grad_norm": 0.3323790483161259,
"learning_rate": 0.00019475068130639543,
"loss": 0.2873,
"step": 305
},
{
"epoch": 0.196513470681458,
"grad_norm": 0.31045326503955184,
"learning_rate": 0.0001943909442385313,
"loss": 0.3379,
"step": 310
},
{
"epoch": 0.19968304278922344,
"grad_norm": 0.295428110940092,
"learning_rate": 0.00019401964070005144,
"loss": 0.2913,
"step": 315
},
{
"epoch": 0.20285261489698891,
"grad_norm": 0.31381749704770145,
"learning_rate": 0.00019363681618972164,
"loss": 0.3167,
"step": 320
},
{
"epoch": 0.20602218700475436,
"grad_norm": 0.3799683908480184,
"learning_rate": 0.00019324251761806374,
"loss": 0.3203,
"step": 325
},
{
"epoch": 0.2091917591125198,
"grad_norm": 0.25669447806119594,
"learning_rate": 0.00019283679330160726,
"loss": 0.2598,
"step": 330
},
{
"epoch": 0.21236133122028525,
"grad_norm": 0.3253285501894849,
"learning_rate": 0.00019241969295696879,
"loss": 0.321,
"step": 335
},
{
"epoch": 0.21553090332805072,
"grad_norm": 0.3015776648780859,
"learning_rate": 0.0001919912676947598,
"loss": 0.2912,
"step": 340
},
{
"epoch": 0.21870047543581617,
"grad_norm": 0.3548152436637532,
"learning_rate": 0.00019155157001332374,
"loss": 0.3398,
"step": 345
},
{
"epoch": 0.2218700475435816,
"grad_norm": 0.3562179525646546,
"learning_rate": 0.00019110065379230289,
"loss": 0.3575,
"step": 350
},
{
"epoch": 0.22503961965134706,
"grad_norm": 0.33759944051182883,
"learning_rate": 0.00019063857428603615,
"loss": 0.2644,
"step": 355
},
{
"epoch": 0.22820919175911253,
"grad_norm": 0.3478332359179607,
"learning_rate": 0.00019016538811678823,
"loss": 0.3421,
"step": 360
},
{
"epoch": 0.23137876386687797,
"grad_norm": 0.3107602080624315,
"learning_rate": 0.0001896811532678113,
"loss": 0.262,
"step": 365
},
{
"epoch": 0.23454833597464342,
"grad_norm": 0.26971775917740104,
"learning_rate": 0.00018918592907623985,
"loss": 0.3378,
"step": 370
},
{
"epoch": 0.23771790808240886,
"grad_norm": 0.32413332448217697,
"learning_rate": 0.00018867977622581957,
"loss": 0.3316,
"step": 375
},
{
"epoch": 0.24088748019017434,
"grad_norm": 0.3522975093101741,
"learning_rate": 0.00018816275673947148,
"loss": 0.2678,
"step": 380
},
{
"epoch": 0.24405705229793978,
"grad_norm": 0.31661852350790726,
"learning_rate": 0.00018763493397169146,
"loss": 0.3275,
"step": 385
},
{
"epoch": 0.24722662440570523,
"grad_norm": 0.27090727261610936,
"learning_rate": 0.00018709637260078729,
"loss": 0.2858,
"step": 390
},
{
"epoch": 0.25039619651347067,
"grad_norm": 0.3143474617991223,
"learning_rate": 0.0001865471386209527,
"loss": 0.3317,
"step": 395
},
{
"epoch": 0.25356576862123614,
"grad_norm": 0.48811153855723693,
"learning_rate": 0.000185987299334181,
"loss": 0.3295,
"step": 400
},
{
"epoch": 0.25356576862123614,
"eval_loss": 0.29194891452789307,
"eval_runtime": 872.9978,
"eval_samples_per_second": 4.582,
"eval_steps_per_second": 0.573,
"step": 400
},
{
"epoch": 0.25673534072900156,
"grad_norm": 0.31755342222995686,
"learning_rate": 0.00018541692334201771,
"loss": 0.2643,
"step": 405
},
{
"epoch": 0.25990491283676703,
"grad_norm": 0.34778059073770806,
"learning_rate": 0.0001848360805371544,
"loss": 0.3339,
"step": 410
},
{
"epoch": 0.2630744849445325,
"grad_norm": 0.3183073063986642,
"learning_rate": 0.00018424484209486416,
"loss": 0.2673,
"step": 415
},
{
"epoch": 0.2662440570522979,
"grad_norm": 0.2788199901083398,
"learning_rate": 0.00018364328046428,
"loss": 0.3272,
"step": 420
},
{
"epoch": 0.2694136291600634,
"grad_norm": 0.3666143727147526,
"learning_rate": 0.00018303146935951689,
"loss": 0.3247,
"step": 425
},
{
"epoch": 0.27258320126782887,
"grad_norm": 0.28586548327038175,
"learning_rate": 0.00018240948375063926,
"loss": 0.2792,
"step": 430
},
{
"epoch": 0.2757527733755943,
"grad_norm": 0.9727255846044429,
"learning_rate": 0.00018177739985447412,
"loss": 0.3485,
"step": 435
},
{
"epoch": 0.27892234548335976,
"grad_norm": 0.29065854553956355,
"learning_rate": 0.0001811352951252717,
"loss": 0.2729,
"step": 440
},
{
"epoch": 0.2820919175911252,
"grad_norm": 0.320575993183303,
"learning_rate": 0.0001804832482452142,
"loss": 0.3354,
"step": 445
},
{
"epoch": 0.28526148969889065,
"grad_norm": 0.34869737354697955,
"learning_rate": 0.0001798213391147746,
"loss": 0.3385,
"step": 450
},
{
"epoch": 0.2884310618066561,
"grad_norm": 0.31478642211651564,
"learning_rate": 0.00017914964884292544,
"loss": 0.3133,
"step": 455
},
{
"epoch": 0.29160063391442154,
"grad_norm": 0.36834278711947965,
"learning_rate": 0.0001784682597372,
"loss": 0.3593,
"step": 460
},
{
"epoch": 0.294770206022187,
"grad_norm": 0.2791902388221146,
"learning_rate": 0.00017777725529360676,
"loss": 0.3005,
"step": 465
},
{
"epoch": 0.2979397781299525,
"grad_norm": 0.30096452678752406,
"learning_rate": 0.00017707672018639758,
"loss": 0.3354,
"step": 470
},
{
"epoch": 0.3011093502377179,
"grad_norm": 0.3708048891578612,
"learning_rate": 0.00017636674025769215,
"loss": 0.3147,
"step": 475
},
{
"epoch": 0.30427892234548337,
"grad_norm": 0.305209122691005,
"learning_rate": 0.00017564740250695904,
"loss": 0.2713,
"step": 480
},
{
"epoch": 0.3074484944532488,
"grad_norm": 0.3018873391630076,
"learning_rate": 0.0001749187950803549,
"loss": 0.3202,
"step": 485
},
{
"epoch": 0.31061806656101426,
"grad_norm": 0.3464422287874134,
"learning_rate": 0.00017418100725992316,
"loss": 0.3042,
"step": 490
},
{
"epoch": 0.31378763866877973,
"grad_norm": 0.31036543367721087,
"learning_rate": 0.00017343412945265382,
"loss": 0.3105,
"step": 495
},
{
"epoch": 0.31695721077654515,
"grad_norm": 0.3090116757558095,
"learning_rate": 0.00017267825317940493,
"loss": 0.3086,
"step": 500
},
{
"epoch": 0.3201267828843106,
"grad_norm": 0.32015559999952525,
"learning_rate": 0.00017191347106368797,
"loss": 0.2595,
"step": 505
},
{
"epoch": 0.3232963549920761,
"grad_norm": 0.28242640929152685,
"learning_rate": 0.0001711398768203178,
"loss": 0.3171,
"step": 510
},
{
"epoch": 0.3264659270998415,
"grad_norm": 0.3373697781712397,
"learning_rate": 0.00017035756524392924,
"loss": 0.2897,
"step": 515
},
{
"epoch": 0.329635499207607,
"grad_norm": 0.3187883343723006,
"learning_rate": 0.0001695666321973609,
"loss": 0.303,
"step": 520
},
{
"epoch": 0.3328050713153724,
"grad_norm": 0.4060972163443389,
"learning_rate": 0.00016876717459990862,
"loss": 0.3273,
"step": 525
},
{
"epoch": 0.3359746434231379,
"grad_norm": 0.2709960074426642,
"learning_rate": 0.0001679592904154489,
"loss": 0.2629,
"step": 530
},
{
"epoch": 0.33914421553090335,
"grad_norm": 0.2828719972128079,
"learning_rate": 0.00016714307864043487,
"loss": 0.2946,
"step": 535
},
{
"epoch": 0.34231378763866877,
"grad_norm": 0.29485357171410065,
"learning_rate": 0.00016631863929176524,
"loss": 0.2704,
"step": 540
},
{
"epoch": 0.34548335974643424,
"grad_norm": 0.3140677978027709,
"learning_rate": 0.00016548607339452853,
"loss": 0.3211,
"step": 545
},
{
"epoch": 0.3486529318541997,
"grad_norm": 0.30224374704766904,
"learning_rate": 0.00016464548296962373,
"loss": 0.3289,
"step": 550
},
{
"epoch": 0.3518225039619651,
"grad_norm": 0.3015178734291492,
"learning_rate": 0.0001637969710212588,
"loss": 0.262,
"step": 555
},
{
"epoch": 0.3549920760697306,
"grad_norm": 0.3261808476280464,
"learning_rate": 0.00016294064152432879,
"loss": 0.3524,
"step": 560
},
{
"epoch": 0.358161648177496,
"grad_norm": 0.30420040263110554,
"learning_rate": 0.00016207659941167485,
"loss": 0.2888,
"step": 565
},
{
"epoch": 0.3613312202852615,
"grad_norm": 0.29855740633395794,
"learning_rate": 0.00016120495056122622,
"loss": 0.3075,
"step": 570
},
{
"epoch": 0.36450079239302696,
"grad_norm": 0.3775755682614953,
"learning_rate": 0.00016032580178302583,
"loss": 0.3452,
"step": 575
},
{
"epoch": 0.3676703645007924,
"grad_norm": 0.3189277602131783,
"learning_rate": 0.00015943926080614235,
"loss": 0.2643,
"step": 580
},
{
"epoch": 0.37083993660855785,
"grad_norm": 0.32115548282274786,
"learning_rate": 0.00015854543626546915,
"loss": 0.3126,
"step": 585
},
{
"epoch": 0.37400950871632327,
"grad_norm": 0.29230296850863174,
"learning_rate": 0.00015764443768841234,
"loss": 0.2949,
"step": 590
},
{
"epoch": 0.37717908082408874,
"grad_norm": 0.32187057297721217,
"learning_rate": 0.0001567363754814696,
"loss": 0.3166,
"step": 595
},
{
"epoch": 0.3803486529318542,
"grad_norm": 0.3766752931165212,
"learning_rate": 0.0001558213609167012,
"loss": 0.323,
"step": 600
},
{
"epoch": 0.3803486529318542,
"eval_loss": 0.2788923680782318,
"eval_runtime": 873.5171,
"eval_samples_per_second": 4.579,
"eval_steps_per_second": 0.572,
"step": 600
},
{
"epoch": 0.38351822503961963,
"grad_norm": 0.31877960462977273,
"learning_rate": 0.00015489950611809484,
"loss": 0.2803,
"step": 605
},
{
"epoch": 0.3866877971473851,
"grad_norm": 0.2903622851026156,
"learning_rate": 0.00015397092404782642,
"loss": 0.3178,
"step": 610
},
{
"epoch": 0.3898573692551506,
"grad_norm": 0.2639727101749139,
"learning_rate": 0.00015303572849241764,
"loss": 0.2703,
"step": 615
},
{
"epoch": 0.393026941362916,
"grad_norm": 0.3491709894849581,
"learning_rate": 0.00015209403404879303,
"loss": 0.3049,
"step": 620
},
{
"epoch": 0.39619651347068147,
"grad_norm": 0.3651420024997032,
"learning_rate": 0.00015114595611023744,
"loss": 0.3265,
"step": 625
},
{
"epoch": 0.3993660855784469,
"grad_norm": 0.3071330073578763,
"learning_rate": 0.0001501916108522558,
"loss": 0.2645,
"step": 630
},
{
"epoch": 0.40253565768621236,
"grad_norm": 0.2739471545543727,
"learning_rate": 0.00014923111521833758,
"loss": 0.3035,
"step": 635
},
{
"epoch": 0.40570522979397783,
"grad_norm": 0.30630113259525843,
"learning_rate": 0.00014826458690562642,
"loss": 0.2606,
"step": 640
},
{
"epoch": 0.40887480190174325,
"grad_norm": 0.2988843883769528,
"learning_rate": 0.00014729214435049793,
"loss": 0.3111,
"step": 645
},
{
"epoch": 0.4120443740095087,
"grad_norm": 0.3110979862585215,
"learning_rate": 0.0001463139067140468,
"loss": 0.2948,
"step": 650
},
{
"epoch": 0.4152139461172742,
"grad_norm": 0.30767657253531316,
"learning_rate": 0.0001453299938674849,
"loss": 0.2638,
"step": 655
},
{
"epoch": 0.4183835182250396,
"grad_norm": 0.27014842841388653,
"learning_rate": 0.00014434052637745257,
"loss": 0.2819,
"step": 660
},
{
"epoch": 0.4215530903328051,
"grad_norm": 0.2739393681355767,
"learning_rate": 0.00014334562549124467,
"loss": 0.2466,
"step": 665
},
{
"epoch": 0.4247226624405705,
"grad_norm": 0.31758998023523244,
"learning_rate": 0.00014234541312195323,
"loss": 0.2873,
"step": 670
},
{
"epoch": 0.42789223454833597,
"grad_norm": 0.39847849128188423,
"learning_rate": 0.00014134001183352832,
"loss": 0.2979,
"step": 675
},
{
"epoch": 0.43106180665610144,
"grad_norm": 0.30950118355401873,
"learning_rate": 0.00014032954482575937,
"loss": 0.2617,
"step": 680
},
{
"epoch": 0.43423137876386686,
"grad_norm": 0.3260587574739946,
"learning_rate": 0.0001393141359191787,
"loss": 0.3109,
"step": 685
},
{
"epoch": 0.43740095087163233,
"grad_norm": 0.3114375419997854,
"learning_rate": 0.00013829390953988853,
"loss": 0.2845,
"step": 690
},
{
"epoch": 0.4405705229793978,
"grad_norm": 0.30019871836883555,
"learning_rate": 0.00013726899070431423,
"loss": 0.324,
"step": 695
},
{
"epoch": 0.4437400950871632,
"grad_norm": 0.38021042516470643,
"learning_rate": 0.00013623950500388506,
"loss": 0.3269,
"step": 700
},
{
"epoch": 0.4469096671949287,
"grad_norm": 0.3089060241706131,
"learning_rate": 0.00013520557858964446,
"loss": 0.2584,
"step": 705
},
{
"epoch": 0.4500792393026941,
"grad_norm": 0.27984586622582663,
"learning_rate": 0.00013416733815679166,
"loss": 0.2909,
"step": 710
},
{
"epoch": 0.4532488114104596,
"grad_norm": 0.2923559292409706,
"learning_rate": 0.00013312491092915682,
"loss": 0.2489,
"step": 715
},
{
"epoch": 0.45641838351822506,
"grad_norm": 0.29223045315786345,
"learning_rate": 0.00013207842464361125,
"loss": 0.3135,
"step": 720
},
{
"epoch": 0.4595879556259905,
"grad_norm": 0.33907899924090856,
"learning_rate": 0.00013102800753441487,
"loss": 0.3148,
"step": 725
},
{
"epoch": 0.46275752773375595,
"grad_norm": 0.26110455456342696,
"learning_rate": 0.00012997378831750242,
"loss": 0.2505,
"step": 730
},
{
"epoch": 0.4659270998415214,
"grad_norm": 0.2855563878095534,
"learning_rate": 0.00012891589617471122,
"loss": 0.322,
"step": 735
},
{
"epoch": 0.46909667194928684,
"grad_norm": 0.27089962197787903,
"learning_rate": 0.00012785446073795118,
"loss": 0.2629,
"step": 740
},
{
"epoch": 0.4722662440570523,
"grad_norm": 0.2787588891548799,
"learning_rate": 0.00012678961207332015,
"loss": 0.3071,
"step": 745
},
{
"epoch": 0.4754358161648177,
"grad_norm": 0.35249049637057156,
"learning_rate": 0.00012572148066516584,
"loss": 0.3265,
"step": 750
},
{
"epoch": 0.4786053882725832,
"grad_norm": 0.33307560406452336,
"learning_rate": 0.00012465019740009662,
"loss": 0.2403,
"step": 755
},
{
"epoch": 0.48177496038034867,
"grad_norm": 0.3035753509057755,
"learning_rate": 0.00012357589355094275,
"loss": 0.3057,
"step": 760
},
{
"epoch": 0.4849445324881141,
"grad_norm": 0.2950972689886197,
"learning_rate": 0.00012249870076067067,
"loss": 0.2637,
"step": 765
},
{
"epoch": 0.48811410459587956,
"grad_norm": 0.2713040409786771,
"learning_rate": 0.00012141875102625167,
"loss": 0.3196,
"step": 770
},
{
"epoch": 0.49128367670364503,
"grad_norm": 0.37005187803966516,
"learning_rate": 0.00012033617668248723,
"loss": 0.3265,
"step": 775
},
{
"epoch": 0.49445324881141045,
"grad_norm": 0.3678796577106568,
"learning_rate": 0.00011925111038579309,
"loss": 0.2283,
"step": 780
},
{
"epoch": 0.4976228209191759,
"grad_norm": 0.3021844529595635,
"learning_rate": 0.00011816368509794364,
"loss": 0.2967,
"step": 785
},
{
"epoch": 0.5007923930269413,
"grad_norm": 0.3028161473676034,
"learning_rate": 0.00011707403406977928,
"loss": 0.2841,
"step": 790
},
{
"epoch": 0.5039619651347068,
"grad_norm": 0.27418964538735746,
"learning_rate": 0.00011598229082487784,
"loss": 0.2803,
"step": 795
},
{
"epoch": 0.5071315372424723,
"grad_norm": 0.3426638434156249,
"learning_rate": 0.0001148885891431932,
"loss": 0.3274,
"step": 800
},
{
"epoch": 0.5071315372424723,
"eval_loss": 0.26855266094207764,
"eval_runtime": 873.628,
"eval_samples_per_second": 4.579,
"eval_steps_per_second": 0.572,
"step": 800
},
{
"epoch": 0.5103011093502378,
"grad_norm": 0.2681269338020656,
"learning_rate": 0.00011379306304466198,
"loss": 0.2381,
"step": 805
},
{
"epoch": 0.5134706814580031,
"grad_norm": 0.2987060218422062,
"learning_rate": 0.00011269584677278102,
"loss": 0.3076,
"step": 810
},
{
"epoch": 0.5166402535657686,
"grad_norm": 0.2804222341073312,
"learning_rate": 0.00011159707477815755,
"loss": 0.2395,
"step": 815
},
{
"epoch": 0.5198098256735341,
"grad_norm": 0.25835895356413513,
"learning_rate": 0.00011049688170203383,
"loss": 0.3041,
"step": 820
},
{
"epoch": 0.5229793977812995,
"grad_norm": 0.3313190058494361,
"learning_rate": 0.00010939540235978845,
"loss": 0.297,
"step": 825
},
{
"epoch": 0.526148969889065,
"grad_norm": 0.2564972143294916,
"learning_rate": 0.00010829277172441648,
"loss": 0.2359,
"step": 830
},
{
"epoch": 0.5293185419968305,
"grad_norm": 0.31632766018739716,
"learning_rate": 0.00010718912490998991,
"loss": 0.3112,
"step": 835
},
{
"epoch": 0.5324881141045958,
"grad_norm": 0.2738970193614327,
"learning_rate": 0.00010608459715510139,
"loss": 0.2416,
"step": 840
},
{
"epoch": 0.5356576862123613,
"grad_norm": 0.35306801364530893,
"learning_rate": 0.00010497932380629207,
"loss": 0.3334,
"step": 845
},
{
"epoch": 0.5388272583201268,
"grad_norm": 0.3617753781992424,
"learning_rate": 0.00010387344030146665,
"loss": 0.3071,
"step": 850
},
{
"epoch": 0.5419968304278923,
"grad_norm": 0.284695185318866,
"learning_rate": 0.0001027670821532971,
"loss": 0.2516,
"step": 855
},
{
"epoch": 0.5451664025356577,
"grad_norm": 0.28641499966999695,
"learning_rate": 0.00010166038493261722,
"loss": 0.3268,
"step": 860
},
{
"epoch": 0.5483359746434231,
"grad_norm": 0.29940254299061986,
"learning_rate": 0.00010055348425181,
"loss": 0.2667,
"step": 865
},
{
"epoch": 0.5515055467511886,
"grad_norm": 0.33784906825030664,
"learning_rate": 9.944651574819003e-05,
"loss": 0.3006,
"step": 870
},
{
"epoch": 0.554675118858954,
"grad_norm": 0.33800198210916443,
"learning_rate": 9.83396150673828e-05,
"loss": 0.3009,
"step": 875
},
{
"epoch": 0.5578446909667195,
"grad_norm": 0.27814752259908526,
"learning_rate": 9.72329178467029e-05,
"loss": 0.25,
"step": 880
},
{
"epoch": 0.561014263074485,
"grad_norm": 0.3120985607406773,
"learning_rate": 9.612655969853336e-05,
"loss": 0.3079,
"step": 885
},
{
"epoch": 0.5641838351822503,
"grad_norm": 0.32270045792226343,
"learning_rate": 9.502067619370794e-05,
"loss": 0.2465,
"step": 890
},
{
"epoch": 0.5673534072900158,
"grad_norm": 0.2522429392869884,
"learning_rate": 9.391540284489862e-05,
"loss": 0.3049,
"step": 895
},
{
"epoch": 0.5705229793977813,
"grad_norm": 0.32479021947356745,
"learning_rate": 9.281087509001011e-05,
"loss": 0.3109,
"step": 900
},
{
"epoch": 0.5736925515055468,
"grad_norm": 0.3071871099500722,
"learning_rate": 9.170722827558358e-05,
"loss": 0.2566,
"step": 905
},
{
"epoch": 0.5768621236133122,
"grad_norm": 0.2808358292017096,
"learning_rate": 9.060459764021156e-05,
"loss": 0.2981,
"step": 910
},
{
"epoch": 0.5800316957210776,
"grad_norm": 0.36613518181258947,
"learning_rate": 8.950311829796619e-05,
"loss": 0.2812,
"step": 915
},
{
"epoch": 0.5832012678288431,
"grad_norm": 0.29120302112196544,
"learning_rate": 8.840292522184247e-05,
"loss": 0.2958,
"step": 920
},
{
"epoch": 0.5863708399366085,
"grad_norm": 0.3008146054202439,
"learning_rate": 8.730415322721897e-05,
"loss": 0.3119,
"step": 925
},
{
"epoch": 0.589540412044374,
"grad_norm": 0.30809505125548203,
"learning_rate": 8.620693695533803e-05,
"loss": 0.2603,
"step": 930
},
{
"epoch": 0.5927099841521395,
"grad_norm": 0.3464042931932695,
"learning_rate": 8.511141085680683e-05,
"loss": 0.3217,
"step": 935
},
{
"epoch": 0.595879556259905,
"grad_norm": 0.28395404105986655,
"learning_rate": 8.401770917512221e-05,
"loss": 0.2339,
"step": 940
},
{
"epoch": 0.5990491283676703,
"grad_norm": 0.32456815689823176,
"learning_rate": 8.292596593022075e-05,
"loss": 0.2761,
"step": 945
},
{
"epoch": 0.6022187004754358,
"grad_norm": 0.35814205267620147,
"learning_rate": 8.183631490205637e-05,
"loss": 0.3064,
"step": 950
},
{
"epoch": 0.6053882725832013,
"grad_norm": 0.3307025804465351,
"learning_rate": 8.074888961420695e-05,
"loss": 0.2317,
"step": 955
},
{
"epoch": 0.6085578446909667,
"grad_norm": 0.3035093202164917,
"learning_rate": 7.966382331751277e-05,
"loss": 0.3024,
"step": 960
},
{
"epoch": 0.6117274167987322,
"grad_norm": 0.23483953416505404,
"learning_rate": 7.858124897374837e-05,
"loss": 0.2616,
"step": 965
},
{
"epoch": 0.6148969889064976,
"grad_norm": 0.24795445024402282,
"learning_rate": 7.750129923932939e-05,
"loss": 0.2889,
"step": 970
},
{
"epoch": 0.618066561014263,
"grad_norm": 0.39470726118892546,
"learning_rate": 7.642410644905726e-05,
"loss": 0.3255,
"step": 975
},
{
"epoch": 0.6212361331220285,
"grad_norm": 0.28578857562483734,
"learning_rate": 7.534980259990341e-05,
"loss": 0.2177,
"step": 980
},
{
"epoch": 0.624405705229794,
"grad_norm": 0.293120691065387,
"learning_rate": 7.427851933483418e-05,
"loss": 0.3008,
"step": 985
},
{
"epoch": 0.6275752773375595,
"grad_norm": 0.28050824031198807,
"learning_rate": 7.321038792667987e-05,
"loss": 0.2617,
"step": 990
},
{
"epoch": 0.6307448494453248,
"grad_norm": 0.3421819179459905,
"learning_rate": 7.214553926204883e-05,
"loss": 0.2827,
"step": 995
},
{
"epoch": 0.6339144215530903,
"grad_norm": 0.3825000717076991,
"learning_rate": 7.108410382528879e-05,
"loss": 0.3171,
"step": 1000
},
{
"epoch": 0.6339144215530903,
"eval_loss": 0.2597305178642273,
"eval_runtime": 873.3574,
"eval_samples_per_second": 4.58,
"eval_steps_per_second": 0.573,
"step": 1000
},
{
"epoch": 0.6370839936608558,
"grad_norm": 0.293460396656183,
"learning_rate": 7.002621168249759e-05,
"loss": 0.2297,
"step": 1005
},
{
"epoch": 0.6402535657686212,
"grad_norm": 0.3006160040194,
"learning_rate": 6.897199246558514e-05,
"loss": 0.2956,
"step": 1010
},
{
"epoch": 0.6434231378763867,
"grad_norm": 0.2791223126874652,
"learning_rate": 6.792157535638874e-05,
"loss": 0.2496,
"step": 1015
},
{
"epoch": 0.6465927099841522,
"grad_norm": 0.2894662197144813,
"learning_rate": 6.687508907084319e-05,
"loss": 0.2866,
"step": 1020
},
{
"epoch": 0.6497622820919176,
"grad_norm": 0.33156274133370534,
"learning_rate": 6.583266184320836e-05,
"loss": 0.32,
"step": 1025
},
{
"epoch": 0.652931854199683,
"grad_norm": 0.3447301699746775,
"learning_rate": 6.479442141035556e-05,
"loss": 0.2555,
"step": 1030
},
{
"epoch": 0.6561014263074485,
"grad_norm": 0.3019937172628048,
"learning_rate": 6.376049499611496e-05,
"loss": 0.2632,
"step": 1035
},
{
"epoch": 0.659270998415214,
"grad_norm": 0.25047087286035274,
"learning_rate": 6.273100929568578e-05,
"loss": 0.2472,
"step": 1040
},
{
"epoch": 0.6624405705229794,
"grad_norm": 0.31801398649186896,
"learning_rate": 6.170609046011151e-05,
"loss": 0.2793,
"step": 1045
},
{
"epoch": 0.6656101426307448,
"grad_norm": 0.3464523898432614,
"learning_rate": 6.068586408082133e-05,
"loss": 0.3138,
"step": 1050
},
{
"epoch": 0.6687797147385103,
"grad_norm": 0.2919062799416737,
"learning_rate": 5.9670455174240614e-05,
"loss": 0.2427,
"step": 1055
},
{
"epoch": 0.6719492868462758,
"grad_norm": 0.29267872629520425,
"learning_rate": 5.865998816647171e-05,
"loss": 0.3038,
"step": 1060
},
{
"epoch": 0.6751188589540412,
"grad_norm": 0.27361822239828004,
"learning_rate": 5.765458687804679e-05,
"loss": 0.2566,
"step": 1065
},
{
"epoch": 0.6782884310618067,
"grad_norm": 0.3050132066017946,
"learning_rate": 5.665437450875534e-05,
"loss": 0.2752,
"step": 1070
},
{
"epoch": 0.6814580031695721,
"grad_norm": 0.3580338711915158,
"learning_rate": 5.565947362254746e-05,
"loss": 0.3331,
"step": 1075
},
{
"epoch": 0.6846275752773375,
"grad_norm": 0.26747930377415474,
"learning_rate": 5.467000613251516e-05,
"loss": 0.2429,
"step": 1080
},
{
"epoch": 0.687797147385103,
"grad_norm": 0.32226567868782413,
"learning_rate": 5.368609328595323e-05,
"loss": 0.3208,
"step": 1085
},
{
"epoch": 0.6909667194928685,
"grad_norm": 0.27314417996148593,
"learning_rate": 5.270785564950208e-05,
"loss": 0.2351,
"step": 1090
},
{
"epoch": 0.694136291600634,
"grad_norm": 0.31179553442460595,
"learning_rate": 5.1735413094373594e-05,
"loss": 0.2791,
"step": 1095
},
{
"epoch": 0.6973058637083994,
"grad_norm": 0.2983027582550753,
"learning_rate": 5.0768884781662465e-05,
"loss": 0.3123,
"step": 1100
},
{
"epoch": 0.7004754358161648,
"grad_norm": 0.268619063810808,
"learning_rate": 4.9808389147744195e-05,
"loss": 0.2675,
"step": 1105
},
{
"epoch": 0.7036450079239303,
"grad_norm": 0.34151620569667657,
"learning_rate": 4.885404388976261e-05,
"loss": 0.3171,
"step": 1110
},
{
"epoch": 0.7068145800316957,
"grad_norm": 0.25963093128586956,
"learning_rate": 4.790596595120699e-05,
"loss": 0.2533,
"step": 1115
},
{
"epoch": 0.7099841521394612,
"grad_norm": 0.3373621924020373,
"learning_rate": 4.696427150758238e-05,
"loss": 0.3017,
"step": 1120
},
{
"epoch": 0.7131537242472267,
"grad_norm": 0.32633352666577314,
"learning_rate": 4.6029075952173596e-05,
"loss": 0.3052,
"step": 1125
},
{
"epoch": 0.716323296354992,
"grad_norm": 0.24971258370165642,
"learning_rate": 4.510049388190518e-05,
"loss": 0.2044,
"step": 1130
},
{
"epoch": 0.7194928684627575,
"grad_norm": 0.29602844393415106,
"learning_rate": 4.417863908329884e-05,
"loss": 0.2959,
"step": 1135
},
{
"epoch": 0.722662440570523,
"grad_norm": 0.23146594836780063,
"learning_rate": 4.32636245185304e-05,
"loss": 0.2252,
"step": 1140
},
{
"epoch": 0.7258320126782885,
"grad_norm": 0.2744736835188008,
"learning_rate": 4.235556231158765e-05,
"loss": 0.2884,
"step": 1145
},
{
"epoch": 0.7290015847860539,
"grad_norm": 0.27538990975844047,
"learning_rate": 4.145456373453087e-05,
"loss": 0.2981,
"step": 1150
},
{
"epoch": 0.7321711568938193,
"grad_norm": 0.3032208366026702,
"learning_rate": 4.0560739193857625e-05,
"loss": 0.2158,
"step": 1155
},
{
"epoch": 0.7353407290015848,
"grad_norm": 0.27204457210068295,
"learning_rate": 3.96741982169742e-05,
"loss": 0.3028,
"step": 1160
},
{
"epoch": 0.7385103011093502,
"grad_norm": 0.28301662262727184,
"learning_rate": 3.8795049438773825e-05,
"loss": 0.2946,
"step": 1165
},
{
"epoch": 0.7416798732171157,
"grad_norm": 0.2884264535746388,
"learning_rate": 3.7923400588325155e-05,
"loss": 0.3015,
"step": 1170
},
{
"epoch": 0.7448494453248812,
"grad_norm": 0.3186549926460967,
"learning_rate": 3.7059358475671224e-05,
"loss": 0.2773,
"step": 1175
},
{
"epoch": 0.7480190174326465,
"grad_norm": 0.2997708530371057,
"learning_rate": 3.6203028978741226e-05,
"loss": 0.2469,
"step": 1180
},
{
"epoch": 0.751188589540412,
"grad_norm": 0.32430776300917263,
"learning_rate": 3.535451703037626e-05,
"loss": 0.2726,
"step": 1185
},
{
"epoch": 0.7543581616481775,
"grad_norm": 0.2946578935656507,
"learning_rate": 3.45139266054715e-05,
"loss": 0.2645,
"step": 1190
},
{
"epoch": 0.757527733755943,
"grad_norm": 0.26638481808591286,
"learning_rate": 3.368136070823478e-05,
"loss": 0.2465,
"step": 1195
},
{
"epoch": 0.7606973058637084,
"grad_norm": 0.3677636374426017,
"learning_rate": 3.285692135956515e-05,
"loss": 0.3034,
"step": 1200
},
{
"epoch": 0.7606973058637084,
"eval_loss": 0.2539891302585602,
"eval_runtime": 873.4669,
"eval_samples_per_second": 4.579,
"eval_steps_per_second": 0.572,
"step": 1200
},
{
"epoch": 0.7638668779714739,
"grad_norm": 0.29762017072344943,
"learning_rate": 3.2040709584551095e-05,
"loss": 0.2547,
"step": 1205
},
{
"epoch": 0.7670364500792393,
"grad_norm": 0.35066724794986226,
"learning_rate": 3.123282540009139e-05,
"loss": 0.3043,
"step": 1210
},
{
"epoch": 0.7702060221870047,
"grad_norm": 0.27108651599825634,
"learning_rate": 3.0433367802639112e-05,
"loss": 0.2195,
"step": 1215
},
{
"epoch": 0.7733755942947702,
"grad_norm": 0.24030479810127725,
"learning_rate": 2.9642434756070793e-05,
"loss": 0.2545,
"step": 1220
},
{
"epoch": 0.7765451664025357,
"grad_norm": 0.288327556838552,
"learning_rate": 2.8860123179682242e-05,
"loss": 0.2942,
"step": 1225
},
{
"epoch": 0.7797147385103012,
"grad_norm": 0.29997783643544385,
"learning_rate": 2.8086528936312073e-05,
"loss": 0.2407,
"step": 1230
},
{
"epoch": 0.7828843106180665,
"grad_norm": 0.2665313932594352,
"learning_rate": 2.7321746820595086e-05,
"loss": 0.2863,
"step": 1235
},
{
"epoch": 0.786053882725832,
"grad_norm": 0.24138106294481415,
"learning_rate": 2.6565870547346196e-05,
"loss": 0.2443,
"step": 1240
},
{
"epoch": 0.7892234548335975,
"grad_norm": 0.27410565336257203,
"learning_rate": 2.5818992740076873e-05,
"loss": 0.2714,
"step": 1245
},
{
"epoch": 0.7923930269413629,
"grad_norm": 0.3607807135248553,
"learning_rate": 2.508120491964512e-05,
"loss": 0.3131,
"step": 1250
},
{
"epoch": 0.7955625990491284,
"grad_norm": 0.2752324746545014,
"learning_rate": 2.435259749304096e-05,
"loss": 0.2352,
"step": 1255
},
{
"epoch": 0.7987321711568938,
"grad_norm": 0.33701412326580854,
"learning_rate": 2.3633259742307844e-05,
"loss": 0.3121,
"step": 1260
},
{
"epoch": 0.8019017432646592,
"grad_norm": 0.2719696587030905,
"learning_rate": 2.292327981360245e-05,
"loss": 0.2569,
"step": 1265
},
{
"epoch": 0.8050713153724247,
"grad_norm": 0.321470064394813,
"learning_rate": 2.222274470639324e-05,
"loss": 0.2903,
"step": 1270
},
{
"epoch": 0.8082408874801902,
"grad_norm": 0.33376441935823614,
"learning_rate": 2.1531740262800004e-05,
"loss": 0.2712,
"step": 1275
},
{
"epoch": 0.8114104595879557,
"grad_norm": 0.3559808478292093,
"learning_rate": 2.0850351157074598e-05,
"loss": 0.2485,
"step": 1280
},
{
"epoch": 0.8145800316957211,
"grad_norm": 0.3006799560470683,
"learning_rate": 2.017866088522541e-05,
"loss": 0.2735,
"step": 1285
},
{
"epoch": 0.8177496038034865,
"grad_norm": 0.27868991819615774,
"learning_rate": 1.951675175478579e-05,
"loss": 0.2479,
"step": 1290
},
{
"epoch": 0.820919175911252,
"grad_norm": 0.30796745550467525,
"learning_rate": 1.8864704874728346e-05,
"loss": 0.2693,
"step": 1295
},
{
"epoch": 0.8240887480190174,
"grad_norm": 0.327384705590186,
"learning_rate": 1.822260014552587e-05,
"loss": 0.2787,
"step": 1300
},
{
"epoch": 0.8272583201267829,
"grad_norm": 0.2993843751525639,
"learning_rate": 1.7590516249360754e-05,
"loss": 0.2455,
"step": 1305
},
{
"epoch": 0.8304278922345484,
"grad_norm": 0.2979918507317238,
"learning_rate": 1.6968530640483127e-05,
"loss": 0.2889,
"step": 1310
},
{
"epoch": 0.8335974643423137,
"grad_norm": 0.2942240760065363,
"learning_rate": 1.6356719535720056e-05,
"loss": 0.2557,
"step": 1315
},
{
"epoch": 0.8367670364500792,
"grad_norm": 0.31698805935759067,
"learning_rate": 1.5755157905135843e-05,
"loss": 0.2842,
"step": 1320
},
{
"epoch": 0.8399366085578447,
"grad_norm": 0.3795639487558114,
"learning_rate": 1.5163919462845622e-05,
"loss": 0.2979,
"step": 1325
},
{
"epoch": 0.8431061806656102,
"grad_norm": 0.2933950396246441,
"learning_rate": 1.4583076657982297e-05,
"loss": 0.2291,
"step": 1330
},
{
"epoch": 0.8462757527733756,
"grad_norm": 0.25934135222761445,
"learning_rate": 1.401270066581899e-05,
"loss": 0.2981,
"step": 1335
},
{
"epoch": 0.849445324881141,
"grad_norm": 0.2512793866151091,
"learning_rate": 1.3452861379047287e-05,
"loss": 0.2299,
"step": 1340
},
{
"epoch": 0.8526148969889065,
"grad_norm": 0.27890392188122143,
"learning_rate": 1.2903627399212747e-05,
"loss": 0.2714,
"step": 1345
},
{
"epoch": 0.8557844690966719,
"grad_norm": 0.3540435753559853,
"learning_rate": 1.2365066028308547e-05,
"loss": 0.3208,
"step": 1350
},
{
"epoch": 0.8589540412044374,
"grad_norm": 0.3170188652169802,
"learning_rate": 1.183724326052854e-05,
"loss": 0.261,
"step": 1355
},
{
"epoch": 0.8621236133122029,
"grad_norm": 0.287259110452561,
"learning_rate": 1.1320223774180428e-05,
"loss": 0.2918,
"step": 1360
},
{
"epoch": 0.8652931854199684,
"grad_norm": 0.3145063929825825,
"learning_rate": 1.0814070923760178e-05,
"loss": 0.2562,
"step": 1365
},
{
"epoch": 0.8684627575277337,
"grad_norm": 0.29883537499670176,
"learning_rate": 1.0318846732188737e-05,
"loss": 0.2585,
"step": 1370
},
{
"epoch": 0.8716323296354992,
"grad_norm": 0.33602754178177113,
"learning_rate": 9.834611883211797e-06,
"loss": 0.303,
"step": 1375
},
{
"epoch": 0.8748019017432647,
"grad_norm": 0.27917699955310804,
"learning_rate": 9.361425713963878e-06,
"loss": 0.2399,
"step": 1380
},
{
"epoch": 0.8779714738510301,
"grad_norm": 0.29322424380757633,
"learning_rate": 8.899346207697134e-06,
"loss": 0.3192,
"step": 1385
},
{
"epoch": 0.8811410459587956,
"grad_norm": 0.32716078301472046,
"learning_rate": 8.448429986676298e-06,
"loss": 0.256,
"step": 1390
},
{
"epoch": 0.884310618066561,
"grad_norm": 0.28468261231564157,
"learning_rate": 8.00873230524023e-06,
"loss": 0.2864,
"step": 1395
},
{
"epoch": 0.8874801901743264,
"grad_norm": 0.3481974787604397,
"learning_rate": 7.580307043031232e-06,
"loss": 0.265,
"step": 1400
},
{
"epoch": 0.8874801901743264,
"eval_loss": 0.25099214911460876,
"eval_runtime": 873.7854,
"eval_samples_per_second": 4.578,
"eval_steps_per_second": 0.572,
"step": 1400
},
{
"epoch": 0.8906497622820919,
"grad_norm": 0.2756744352775957,
"learning_rate": 7.163206698392744e-06,
"loss": 0.2392,
"step": 1405
},
{
"epoch": 0.8938193343898574,
"grad_norm": 0.3070714015760399,
"learning_rate": 6.757482381936264e-06,
"loss": 0.2722,
"step": 1410
},
{
"epoch": 0.8969889064976229,
"grad_norm": 0.2719682030351016,
"learning_rate": 6.36318381027835e-06,
"loss": 0.2553,
"step": 1415
},
{
"epoch": 0.9001584786053882,
"grad_norm": 0.30754515515844727,
"learning_rate": 5.980359299948568e-06,
"loss": 0.2763,
"step": 1420
},
{
"epoch": 0.9033280507131537,
"grad_norm": 0.3599613866897873,
"learning_rate": 5.609055761468707e-06,
"loss": 0.2987,
"step": 1425
},
{
"epoch": 0.9064976228209192,
"grad_norm": 0.26662442413818216,
"learning_rate": 5.249318693604577e-06,
"loss": 0.2632,
"step": 1430
},
{
"epoch": 0.9096671949286846,
"grad_norm": 0.2965993748242227,
"learning_rate": 4.901192177790692e-06,
"loss": 0.2799,
"step": 1435
},
{
"epoch": 0.9128367670364501,
"grad_norm": 0.2923839300339188,
"learning_rate": 4.564718872728568e-06,
"loss": 0.2464,
"step": 1440
},
{
"epoch": 0.9160063391442155,
"grad_norm": 0.3004256474409844,
"learning_rate": 4.2399400091594154e-06,
"loss": 0.2775,
"step": 1445
},
{
"epoch": 0.919175911251981,
"grad_norm": 0.30636844288189197,
"learning_rate": 3.926895384811835e-06,
"loss": 0.2917,
"step": 1450
},
{
"epoch": 0.9223454833597464,
"grad_norm": 0.27018058178290905,
"learning_rate": 3.625623359525099e-06,
"loss": 0.2522,
"step": 1455
},
{
"epoch": 0.9255150554675119,
"grad_norm": 0.3069766309513976,
"learning_rate": 3.33616085054862e-06,
"loss": 0.2722,
"step": 1460
},
{
"epoch": 0.9286846275752774,
"grad_norm": 0.2673579253849767,
"learning_rate": 3.0585433280180707e-06,
"loss": 0.2561,
"step": 1465
},
{
"epoch": 0.9318541996830428,
"grad_norm": 0.2688001276727079,
"learning_rate": 2.792804810609173e-06,
"loss": 0.2718,
"step": 1470
},
{
"epoch": 0.9350237717908082,
"grad_norm": 0.3331860222359942,
"learning_rate": 2.538977861368874e-06,
"loss": 0.3163,
"step": 1475
},
{
"epoch": 0.9381933438985737,
"grad_norm": 0.2668325932813764,
"learning_rate": 2.2970935837253182e-06,
"loss": 0.2393,
"step": 1480
},
{
"epoch": 0.9413629160063391,
"grad_norm": 0.3285498156618503,
"learning_rate": 2.0671816176764058e-06,
"loss": 0.2862,
"step": 1485
},
{
"epoch": 0.9445324881141046,
"grad_norm": 0.36573862269188245,
"learning_rate": 1.8492701361578324e-06,
"loss": 0.2447,
"step": 1490
},
{
"epoch": 0.9477020602218701,
"grad_norm": 0.2864139944423568,
"learning_rate": 1.6433858415907278e-06,
"loss": 0.2777,
"step": 1495
},
{
"epoch": 0.9508716323296355,
"grad_norm": 0.323741034773291,
"learning_rate": 1.4495539626097288e-06,
"loss": 0.3086,
"step": 1500
},
{
"epoch": 0.9540412044374009,
"grad_norm": 0.2857388007026186,
"learning_rate": 1.2677982509714415e-06,
"loss": 0.2175,
"step": 1505
},
{
"epoch": 0.9572107765451664,
"grad_norm": 0.2813011213045847,
"learning_rate": 1.0981409786439355e-06,
"loss": 0.2882,
"step": 1510
},
{
"epoch": 0.9603803486529319,
"grad_norm": 0.27685594779071976,
"learning_rate": 9.40602935077639e-07,
"loss": 0.23,
"step": 1515
},
{
"epoch": 0.9635499207606973,
"grad_norm": 0.278082958417837,
"learning_rate": 7.952034246577977e-07,
"loss": 0.2814,
"step": 1520
},
{
"epoch": 0.9667194928684627,
"grad_norm": 0.332411253150925,
"learning_rate": 6.619602643389899e-07,
"loss": 0.2772,
"step": 1525
},
{
"epoch": 0.9698890649762282,
"grad_norm": 0.28541188324654354,
"learning_rate": 5.408897814618175e-07,
"loss": 0.2456,
"step": 1530
},
{
"epoch": 0.9730586370839936,
"grad_norm": 0.289051402982161,
"learning_rate": 4.320068117522835e-07,
"loss": 0.2659,
"step": 1535
},
{
"epoch": 0.9762282091917591,
"grad_norm": 0.2896831822321737,
"learning_rate": 3.35324697503725e-07,
"loss": 0.2721,
"step": 1540
},
{
"epoch": 0.9793977812995246,
"grad_norm": 0.31950029347694936,
"learning_rate": 2.508552859419977e-07,
"loss": 0.2622,
"step": 1545
},
{
"epoch": 0.9825673534072901,
"grad_norm": 0.33661523682392047,
"learning_rate": 1.7860892777367133e-07,
"loss": 0.2731,
"step": 1550
},
{
"epoch": 0.9857369255150554,
"grad_norm": 0.2522879758084615,
"learning_rate": 1.1859447591769934e-07,
"loss": 0.2291,
"step": 1555
},
{
"epoch": 0.9889064976228209,
"grad_norm": 0.2923729662272973,
"learning_rate": 7.081928442057573e-08,
"loss": 0.2972,
"step": 1560
},
{
"epoch": 0.9920760697305864,
"grad_norm": 0.24814229174923821,
"learning_rate": 3.5289207555233573e-08,
"loss": 0.2586,
"step": 1565
},
{
"epoch": 0.9952456418383518,
"grad_norm": 0.24322900794711846,
"learning_rate": 1.2008599103618956e-08,
"loss": 0.2751,
"step": 1570
},
{
"epoch": 0.9984152139461173,
"grad_norm": 0.4374899080765362,
"learning_rate": 9.803118232398768e-10,
"loss": 0.2981,
"step": 1575
},
{
"epoch": 0.9996830427892235,
"step": 1577,
"total_flos": 8013042675351552.0,
"train_loss": 0.3010184336819827,
"train_runtime": 18281.3669,
"train_samples_per_second": 1.381,
"train_steps_per_second": 0.086
}
],
"logging_steps": 5,
"max_steps": 1577,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8013042675351552.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
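
A minimal sketch of how this log could be read back, assuming the file above is saved as trainer_state.json next to the checkpoint and that only the Python standard library is used; the keys referenced (log_history, step, epoch, loss, eval_loss, train_loss) are the ones that appear in the file, while the file path itself is an assumption:

import json

# Load the trainer state shown above (the path is an assumed location).
with open("trainer_state.json") as f:
    state = json.load(f)

# Separate per-step training records from the periodic evaluation records.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs  = [e for e in state["log_history"] if "eval_loss" in e]

# Summarize the evaluation loss logged every eval_steps (200) optimizer steps.
for e in eval_logs:
    print(f"step {e['step']:>4}  epoch {e['epoch']:.3f}  eval_loss {e['eval_loss']:.4f}")

# The final record carries the aggregate training loss for the run.
print(f"final train_loss: {state['log_history'][-1]['train_loss']:.4f}")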