{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9996830427892235,
  "eval_steps": 200,
  "global_step": 1577,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006339144215530904,
      "grad_norm": 0.1844951284226507,
      "learning_rate": 1.2658227848101265e-06,
      "loss": 0.2445,
      "step": 1
    },
    {
      "epoch": 0.003169572107765452,
      "grad_norm": 0.4228474323435137,
      "learning_rate": 6.329113924050633e-06,
      "loss": 0.5751,
      "step": 5
    },
    {
      "epoch": 0.006339144215530904,
      "grad_norm": 0.44943947868261674,
      "learning_rate": 1.2658227848101267e-05,
      "loss": 0.6355,
      "step": 10
    },
    {
      "epoch": 0.009508716323296355,
      "grad_norm": 0.4388677072263688,
      "learning_rate": 1.89873417721519e-05,
      "loss": 0.5644,
      "step": 15
    },
    {
      "epoch": 0.012678288431061807,
      "grad_norm": 0.42271873762000256,
      "learning_rate": 2.5316455696202533e-05,
      "loss": 0.6074,
      "step": 20
    },
    {
      "epoch": 0.01584786053882726,
      "grad_norm": 0.394168038759961,
      "learning_rate": 3.1645569620253167e-05,
      "loss": 0.5618,
      "step": 25
    },
    {
      "epoch": 0.01901743264659271,
      "grad_norm": 0.3572253194331872,
      "learning_rate": 3.79746835443038e-05,
      "loss": 0.3782,
      "step": 30
    },
    {
      "epoch": 0.022187004754358162,
      "grad_norm": 0.412555556410598,
      "learning_rate": 4.430379746835443e-05,
      "loss": 0.5004,
      "step": 35
    },
    {
      "epoch": 0.025356576862123614,
      "grad_norm": 0.3794217410787016,
      "learning_rate": 5.0632911392405066e-05,
      "loss": 0.4466,
      "step": 40
    },
    {
      "epoch": 0.028526148969889066,
      "grad_norm": 0.35048440162730615,
      "learning_rate": 5.69620253164557e-05,
      "loss": 0.4673,
      "step": 45
    },
    {
      "epoch": 0.03169572107765452,
      "grad_norm": 0.5944470916433041,
      "learning_rate": 6.329113924050633e-05,
      "loss": 0.4321,
      "step": 50
    },
    {
      "epoch": 0.03486529318541997,
      "grad_norm": 0.42565747209046223,
      "learning_rate": 6.962025316455697e-05,
      "loss": 0.3458,
      "step": 55
    },
    {
      "epoch": 0.03803486529318542,
      "grad_norm": 0.4086754614850085,
      "learning_rate": 7.59493670886076e-05,
      "loss": 0.3853,
      "step": 60
    },
    {
      "epoch": 0.04120443740095087,
      "grad_norm": 0.4100402315211435,
      "learning_rate": 8.227848101265824e-05,
      "loss": 0.3248,
      "step": 65
    },
    {
      "epoch": 0.044374009508716325,
      "grad_norm": 0.439691902474589,
      "learning_rate": 8.860759493670887e-05,
      "loss": 0.3902,
      "step": 70
    },
    {
      "epoch": 0.04754358161648178,
      "grad_norm": 0.4789983229510449,
      "learning_rate": 9.493670886075949e-05,
      "loss": 0.4375,
      "step": 75
    },
    {
      "epoch": 0.05071315372424723,
      "grad_norm": 0.4326295423185557,
      "learning_rate": 0.00010126582278481013,
      "loss": 0.344,
      "step": 80
    },
    {
      "epoch": 0.05388272583201268,
      "grad_norm": 0.4082745764246507,
      "learning_rate": 0.00010759493670886076,
      "loss": 0.373,
      "step": 85
    },
    {
      "epoch": 0.05705229793977813,
      "grad_norm": 0.44917869809554406,
      "learning_rate": 0.0001139240506329114,
      "loss": 0.3366,
      "step": 90
    },
    {
      "epoch": 0.060221870047543584,
      "grad_norm": 0.4099898854278625,
      "learning_rate": 0.00012025316455696203,
      "loss": 0.3827,
      "step": 95
    },
    {
      "epoch": 0.06339144215530904,
      "grad_norm": 0.5173612120396457,
      "learning_rate": 0.00012658227848101267,
      "loss": 0.3913,
      "step": 100
    },
    {
      "epoch": 0.06656101426307448,
      "grad_norm": 0.4695908910723305,
      "learning_rate": 0.0001329113924050633,
      "loss": 0.3285,
      "step": 105
    },
    {
      "epoch": 0.06973058637083994,
      "grad_norm": 0.34610029250066504,
      "learning_rate": 0.00013924050632911395,
      "loss": 0.3542,
      "step": 110
    },
    {
      "epoch": 0.07290015847860538,
      "grad_norm": 0.3833563232036365,
      "learning_rate": 0.00014556962025316457,
      "loss": 0.3442,
      "step": 115
    },
    {
      "epoch": 0.07606973058637084,
      "grad_norm": 0.38597736664868315,
      "learning_rate": 0.0001518987341772152,
      "loss": 0.3499,
      "step": 120
    },
    {
      "epoch": 0.07923930269413629,
      "grad_norm": 0.4555824320300245,
      "learning_rate": 0.00015822784810126583,
      "loss": 0.3843,
      "step": 125
    },
    {
      "epoch": 0.08240887480190175,
      "grad_norm": 0.44058959604469733,
      "learning_rate": 0.00016455696202531648,
      "loss": 0.3321,
      "step": 130
    },
    {
      "epoch": 0.08557844690966719,
      "grad_norm": 0.37513672150754146,
      "learning_rate": 0.0001708860759493671,
      "loss": 0.3409,
      "step": 135
    },
    {
      "epoch": 0.08874801901743265,
      "grad_norm": 0.3532888739409051,
      "learning_rate": 0.00017721518987341773,
      "loss": 0.3388,
      "step": 140
    },
    {
      "epoch": 0.0919175911251981,
      "grad_norm": 0.31398944959900404,
      "learning_rate": 0.00018354430379746836,
      "loss": 0.3407,
      "step": 145
    },
    {
      "epoch": 0.09508716323296355,
      "grad_norm": 0.48473648286443866,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.4109,
      "step": 150
    },
    {
      "epoch": 0.098256735340729,
      "grad_norm": 0.3832743712760423,
      "learning_rate": 0.00019620253164556964,
      "loss": 0.2894,
      "step": 155
    },
    {
      "epoch": 0.10142630744849446,
      "grad_norm": 0.3576599310136604,
      "learning_rate": 0.00019999901968817678,
      "loss": 0.3685,
      "step": 160
    },
    {
      "epoch": 0.1045958795562599,
      "grad_norm": 0.4041268184733326,
      "learning_rate": 0.0001999879914008964,
      "loss": 0.3103,
      "step": 165
    },
    {
      "epoch": 0.10776545166402536,
      "grad_norm": 0.348710082889974,
      "learning_rate": 0.00019996471079244477,
      "loss": 0.3686,
      "step": 170
    },
    {
      "epoch": 0.1109350237717908,
      "grad_norm": 0.3641139077278622,
      "learning_rate": 0.0001999291807155794,
      "loss": 0.3672,
      "step": 175
    },
    {
      "epoch": 0.11410459587955626,
      "grad_norm": 0.34875291735749603,
      "learning_rate": 0.0001998814055240823,
      "loss": 0.3289,
      "step": 180
    },
    {
      "epoch": 0.11727416798732171,
      "grad_norm": 0.35868082118594846,
      "learning_rate": 0.00019982139107222632,
      "loss": 0.3843,
      "step": 185
    },
    {
      "epoch": 0.12044374009508717,
      "grad_norm": 0.2975053354861811,
      "learning_rate": 0.000199749144714058,
      "loss": 0.3187,
      "step": 190
    },
    {
      "epoch": 0.12361331220285261,
      "grad_norm": 0.3926097041806586,
      "learning_rate": 0.00019966467530249627,
      "loss": 0.3711,
      "step": 195
    },
    {
      "epoch": 0.12678288431061807,
      "grad_norm": 0.39235636818547276,
      "learning_rate": 0.00019956799318824776,
      "loss": 0.3599,
      "step": 200
    },
    {
      "epoch": 0.12678288431061807,
      "eval_loss": 0.31717613339424133,
      "eval_runtime": 878.4135,
      "eval_samples_per_second": 4.554,
      "eval_steps_per_second": 0.569,
      "step": 200
    },
    {
      "epoch": 0.12995245641838352,
      "grad_norm": 0.32366959300654363,
      "learning_rate": 0.00019945911021853818,
      "loss": 0.2671,
      "step": 205
    },
    {
      "epoch": 0.13312202852614896,
      "grad_norm": 0.34183927553766114,
      "learning_rate": 0.00019933803973566102,
      "loss": 0.3491,
      "step": 210
    },
    {
      "epoch": 0.13629160063391443,
      "grad_norm": 0.355629049879592,
      "learning_rate": 0.0001992047965753422,
      "loss": 0.2778,
      "step": 215
    },
    {
      "epoch": 0.13946117274167988,
      "grad_norm": 0.31194706241410036,
      "learning_rate": 0.00019905939706492238,
      "loss": 0.3278,
      "step": 220
    },
    {
      "epoch": 0.14263074484944532,
      "grad_norm": 0.37190501088914274,
      "learning_rate": 0.0001989018590213561,
      "loss": 0.3757,
      "step": 225
    },
    {
      "epoch": 0.14580031695721077,
      "grad_norm": 0.30859177154159206,
      "learning_rate": 0.00019873220174902858,
      "loss": 0.2952,
      "step": 230
    },
    {
      "epoch": 0.14896988906497624,
      "grad_norm": 0.4072493051692793,
      "learning_rate": 0.0001985504460373903,
      "loss": 0.3576,
      "step": 235
    },
    {
      "epoch": 0.15213946117274169,
      "grad_norm": 0.3117614582623609,
      "learning_rate": 0.00019835661415840928,
      "loss": 0.3127,
      "step": 240
    },
    {
      "epoch": 0.15530903328050713,
      "grad_norm": 0.3433870206019631,
      "learning_rate": 0.00019815072986384218,
      "loss": 0.3424,
      "step": 245
    },
    {
      "epoch": 0.15847860538827258,
      "grad_norm": 0.3252374107324197,
      "learning_rate": 0.0001979328183823236,
      "loss": 0.3509,
      "step": 250
    },
    {
      "epoch": 0.16164817749603805,
      "grad_norm": 0.32574757253252834,
      "learning_rate": 0.00019770290641627468,
      "loss": 0.2913,
      "step": 255
    },
    {
      "epoch": 0.1648177496038035,
      "grad_norm": 0.37343408069668577,
      "learning_rate": 0.00019746102213863114,
      "loss": 0.3524,
      "step": 260
    },
    {
      "epoch": 0.16798732171156894,
      "grad_norm": 0.30197216412790706,
      "learning_rate": 0.00019720719518939083,
      "loss": 0.295,
      "step": 265
    },
    {
      "epoch": 0.17115689381933438,
      "grad_norm": 0.37750434171669517,
      "learning_rate": 0.00019694145667198195,
      "loss": 0.3215,
      "step": 270
    },
    {
      "epoch": 0.17432646592709986,
      "grad_norm": 0.3368196048030473,
      "learning_rate": 0.0001966638391494514,
      "loss": 0.35,
      "step": 275
    },
    {
      "epoch": 0.1774960380348653,
      "grad_norm": 0.3232595651729065,
      "learning_rate": 0.0001963743766404749,
      "loss": 0.2637,
      "step": 280
    },
    {
      "epoch": 0.18066561014263074,
      "grad_norm": 0.32199548202560035,
      "learning_rate": 0.00019607310461518818,
      "loss": 0.3262,
      "step": 285
    },
    {
      "epoch": 0.1838351822503962,
      "grad_norm": 0.29117926540088634,
      "learning_rate": 0.0001957600599908406,
      "loss": 0.3129,
      "step": 290
    },
    {
      "epoch": 0.18700475435816163,
      "grad_norm": 0.2836794081153409,
      "learning_rate": 0.00019543528112727146,
      "loss": 0.3207,
      "step": 295
    },
    {
      "epoch": 0.1901743264659271,
      "grad_norm": 0.37478385305484463,
      "learning_rate": 0.0001950988078222093,
      "loss": 0.3503,
      "step": 300
    },
    {
      "epoch": 0.19334389857369255,
      "grad_norm": 0.3323790483161259,
      "learning_rate": 0.00019475068130639543,
      "loss": 0.2873,
      "step": 305
    },
    {
      "epoch": 0.196513470681458,
      "grad_norm": 0.31045326503955184,
      "learning_rate": 0.0001943909442385313,
      "loss": 0.3379,
      "step": 310
    },
    {
      "epoch": 0.19968304278922344,
      "grad_norm": 0.295428110940092,
      "learning_rate": 0.00019401964070005144,
      "loss": 0.2913,
      "step": 315
    },
    {
      "epoch": 0.20285261489698891,
      "grad_norm": 0.31381749704770145,
      "learning_rate": 0.00019363681618972164,
      "loss": 0.3167,
      "step": 320
    },
    {
      "epoch": 0.20602218700475436,
      "grad_norm": 0.3799683908480184,
      "learning_rate": 0.00019324251761806374,
      "loss": 0.3203,
      "step": 325
    },
    {
      "epoch": 0.2091917591125198,
      "grad_norm": 0.25669447806119594,
      "learning_rate": 0.00019283679330160726,
      "loss": 0.2598,
      "step": 330
    },
    {
      "epoch": 0.21236133122028525,
      "grad_norm": 0.3253285501894849,
      "learning_rate": 0.00019241969295696879,
      "loss": 0.321,
      "step": 335
    },
    {
      "epoch": 0.21553090332805072,
      "grad_norm": 0.3015776648780859,
      "learning_rate": 0.0001919912676947598,
      "loss": 0.2912,
      "step": 340
    },
    {
      "epoch": 0.21870047543581617,
      "grad_norm": 0.3548152436637532,
      "learning_rate": 0.00019155157001332374,
      "loss": 0.3398,
      "step": 345
    },
    {
      "epoch": 0.2218700475435816,
      "grad_norm": 0.3562179525646546,
      "learning_rate": 0.00019110065379230289,
      "loss": 0.3575,
      "step": 350
    },
    {
      "epoch": 0.22503961965134706,
      "grad_norm": 0.33759944051182883,
      "learning_rate": 0.00019063857428603615,
      "loss": 0.2644,
      "step": 355
    },
    {
      "epoch": 0.22820919175911253,
      "grad_norm": 0.3478332359179607,
      "learning_rate": 0.00019016538811678823,
      "loss": 0.3421,
      "step": 360
    },
    {
      "epoch": 0.23137876386687797,
      "grad_norm": 0.3107602080624315,
      "learning_rate": 0.0001896811532678113,
      "loss": 0.262,
      "step": 365
    },
    {
      "epoch": 0.23454833597464342,
      "grad_norm": 0.26971775917740104,
      "learning_rate": 0.00018918592907623985,
      "loss": 0.3378,
      "step": 370
    },
    {
      "epoch": 0.23771790808240886,
      "grad_norm": 0.32413332448217697,
      "learning_rate": 0.00018867977622581957,
      "loss": 0.3316,
      "step": 375
    },
    {
      "epoch": 0.24088748019017434,
      "grad_norm": 0.3522975093101741,
      "learning_rate": 0.00018816275673947148,
      "loss": 0.2678,
      "step": 380
    },
    {
      "epoch": 0.24405705229793978,
      "grad_norm": 0.31661852350790726,
      "learning_rate": 0.00018763493397169146,
      "loss": 0.3275,
      "step": 385
    },
    {
      "epoch": 0.24722662440570523,
      "grad_norm": 0.27090727261610936,
      "learning_rate": 0.00018709637260078729,
      "loss": 0.2858,
      "step": 390
    },
    {
      "epoch": 0.25039619651347067,
      "grad_norm": 0.3143474617991223,
      "learning_rate": 0.0001865471386209527,
      "loss": 0.3317,
      "step": 395
    },
    {
      "epoch": 0.25356576862123614,
      "grad_norm": 0.48811153855723693,
      "learning_rate": 0.000185987299334181,
      "loss": 0.3295,
      "step": 400
    },
    {
      "epoch": 0.25356576862123614,
      "eval_loss": 0.29194891452789307,
      "eval_runtime": 872.9978,
      "eval_samples_per_second": 4.582,
      "eval_steps_per_second": 0.573,
      "step": 400
    },
    {
      "epoch": 0.25673534072900156,
      "grad_norm": 0.31755342222995686,
      "learning_rate": 0.00018541692334201771,
      "loss": 0.2643,
      "step": 405
    },
    {
      "epoch": 0.25990491283676703,
      "grad_norm": 0.34778059073770806,
      "learning_rate": 0.0001848360805371544,
      "loss": 0.3339,
      "step": 410
    },
    {
      "epoch": 0.2630744849445325,
      "grad_norm": 0.3183073063986642,
      "learning_rate": 0.00018424484209486416,
      "loss": 0.2673,
      "step": 415
    },
    {
      "epoch": 0.2662440570522979,
      "grad_norm": 0.2788199901083398,
      "learning_rate": 0.00018364328046428,
      "loss": 0.3272,
      "step": 420
    },
    {
      "epoch": 0.2694136291600634,
      "grad_norm": 0.3666143727147526,
      "learning_rate": 0.00018303146935951689,
      "loss": 0.3247,
      "step": 425
    },
    {
      "epoch": 0.27258320126782887,
      "grad_norm": 0.28586548327038175,
      "learning_rate": 0.00018240948375063926,
      "loss": 0.2792,
      "step": 430
    },
    {
      "epoch": 0.2757527733755943,
      "grad_norm": 0.9727255846044429,
      "learning_rate": 0.00018177739985447412,
      "loss": 0.3485,
      "step": 435
    },
    {
      "epoch": 0.27892234548335976,
      "grad_norm": 0.29065854553956355,
      "learning_rate": 0.0001811352951252717,
      "loss": 0.2729,
      "step": 440
    },
    {
      "epoch": 0.2820919175911252,
      "grad_norm": 0.320575993183303,
      "learning_rate": 0.0001804832482452142,
      "loss": 0.3354,
      "step": 445
    },
    {
      "epoch": 0.28526148969889065,
      "grad_norm": 0.34869737354697955,
      "learning_rate": 0.0001798213391147746,
      "loss": 0.3385,
      "step": 450
    },
    {
      "epoch": 0.2884310618066561,
      "grad_norm": 0.31478642211651564,
      "learning_rate": 0.00017914964884292544,
      "loss": 0.3133,
      "step": 455
    },
    {
      "epoch": 0.29160063391442154,
      "grad_norm": 0.36834278711947965,
      "learning_rate": 0.0001784682597372,
      "loss": 0.3593,
      "step": 460
    },
    {
      "epoch": 0.294770206022187,
      "grad_norm": 0.2791902388221146,
      "learning_rate": 0.00017777725529360676,
      "loss": 0.3005,
      "step": 465
    },
    {
      "epoch": 0.2979397781299525,
      "grad_norm": 0.30096452678752406,
      "learning_rate": 0.00017707672018639758,
      "loss": 0.3354,
      "step": 470
    },
    {
      "epoch": 0.3011093502377179,
      "grad_norm": 0.3708048891578612,
      "learning_rate": 0.00017636674025769215,
      "loss": 0.3147,
      "step": 475
    },
    {
      "epoch": 0.30427892234548337,
      "grad_norm": 0.305209122691005,
      "learning_rate": 0.00017564740250695904,
      "loss": 0.2713,
      "step": 480
    },
    {
      "epoch": 0.3074484944532488,
      "grad_norm": 0.3018873391630076,
      "learning_rate": 0.0001749187950803549,
      "loss": 0.3202,
      "step": 485
    },
    {
      "epoch": 0.31061806656101426,
      "grad_norm": 0.3464422287874134,
      "learning_rate": 0.00017418100725992316,
      "loss": 0.3042,
      "step": 490
    },
    {
      "epoch": 0.31378763866877973,
      "grad_norm": 0.31036543367721087,
      "learning_rate": 0.00017343412945265382,
      "loss": 0.3105,
      "step": 495
    },
    {
      "epoch": 0.31695721077654515,
      "grad_norm": 0.3090116757558095,
      "learning_rate": 0.00017267825317940493,
      "loss": 0.3086,
      "step": 500
    },
    {
      "epoch": 0.3201267828843106,
      "grad_norm": 0.32015559999952525,
      "learning_rate": 0.00017191347106368797,
      "loss": 0.2595,
      "step": 505
    },
    {
      "epoch": 0.3232963549920761,
      "grad_norm": 0.28242640929152685,
      "learning_rate": 0.0001711398768203178,
      "loss": 0.3171,
      "step": 510
    },
    {
      "epoch": 0.3264659270998415,
      "grad_norm": 0.3373697781712397,
      "learning_rate": 0.00017035756524392924,
      "loss": 0.2897,
      "step": 515
    },
    {
      "epoch": 0.329635499207607,
      "grad_norm": 0.3187883343723006,
      "learning_rate": 0.0001695666321973609,
      "loss": 0.303,
      "step": 520
    },
    {
      "epoch": 0.3328050713153724,
      "grad_norm": 0.4060972163443389,
      "learning_rate": 0.00016876717459990862,
      "loss": 0.3273,
      "step": 525
    },
    {
      "epoch": 0.3359746434231379,
      "grad_norm": 0.2709960074426642,
      "learning_rate": 0.0001679592904154489,
      "loss": 0.2629,
      "step": 530
    },
    {
      "epoch": 0.33914421553090335,
      "grad_norm": 0.2828719972128079,
      "learning_rate": 0.00016714307864043487,
      "loss": 0.2946,
      "step": 535
    },
    {
      "epoch": 0.34231378763866877,
      "grad_norm": 0.29485357171410065,
      "learning_rate": 0.00016631863929176524,
      "loss": 0.2704,
      "step": 540
    },
    {
      "epoch": 0.34548335974643424,
      "grad_norm": 0.3140677978027709,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.3211,
      "step": 545
    },
    {
      "epoch": 0.3486529318541997,
      "grad_norm": 0.30224374704766904,
      "learning_rate": 0.00016464548296962373,
      "loss": 0.3289,
      "step": 550
    },
    {
      "epoch": 0.3518225039619651,
      "grad_norm": 0.3015178734291492,
      "learning_rate": 0.0001637969710212588,
      "loss": 0.262,
      "step": 555
    },
    {
      "epoch": 0.3549920760697306,
      "grad_norm": 0.3261808476280464,
      "learning_rate": 0.00016294064152432879,
      "loss": 0.3524,
      "step": 560
    },
    {
      "epoch": 0.358161648177496,
      "grad_norm": 0.30420040263110554,
      "learning_rate": 0.00016207659941167485,
      "loss": 0.2888,
      "step": 565
    },
    {
      "epoch": 0.3613312202852615,
      "grad_norm": 0.29855740633395794,
      "learning_rate": 0.00016120495056122622,
      "loss": 0.3075,
      "step": 570
    },
    {
      "epoch": 0.36450079239302696,
      "grad_norm": 0.3775755682614953,
      "learning_rate": 0.00016032580178302583,
      "loss": 0.3452,
      "step": 575
    },
    {
      "epoch": 0.3676703645007924,
      "grad_norm": 0.3189277602131783,
      "learning_rate": 0.00015943926080614235,
      "loss": 0.2643,
      "step": 580
    },
    {
      "epoch": 0.37083993660855785,
      "grad_norm": 0.32115548282274786,
      "learning_rate": 0.00015854543626546915,
      "loss": 0.3126,
      "step": 585
    },
    {
      "epoch": 0.37400950871632327,
      "grad_norm": 0.29230296850863174,
      "learning_rate": 0.00015764443768841234,
      "loss": 0.2949,
      "step": 590
    },
    {
      "epoch": 0.37717908082408874,
      "grad_norm": 0.32187057297721217,
      "learning_rate": 0.0001567363754814696,
      "loss": 0.3166,
      "step": 595
    },
    {
      "epoch": 0.3803486529318542,
      "grad_norm": 0.3766752931165212,
      "learning_rate": 0.0001558213609167012,
      "loss": 0.323,
      "step": 600
    },
    {
      "epoch": 0.3803486529318542,
      "eval_loss": 0.2788923680782318,
      "eval_runtime": 873.5171,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 600
    },
    {
      "epoch": 0.38351822503961963,
      "grad_norm": 0.31877960462977273,
      "learning_rate": 0.00015489950611809484,
      "loss": 0.2803,
      "step": 605
    },
    {
      "epoch": 0.3866877971473851,
      "grad_norm": 0.2903622851026156,
      "learning_rate": 0.00015397092404782642,
      "loss": 0.3178,
      "step": 610
    },
    {
      "epoch": 0.3898573692551506,
      "grad_norm": 0.2639727101749139,
      "learning_rate": 0.00015303572849241764,
      "loss": 0.2703,
      "step": 615
    },
    {
      "epoch": 0.393026941362916,
      "grad_norm": 0.3491709894849581,
      "learning_rate": 0.00015209403404879303,
      "loss": 0.3049,
      "step": 620
    },
    {
      "epoch": 0.39619651347068147,
      "grad_norm": 0.3651420024997032,
      "learning_rate": 0.00015114595611023744,
      "loss": 0.3265,
      "step": 625
    },
    {
      "epoch": 0.3993660855784469,
      "grad_norm": 0.3071330073578763,
      "learning_rate": 0.0001501916108522558,
      "loss": 0.2645,
      "step": 630
    },
    {
      "epoch": 0.40253565768621236,
      "grad_norm": 0.2739471545543727,
      "learning_rate": 0.00014923111521833758,
      "loss": 0.3035,
      "step": 635
    },
    {
      "epoch": 0.40570522979397783,
      "grad_norm": 0.30630113259525843,
      "learning_rate": 0.00014826458690562642,
      "loss": 0.2606,
      "step": 640
    },
    {
      "epoch": 0.40887480190174325,
      "grad_norm": 0.2988843883769528,
      "learning_rate": 0.00014729214435049793,
      "loss": 0.3111,
      "step": 645
    },
    {
      "epoch": 0.4120443740095087,
      "grad_norm": 0.3110979862585215,
      "learning_rate": 0.0001463139067140468,
      "loss": 0.2948,
      "step": 650
    },
    {
      "epoch": 0.4152139461172742,
      "grad_norm": 0.30767657253531316,
      "learning_rate": 0.0001453299938674849,
      "loss": 0.2638,
      "step": 655
    },
    {
      "epoch": 0.4183835182250396,
      "grad_norm": 0.27014842841388653,
      "learning_rate": 0.00014434052637745257,
      "loss": 0.2819,
      "step": 660
    },
    {
      "epoch": 0.4215530903328051,
      "grad_norm": 0.2739393681355767,
      "learning_rate": 0.00014334562549124467,
      "loss": 0.2466,
      "step": 665
    },
    {
      "epoch": 0.4247226624405705,
      "grad_norm": 0.31758998023523244,
      "learning_rate": 0.00014234541312195323,
      "loss": 0.2873,
      "step": 670
    },
    {
      "epoch": 0.42789223454833597,
      "grad_norm": 0.39847849128188423,
      "learning_rate": 0.00014134001183352832,
      "loss": 0.2979,
      "step": 675
    },
    {
      "epoch": 0.43106180665610144,
      "grad_norm": 0.30950118355401873,
      "learning_rate": 0.00014032954482575937,
      "loss": 0.2617,
      "step": 680
    },
    {
      "epoch": 0.43423137876386686,
      "grad_norm": 0.3260587574739946,
      "learning_rate": 0.0001393141359191787,
      "loss": 0.3109,
      "step": 685
    },
    {
      "epoch": 0.43740095087163233,
      "grad_norm": 0.3114375419997854,
      "learning_rate": 0.00013829390953988853,
      "loss": 0.2845,
      "step": 690
    },
    {
      "epoch": 0.4405705229793978,
      "grad_norm": 0.30019871836883555,
      "learning_rate": 0.00013726899070431423,
      "loss": 0.324,
      "step": 695
    },
    {
      "epoch": 0.4437400950871632,
      "grad_norm": 0.38021042516470643,
      "learning_rate": 0.00013623950500388506,
      "loss": 0.3269,
      "step": 700
    },
    {
      "epoch": 0.4469096671949287,
      "grad_norm": 0.3089060241706131,
      "learning_rate": 0.00013520557858964446,
      "loss": 0.2584,
      "step": 705
    },
    {
      "epoch": 0.4500792393026941,
      "grad_norm": 0.27984586622582663,
      "learning_rate": 0.00013416733815679166,
      "loss": 0.2909,
      "step": 710
    },
    {
      "epoch": 0.4532488114104596,
      "grad_norm": 0.2923559292409706,
      "learning_rate": 0.00013312491092915682,
      "loss": 0.2489,
      "step": 715
    },
    {
      "epoch": 0.45641838351822506,
      "grad_norm": 0.29223045315786345,
      "learning_rate": 0.00013207842464361125,
      "loss": 0.3135,
      "step": 720
    },
    {
      "epoch": 0.4595879556259905,
      "grad_norm": 0.33907899924090856,
      "learning_rate": 0.00013102800753441487,
      "loss": 0.3148,
      "step": 725
    },
    {
      "epoch": 0.46275752773375595,
      "grad_norm": 0.26110455456342696,
      "learning_rate": 0.00012997378831750242,
      "loss": 0.2505,
      "step": 730
    },
    {
      "epoch": 0.4659270998415214,
      "grad_norm": 0.2855563878095534,
      "learning_rate": 0.00012891589617471122,
      "loss": 0.322,
      "step": 735
    },
    {
      "epoch": 0.46909667194928684,
      "grad_norm": 0.27089962197787903,
      "learning_rate": 0.00012785446073795118,
      "loss": 0.2629,
      "step": 740
    },
    {
      "epoch": 0.4722662440570523,
      "grad_norm": 0.2787588891548799,
      "learning_rate": 0.00012678961207332015,
      "loss": 0.3071,
      "step": 745
    },
    {
      "epoch": 0.4754358161648177,
      "grad_norm": 0.35249049637057156,
      "learning_rate": 0.00012572148066516584,
      "loss": 0.3265,
      "step": 750
    },
    {
      "epoch": 0.4786053882725832,
      "grad_norm": 0.33307560406452336,
      "learning_rate": 0.00012465019740009662,
      "loss": 0.2403,
      "step": 755
    },
    {
      "epoch": 0.48177496038034867,
      "grad_norm": 0.3035753509057755,
      "learning_rate": 0.00012357589355094275,
      "loss": 0.3057,
      "step": 760
    },
    {
      "epoch": 0.4849445324881141,
      "grad_norm": 0.2950972689886197,
      "learning_rate": 0.00012249870076067067,
      "loss": 0.2637,
      "step": 765
    },
    {
      "epoch": 0.48811410459587956,
      "grad_norm": 0.2713040409786771,
      "learning_rate": 0.00012141875102625167,
      "loss": 0.3196,
      "step": 770
    },
    {
      "epoch": 0.49128367670364503,
      "grad_norm": 0.37005187803966516,
      "learning_rate": 0.00012033617668248723,
      "loss": 0.3265,
      "step": 775
    },
    {
      "epoch": 0.49445324881141045,
      "grad_norm": 0.3678796577106568,
      "learning_rate": 0.00011925111038579309,
      "loss": 0.2283,
      "step": 780
    },
    {
      "epoch": 0.4976228209191759,
      "grad_norm": 0.3021844529595635,
      "learning_rate": 0.00011816368509794364,
      "loss": 0.2967,
      "step": 785
    },
    {
      "epoch": 0.5007923930269413,
      "grad_norm": 0.3028161473676034,
      "learning_rate": 0.00011707403406977928,
      "loss": 0.2841,
      "step": 790
    },
    {
      "epoch": 0.5039619651347068,
      "grad_norm": 0.27418964538735746,
      "learning_rate": 0.00011598229082487784,
      "loss": 0.2803,
      "step": 795
    },
    {
      "epoch": 0.5071315372424723,
      "grad_norm": 0.3426638434156249,
      "learning_rate": 0.0001148885891431932,
      "loss": 0.3274,
      "step": 800
    },
    {
      "epoch": 0.5071315372424723,
      "eval_loss": 0.26855266094207764,
      "eval_runtime": 873.628,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 800
    },
    {
      "epoch": 0.5103011093502378,
      "grad_norm": 0.2681269338020656,
      "learning_rate": 0.00011379306304466198,
      "loss": 0.2381,
      "step": 805
    },
    {
      "epoch": 0.5134706814580031,
      "grad_norm": 0.2987060218422062,
      "learning_rate": 0.00011269584677278102,
      "loss": 0.3076,
      "step": 810
    },
    {
      "epoch": 0.5166402535657686,
      "grad_norm": 0.2804222341073312,
      "learning_rate": 0.00011159707477815755,
      "loss": 0.2395,
      "step": 815
    },
    {
      "epoch": 0.5198098256735341,
      "grad_norm": 0.25835895356413513,
      "learning_rate": 0.00011049688170203383,
      "loss": 0.3041,
      "step": 820
    },
    {
      "epoch": 0.5229793977812995,
      "grad_norm": 0.3313190058494361,
      "learning_rate": 0.00010939540235978845,
      "loss": 0.297,
      "step": 825
    },
    {
      "epoch": 0.526148969889065,
      "grad_norm": 0.2564972143294916,
      "learning_rate": 0.00010829277172441648,
      "loss": 0.2359,
      "step": 830
    },
    {
      "epoch": 0.5293185419968305,
      "grad_norm": 0.31632766018739716,
      "learning_rate": 0.00010718912490998991,
      "loss": 0.3112,
      "step": 835
    },
    {
      "epoch": 0.5324881141045958,
      "grad_norm": 0.2738970193614327,
      "learning_rate": 0.00010608459715510139,
      "loss": 0.2416,
      "step": 840
    },
    {
      "epoch": 0.5356576862123613,
      "grad_norm": 0.35306801364530893,
      "learning_rate": 0.00010497932380629207,
      "loss": 0.3334,
      "step": 845
    },
    {
      "epoch": 0.5388272583201268,
      "grad_norm": 0.3617753781992424,
      "learning_rate": 0.00010387344030146665,
      "loss": 0.3071,
      "step": 850
    },
    {
      "epoch": 0.5419968304278923,
      "grad_norm": 0.284695185318866,
      "learning_rate": 0.0001027670821532971,
      "loss": 0.2516,
      "step": 855
    },
    {
      "epoch": 0.5451664025356577,
      "grad_norm": 0.28641499966999695,
      "learning_rate": 0.00010166038493261722,
      "loss": 0.3268,
      "step": 860
    },
    {
      "epoch": 0.5483359746434231,
      "grad_norm": 0.29940254299061986,
      "learning_rate": 0.00010055348425181,
      "loss": 0.2667,
      "step": 865
    },
    {
      "epoch": 0.5515055467511886,
      "grad_norm": 0.33784906825030664,
      "learning_rate": 9.944651574819003e-05,
      "loss": 0.3006,
      "step": 870
    },
    {
      "epoch": 0.554675118858954,
      "grad_norm": 0.33800198210916443,
      "learning_rate": 9.83396150673828e-05,
      "loss": 0.3009,
      "step": 875
    },
    {
      "epoch": 0.5578446909667195,
      "grad_norm": 0.27814752259908526,
      "learning_rate": 9.72329178467029e-05,
      "loss": 0.25,
      "step": 880
    },
    {
      "epoch": 0.561014263074485,
      "grad_norm": 0.3120985607406773,
      "learning_rate": 9.612655969853336e-05,
      "loss": 0.3079,
      "step": 885
    },
    {
      "epoch": 0.5641838351822503,
      "grad_norm": 0.32270045792226343,
      "learning_rate": 9.502067619370794e-05,
      "loss": 0.2465,
      "step": 890
    },
    {
      "epoch": 0.5673534072900158,
      "grad_norm": 0.2522429392869884,
      "learning_rate": 9.391540284489862e-05,
      "loss": 0.3049,
      "step": 895
    },
    {
      "epoch": 0.5705229793977813,
      "grad_norm": 0.32479021947356745,
      "learning_rate": 9.281087509001011e-05,
      "loss": 0.3109,
      "step": 900
    },
    {
      "epoch": 0.5736925515055468,
      "grad_norm": 0.3071871099500722,
      "learning_rate": 9.170722827558358e-05,
      "loss": 0.2566,
      "step": 905
    },
    {
      "epoch": 0.5768621236133122,
      "grad_norm": 0.2808358292017096,
      "learning_rate": 9.060459764021156e-05,
      "loss": 0.2981,
      "step": 910
    },
    {
      "epoch": 0.5800316957210776,
      "grad_norm": 0.36613518181258947,
      "learning_rate": 8.950311829796619e-05,
      "loss": 0.2812,
      "step": 915
    },
    {
      "epoch": 0.5832012678288431,
      "grad_norm": 0.29120302112196544,
      "learning_rate": 8.840292522184247e-05,
      "loss": 0.2958,
      "step": 920
    },
    {
      "epoch": 0.5863708399366085,
      "grad_norm": 0.3008146054202439,
      "learning_rate": 8.730415322721897e-05,
      "loss": 0.3119,
      "step": 925
    },
    {
      "epoch": 0.589540412044374,
      "grad_norm": 0.30809505125548203,
      "learning_rate": 8.620693695533803e-05,
      "loss": 0.2603,
      "step": 930
    },
    {
      "epoch": 0.5927099841521395,
      "grad_norm": 0.3464042931932695,
      "learning_rate": 8.511141085680683e-05,
      "loss": 0.3217,
      "step": 935
    },
    {
      "epoch": 0.595879556259905,
      "grad_norm": 0.28395404105986655,
      "learning_rate": 8.401770917512221e-05,
      "loss": 0.2339,
      "step": 940
    },
    {
      "epoch": 0.5990491283676703,
      "grad_norm": 0.32456815689823176,
      "learning_rate": 8.292596593022075e-05,
      "loss": 0.2761,
      "step": 945
    },
    {
      "epoch": 0.6022187004754358,
      "grad_norm": 0.35814205267620147,
      "learning_rate": 8.183631490205637e-05,
      "loss": 0.3064,
      "step": 950
    },
    {
      "epoch": 0.6053882725832013,
      "grad_norm": 0.3307025804465351,
      "learning_rate": 8.074888961420695e-05,
      "loss": 0.2317,
      "step": 955
    },
    {
      "epoch": 0.6085578446909667,
      "grad_norm": 0.3035093202164917,
      "learning_rate": 7.966382331751277e-05,
      "loss": 0.3024,
      "step": 960
    },
    {
      "epoch": 0.6117274167987322,
      "grad_norm": 0.23483953416505404,
      "learning_rate": 7.858124897374837e-05,
      "loss": 0.2616,
      "step": 965
    },
    {
      "epoch": 0.6148969889064976,
      "grad_norm": 0.24795445024402282,
      "learning_rate": 7.750129923932939e-05,
      "loss": 0.2889,
      "step": 970
    },
    {
      "epoch": 0.618066561014263,
      "grad_norm": 0.39470726118892546,
      "learning_rate": 7.642410644905726e-05,
      "loss": 0.3255,
      "step": 975
    },
    {
      "epoch": 0.6212361331220285,
      "grad_norm": 0.28578857562483734,
      "learning_rate": 7.534980259990341e-05,
      "loss": 0.2177,
      "step": 980
    },
    {
      "epoch": 0.624405705229794,
      "grad_norm": 0.293120691065387,
      "learning_rate": 7.427851933483418e-05,
      "loss": 0.3008,
      "step": 985
    },
    {
      "epoch": 0.6275752773375595,
      "grad_norm": 0.28050824031198807,
      "learning_rate": 7.321038792667987e-05,
      "loss": 0.2617,
      "step": 990
    },
    {
      "epoch": 0.6307448494453248,
      "grad_norm": 0.3421819179459905,
      "learning_rate": 7.214553926204883e-05,
      "loss": 0.2827,
      "step": 995
    },
    {
      "epoch": 0.6339144215530903,
      "grad_norm": 0.3825000717076991,
      "learning_rate": 7.108410382528879e-05,
      "loss": 0.3171,
      "step": 1000
    },
    {
      "epoch": 0.6339144215530903,
      "eval_loss": 0.2597305178642273,
      "eval_runtime": 873.3574,
      "eval_samples_per_second": 4.58,
      "eval_steps_per_second": 0.573,
      "step": 1000
    },
    {
      "epoch": 0.6370839936608558,
      "grad_norm": 0.293460396656183,
      "learning_rate": 7.002621168249759e-05,
      "loss": 0.2297,
      "step": 1005
    },
    {
      "epoch": 0.6402535657686212,
      "grad_norm": 0.3006160040194,
      "learning_rate": 6.897199246558514e-05,
      "loss": 0.2956,
      "step": 1010
    },
    {
      "epoch": 0.6434231378763867,
      "grad_norm": 0.2791223126874652,
      "learning_rate": 6.792157535638874e-05,
      "loss": 0.2496,
      "step": 1015
    },
    {
      "epoch": 0.6465927099841522,
      "grad_norm": 0.2894662197144813,
      "learning_rate": 6.687508907084319e-05,
      "loss": 0.2866,
      "step": 1020
    },
    {
      "epoch": 0.6497622820919176,
      "grad_norm": 0.33156274133370534,
      "learning_rate": 6.583266184320836e-05,
      "loss": 0.32,
      "step": 1025
    },
    {
      "epoch": 0.652931854199683,
      "grad_norm": 0.3447301699746775,
      "learning_rate": 6.479442141035556e-05,
      "loss": 0.2555,
      "step": 1030
    },
    {
      "epoch": 0.6561014263074485,
      "grad_norm": 0.3019937172628048,
      "learning_rate": 6.376049499611496e-05,
      "loss": 0.2632,
      "step": 1035
    },
    {
      "epoch": 0.659270998415214,
      "grad_norm": 0.25047087286035274,
      "learning_rate": 6.273100929568578e-05,
      "loss": 0.2472,
      "step": 1040
    },
    {
      "epoch": 0.6624405705229794,
      "grad_norm": 0.31801398649186896,
      "learning_rate": 6.170609046011151e-05,
      "loss": 0.2793,
      "step": 1045
    },
    {
      "epoch": 0.6656101426307448,
      "grad_norm": 0.3464523898432614,
      "learning_rate": 6.068586408082133e-05,
      "loss": 0.3138,
      "step": 1050
    },
    {
      "epoch": 0.6687797147385103,
      "grad_norm": 0.2919062799416737,
      "learning_rate": 5.9670455174240614e-05,
      "loss": 0.2427,
      "step": 1055
    },
    {
      "epoch": 0.6719492868462758,
      "grad_norm": 0.29267872629520425,
      "learning_rate": 5.865998816647171e-05,
      "loss": 0.3038,
      "step": 1060
    },
    {
      "epoch": 0.6751188589540412,
      "grad_norm": 0.27361822239828004,
      "learning_rate": 5.765458687804679e-05,
      "loss": 0.2566,
      "step": 1065
    },
    {
      "epoch": 0.6782884310618067,
      "grad_norm": 0.3050132066017946,
      "learning_rate": 5.665437450875534e-05,
      "loss": 0.2752,
      "step": 1070
    },
    {
      "epoch": 0.6814580031695721,
      "grad_norm": 0.3580338711915158,
      "learning_rate": 5.565947362254746e-05,
      "loss": 0.3331,
      "step": 1075
    },
    {
      "epoch": 0.6846275752773375,
      "grad_norm": 0.26747930377415474,
      "learning_rate": 5.467000613251516e-05,
      "loss": 0.2429,
      "step": 1080
    },
    {
      "epoch": 0.687797147385103,
      "grad_norm": 0.32226567868782413,
      "learning_rate": 5.368609328595323e-05,
      "loss": 0.3208,
      "step": 1085
    },
    {
      "epoch": 0.6909667194928685,
      "grad_norm": 0.27314417996148593,
      "learning_rate": 5.270785564950208e-05,
      "loss": 0.2351,
      "step": 1090
    },
    {
      "epoch": 0.694136291600634,
      "grad_norm": 0.31179553442460595,
      "learning_rate": 5.1735413094373594e-05,
      "loss": 0.2791,
      "step": 1095
    },
    {
      "epoch": 0.6973058637083994,
      "grad_norm": 0.2983027582550753,
      "learning_rate": 5.0768884781662465e-05,
      "loss": 0.3123,
      "step": 1100
    },
    {
      "epoch": 0.7004754358161648,
      "grad_norm": 0.268619063810808,
      "learning_rate": 4.9808389147744195e-05,
      "loss": 0.2675,
      "step": 1105
    },
    {
      "epoch": 0.7036450079239303,
      "grad_norm": 0.34151620569667657,
      "learning_rate": 4.885404388976261e-05,
      "loss": 0.3171,
      "step": 1110
    },
    {
      "epoch": 0.7068145800316957,
      "grad_norm": 0.25963093128586956,
      "learning_rate": 4.790596595120699e-05,
      "loss": 0.2533,
      "step": 1115
    },
    {
      "epoch": 0.7099841521394612,
      "grad_norm": 0.3373621924020373,
      "learning_rate": 4.696427150758238e-05,
      "loss": 0.3017,
      "step": 1120
    },
    {
      "epoch": 0.7131537242472267,
      "grad_norm": 0.32633352666577314,
      "learning_rate": 4.6029075952173596e-05,
      "loss": 0.3052,
      "step": 1125
    },
    {
      "epoch": 0.716323296354992,
      "grad_norm": 0.24971258370165642,
      "learning_rate": 4.510049388190518e-05,
      "loss": 0.2044,
      "step": 1130
    },
    {
      "epoch": 0.7194928684627575,
      "grad_norm": 0.29602844393415106,
      "learning_rate": 4.417863908329884e-05,
      "loss": 0.2959,
      "step": 1135
    },
    {
      "epoch": 0.722662440570523,
      "grad_norm": 0.23146594836780063,
      "learning_rate": 4.32636245185304e-05,
      "loss": 0.2252,
      "step": 1140
    },
    {
      "epoch": 0.7258320126782885,
      "grad_norm": 0.2744736835188008,
      "learning_rate": 4.235556231158765e-05,
      "loss": 0.2884,
      "step": 1145
    },
    {
      "epoch": 0.7290015847860539,
      "grad_norm": 0.27538990975844047,
      "learning_rate": 4.145456373453087e-05,
      "loss": 0.2981,
      "step": 1150
    },
    {
      "epoch": 0.7321711568938193,
      "grad_norm": 0.3032208366026702,
      "learning_rate": 4.0560739193857625e-05,
      "loss": 0.2158,
      "step": 1155
    },
    {
      "epoch": 0.7353407290015848,
      "grad_norm": 0.27204457210068295,
      "learning_rate": 3.96741982169742e-05,
      "loss": 0.3028,
      "step": 1160
    },
    {
      "epoch": 0.7385103011093502,
      "grad_norm": 0.28301662262727184,
      "learning_rate": 3.8795049438773825e-05,
      "loss": 0.2946,
      "step": 1165
    },
    {
      "epoch": 0.7416798732171157,
      "grad_norm": 0.2884264535746388,
      "learning_rate": 3.7923400588325155e-05,
      "loss": 0.3015,
      "step": 1170
    },
    {
      "epoch": 0.7448494453248812,
      "grad_norm": 0.3186549926460967,
      "learning_rate": 3.7059358475671224e-05,
      "loss": 0.2773,
      "step": 1175
    },
    {
      "epoch": 0.7480190174326465,
      "grad_norm": 0.2997708530371057,
      "learning_rate": 3.6203028978741226e-05,
      "loss": 0.2469,
      "step": 1180
    },
    {
      "epoch": 0.751188589540412,
      "grad_norm": 0.32430776300917263,
      "learning_rate": 3.535451703037626e-05,
      "loss": 0.2726,
      "step": 1185
    },
    {
      "epoch": 0.7543581616481775,
      "grad_norm": 0.2946578935656507,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.2645,
      "step": 1190
    },
    {
      "epoch": 0.757527733755943,
      "grad_norm": 0.26638481808591286,
      "learning_rate": 3.368136070823478e-05,
      "loss": 0.2465,
      "step": 1195
    },
    {
      "epoch": 0.7606973058637084,
      "grad_norm": 0.3677636374426017,
      "learning_rate": 3.285692135956515e-05,
      "loss": 0.3034,
      "step": 1200
    },
    {
      "epoch": 0.7606973058637084,
      "eval_loss": 0.2539891302585602,
      "eval_runtime": 873.4669,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 1200
    },
    {
      "epoch": 0.7638668779714739,
      "grad_norm": 0.29762017072344943,
      "learning_rate": 3.2040709584551095e-05,
      "loss": 0.2547,
      "step": 1205
    },
    {
      "epoch": 0.7670364500792393,
      "grad_norm": 0.35066724794986226,
      "learning_rate": 3.123282540009139e-05,
      "loss": 0.3043,
      "step": 1210
    },
    {
      "epoch": 0.7702060221870047,
      "grad_norm": 0.27108651599825634,
      "learning_rate": 3.0433367802639112e-05,
      "loss": 0.2195,
      "step": 1215
    },
    {
      "epoch": 0.7733755942947702,
      "grad_norm": 0.24030479810127725,
      "learning_rate": 2.9642434756070793e-05,
      "loss": 0.2545,
      "step": 1220
    },
    {
      "epoch": 0.7765451664025357,
      "grad_norm": 0.288327556838552,
      "learning_rate": 2.8860123179682242e-05,
      "loss": 0.2942,
      "step": 1225
    },
    {
      "epoch": 0.7797147385103012,
      "grad_norm": 0.29997783643544385,
      "learning_rate": 2.8086528936312073e-05,
      "loss": 0.2407,
      "step": 1230
    },
    {
      "epoch": 0.7828843106180665,
      "grad_norm": 0.2665313932594352,
      "learning_rate": 2.7321746820595086e-05,
      "loss": 0.2863,
      "step": 1235
    },
    {
      "epoch": 0.786053882725832,
      "grad_norm": 0.24138106294481415,
      "learning_rate": 2.6565870547346196e-05,
      "loss": 0.2443,
      "step": 1240
    },
    {
      "epoch": 0.7892234548335975,
      "grad_norm": 0.27410565336257203,
      "learning_rate": 2.5818992740076873e-05,
      "loss": 0.2714,
      "step": 1245
    },
    {
      "epoch": 0.7923930269413629,
      "grad_norm": 0.3607807135248553,
      "learning_rate": 2.508120491964512e-05,
      "loss": 0.3131,
      "step": 1250
    },
    {
      "epoch": 0.7955625990491284,
      "grad_norm": 0.2752324746545014,
      "learning_rate": 2.435259749304096e-05,
      "loss": 0.2352,
      "step": 1255
    },
    {
      "epoch": 0.7987321711568938,
      "grad_norm": 0.33701412326580854,
      "learning_rate": 2.3633259742307844e-05,
      "loss": 0.3121,
      "step": 1260
    },
    {
      "epoch": 0.8019017432646592,
      "grad_norm": 0.2719696587030905,
      "learning_rate": 2.292327981360245e-05,
      "loss": 0.2569,
      "step": 1265
    },
    {
      "epoch": 0.8050713153724247,
      "grad_norm": 0.321470064394813,
      "learning_rate": 2.222274470639324e-05,
      "loss": 0.2903,
      "step": 1270
    },
    {
      "epoch": 0.8082408874801902,
      "grad_norm": 0.33376441935823614,
      "learning_rate": 2.1531740262800004e-05,
      "loss": 0.2712,
      "step": 1275
    },
    {
      "epoch": 0.8114104595879557,
      "grad_norm": 0.3559808478292093,
      "learning_rate": 2.0850351157074598e-05,
      "loss": 0.2485,
      "step": 1280
    },
    {
      "epoch": 0.8145800316957211,
      "grad_norm": 0.3006799560470683,
      "learning_rate": 2.017866088522541e-05,
      "loss": 0.2735,
      "step": 1285
    },
    {
      "epoch": 0.8177496038034865,
      "grad_norm": 0.27868991819615774,
      "learning_rate": 1.951675175478579e-05,
      "loss": 0.2479,
      "step": 1290
    },
    {
      "epoch": 0.820919175911252,
      "grad_norm": 0.30796745550467525,
      "learning_rate": 1.8864704874728346e-05,
      "loss": 0.2693,
      "step": 1295
    },
    {
      "epoch": 0.8240887480190174,
      "grad_norm": 0.327384705590186,
      "learning_rate": 1.822260014552587e-05,
      "loss": 0.2787,
      "step": 1300
    },
    {
      "epoch": 0.8272583201267829,
      "grad_norm": 0.2993843751525639,
      "learning_rate": 1.7590516249360754e-05,
      "loss": 0.2455,
      "step": 1305
    },
    {
      "epoch": 0.8304278922345484,
      "grad_norm": 0.2979918507317238,
      "learning_rate": 1.6968530640483127e-05,
      "loss": 0.2889,
      "step": 1310
    },
    {
      "epoch": 0.8335974643423137,
      "grad_norm": 0.2942240760065363,
      "learning_rate": 1.6356719535720056e-05,
      "loss": 0.2557,
      "step": 1315
    },
    {
      "epoch": 0.8367670364500792,
      "grad_norm": 0.31698805935759067,
      "learning_rate": 1.5755157905135843e-05,
      "loss": 0.2842,
      "step": 1320
    },
    {
      "epoch": 0.8399366085578447,
      "grad_norm": 0.3795639487558114,
      "learning_rate": 1.5163919462845622e-05,
      "loss": 0.2979,
      "step": 1325
    },
    {
      "epoch": 0.8431061806656102,
      "grad_norm": 0.2933950396246441,
      "learning_rate": 1.4583076657982297e-05,
      "loss": 0.2291,
      "step": 1330
    },
    {
      "epoch": 0.8462757527733756,
      "grad_norm": 0.25934135222761445,
      "learning_rate": 1.401270066581899e-05,
      "loss": 0.2981,
      "step": 1335
    },
    {
      "epoch": 0.849445324881141,
      "grad_norm": 0.2512793866151091,
      "learning_rate": 1.3452861379047287e-05,
      "loss": 0.2299,
      "step": 1340
    },
    {
      "epoch": 0.8526148969889065,
      "grad_norm": 0.27890392188122143,
      "learning_rate": 1.2903627399212747e-05,
      "loss": 0.2714,
      "step": 1345
    },
    {
      "epoch": 0.8557844690966719,
      "grad_norm": 0.3540435753559853,
      "learning_rate": 1.2365066028308547e-05,
      "loss": 0.3208,
      "step": 1350
    },
    {
      "epoch": 0.8589540412044374,
      "grad_norm": 0.3170188652169802,
      "learning_rate": 1.183724326052854e-05,
      "loss": 0.261,
      "step": 1355
    },
    {
      "epoch": 0.8621236133122029,
      "grad_norm": 0.287259110452561,
      "learning_rate": 1.1320223774180428e-05,
      "loss": 0.2918,
      "step": 1360
    },
    {
      "epoch": 0.8652931854199684,
      "grad_norm": 0.3145063929825825,
      "learning_rate": 1.0814070923760178e-05,
      "loss": 0.2562,
      "step": 1365
    },
    {
      "epoch": 0.8684627575277337,
      "grad_norm": 0.29883537499670176,
      "learning_rate": 1.0318846732188737e-05,
      "loss": 0.2585,
      "step": 1370
    },
    {
      "epoch": 0.8716323296354992,
      "grad_norm": 0.33602754178177113,
      "learning_rate": 9.834611883211797e-06,
      "loss": 0.303,
      "step": 1375
    },
    {
      "epoch": 0.8748019017432647,
      "grad_norm": 0.27917699955310804,
      "learning_rate": 9.361425713963878e-06,
      "loss": 0.2399,
      "step": 1380
    },
    {
      "epoch": 0.8779714738510301,
      "grad_norm": 0.29322424380757633,
      "learning_rate": 8.899346207697134e-06,
      "loss": 0.3192,
      "step": 1385
    },
    {
      "epoch": 0.8811410459587956,
      "grad_norm": 0.32716078301472046,
      "learning_rate": 8.448429986676298e-06,
      "loss": 0.256,
      "step": 1390
    },
    {
      "epoch": 0.884310618066561,
      "grad_norm": 0.28468261231564157,
      "learning_rate": 8.00873230524023e-06,
      "loss": 0.2864,
      "step": 1395
    },
    {
      "epoch": 0.8874801901743264,
      "grad_norm": 0.3481974787604397,
      "learning_rate": 7.580307043031232e-06,
      "loss": 0.265,
      "step": 1400
    },
    {
      "epoch": 0.8874801901743264,
      "eval_loss": 0.25099214911460876,
      "eval_runtime": 873.7854,
      "eval_samples_per_second": 4.578,
      "eval_steps_per_second": 0.572,
      "step": 1400
    },
    {
      "epoch": 0.8906497622820919,
      "grad_norm": 0.2756744352775957,
      "learning_rate": 7.163206698392744e-06,
      "loss": 0.2392,
      "step": 1405
    },
    {
      "epoch": 0.8938193343898574,
      "grad_norm": 0.3070714015760399,
      "learning_rate": 6.757482381936264e-06,
      "loss": 0.2722,
      "step": 1410
    },
    {
      "epoch": 0.8969889064976229,
      "grad_norm": 0.2719682030351016,
      "learning_rate": 6.36318381027835e-06,
      "loss": 0.2553,
      "step": 1415
    },
    {
      "epoch": 0.9001584786053882,
      "grad_norm": 0.30754515515844727,
      "learning_rate": 5.980359299948568e-06,
      "loss": 0.2763,
      "step": 1420
    },
    {
      "epoch": 0.9033280507131537,
      "grad_norm": 0.3599613866897873,
      "learning_rate": 5.609055761468707e-06,
      "loss": 0.2987,
      "step": 1425
    },
    {
      "epoch": 0.9064976228209192,
      "grad_norm": 0.26662442413818216,
      "learning_rate": 5.249318693604577e-06,
      "loss": 0.2632,
      "step": 1430
    },
    {
      "epoch": 0.9096671949286846,
      "grad_norm": 0.2965993748242227,
      "learning_rate": 4.901192177790692e-06,
      "loss": 0.2799,
      "step": 1435
    },
    {
      "epoch": 0.9128367670364501,
      "grad_norm": 0.2923839300339188,
      "learning_rate": 4.564718872728568e-06,
      "loss": 0.2464,
      "step": 1440
    },
    {
      "epoch": 0.9160063391442155,
      "grad_norm": 0.3004256474409844,
      "learning_rate": 4.2399400091594154e-06,
      "loss": 0.2775,
      "step": 1445
    },
    {
      "epoch": 0.919175911251981,
      "grad_norm": 0.30636844288189197,
      "learning_rate": 3.926895384811835e-06,
      "loss": 0.2917,
      "step": 1450
    },
    {
      "epoch": 0.9223454833597464,
      "grad_norm": 0.27018058178290905,
      "learning_rate": 3.625623359525099e-06,
      "loss": 0.2522,
      "step": 1455
    },
    {
      "epoch": 0.9255150554675119,
      "grad_norm": 0.3069766309513976,
      "learning_rate": 3.33616085054862e-06,
      "loss": 0.2722,
      "step": 1460
    },
    {
      "epoch": 0.9286846275752774,
      "grad_norm": 0.2673579253849767,
      "learning_rate": 3.0585433280180707e-06,
      "loss": 0.2561,
      "step": 1465
    },
    {
      "epoch": 0.9318541996830428,
      "grad_norm": 0.2688001276727079,
      "learning_rate": 2.792804810609173e-06,
      "loss": 0.2718,
      "step": 1470
    },
    {
      "epoch": 0.9350237717908082,
      "grad_norm": 0.3331860222359942,
      "learning_rate": 2.538977861368874e-06,
      "loss": 0.3163,
      "step": 1475
    },
    {
      "epoch": 0.9381933438985737,
      "grad_norm": 0.2668325932813764,
      "learning_rate": 2.2970935837253182e-06,
      "loss": 0.2393,
      "step": 1480
    },
    {
      "epoch": 0.9413629160063391,
      "grad_norm": 0.3285498156618503,
      "learning_rate": 2.0671816176764058e-06,
      "loss": 0.2862,
      "step": 1485
    },
    {
      "epoch": 0.9445324881141046,
      "grad_norm": 0.36573862269188245,
      "learning_rate": 1.8492701361578324e-06,
      "loss": 0.2447,
      "step": 1490
    },
    {
      "epoch": 0.9477020602218701,
      "grad_norm": 0.2864139944423568,
      "learning_rate": 1.6433858415907278e-06,
      "loss": 0.2777,
      "step": 1495
    },
    {
      "epoch": 0.9508716323296355,
      "grad_norm": 0.323741034773291,
      "learning_rate": 1.4495539626097288e-06,
      "loss": 0.3086,
      "step": 1500
    },
    {
      "epoch": 0.9540412044374009,
      "grad_norm": 0.2857388007026186,
      "learning_rate": 1.2677982509714415e-06,
      "loss": 0.2175,
      "step": 1505
    },
    {
      "epoch": 0.9572107765451664,
      "grad_norm": 0.2813011213045847,
      "learning_rate": 1.0981409786439355e-06,
      "loss": 0.2882,
      "step": 1510
    },
    {
      "epoch": 0.9603803486529319,
      "grad_norm": 0.27685594779071976,
      "learning_rate": 9.40602935077639e-07,
      "loss": 0.23,
      "step": 1515
    },
    {
      "epoch": 0.9635499207606973,
      "grad_norm": 0.278082958417837,
      "learning_rate": 7.952034246577977e-07,
      "loss": 0.2814,
      "step": 1520
    },
    {
      "epoch": 0.9667194928684627,
      "grad_norm": 0.332411253150925,
      "learning_rate": 6.619602643389899e-07,
      "loss": 0.2772,
      "step": 1525
    },
    {
      "epoch": 0.9698890649762282,
      "grad_norm": 0.28541188324654354,
      "learning_rate": 5.408897814618175e-07,
      "loss": 0.2456,
      "step": 1530
    },
    {
      "epoch": 0.9730586370839936,
      "grad_norm": 0.289051402982161,
      "learning_rate": 4.320068117522835e-07,
      "loss": 0.2659,
      "step": 1535
    },
    {
      "epoch": 0.9762282091917591,
      "grad_norm": 0.2896831822321737,
      "learning_rate": 3.35324697503725e-07,
      "loss": 0.2721,
      "step": 1540
    },
    {
      "epoch": 0.9793977812995246,
      "grad_norm": 0.31950029347694936,
      "learning_rate": 2.508552859419977e-07,
      "loss": 0.2622,
      "step": 1545
    },
    {
      "epoch": 0.9825673534072901,
      "grad_norm": 0.33661523682392047,
      "learning_rate": 1.7860892777367133e-07,
      "loss": 0.2731,
      "step": 1550
    },
    {
      "epoch": 0.9857369255150554,
      "grad_norm": 0.2522879758084615,
      "learning_rate": 1.1859447591769934e-07,
      "loss": 0.2291,
      "step": 1555
    },
    {
      "epoch": 0.9889064976228209,
      "grad_norm": 0.2923729662272973,
      "learning_rate": 7.081928442057573e-08,
      "loss": 0.2972,
      "step": 1560
    },
    {
      "epoch": 0.9920760697305864,
      "grad_norm": 0.24814229174923821,
      "learning_rate": 3.5289207555233573e-08,
      "loss": 0.2586,
      "step": 1565
    },
    {
      "epoch": 0.9952456418383518,
      "grad_norm": 0.24322900794711846,
      "learning_rate": 1.2008599103618956e-08,
      "loss": 0.2751,
      "step": 1570
    },
    {
      "epoch": 0.9984152139461173,
      "grad_norm": 0.4374899080765362,
      "learning_rate": 9.803118232398768e-10,
      "loss": 0.2981,
      "step": 1575
    },
    {
      "epoch": 0.9996830427892235,
      "step": 1577,
      "total_flos": 8013042675351552.0,
      "train_loss": 0.3010184336819827,
      "train_runtime": 18281.3669,
      "train_samples_per_second": 1.381,
      "train_steps_per_second": 0.086
    }
  ],
  "logging_steps": 5,
  "max_steps": 1577,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8013042675351552.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}