{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2181,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004585052728106373,
"grad_norm": 25.35940676221757,
"learning_rate": 4.5662100456621004e-08,
"loss": 1.4356,
"step": 1
},
{
"epoch": 0.0022925263640531865,
"grad_norm": 23.277460508193656,
"learning_rate": 2.2831050228310502e-07,
"loss": 1.4178,
"step": 5
},
{
"epoch": 0.004585052728106373,
"grad_norm": 15.396159390081614,
"learning_rate": 4.5662100456621004e-07,
"loss": 1.3928,
"step": 10
},
{
"epoch": 0.0068775790921595595,
"grad_norm": 9.927996187561872,
"learning_rate": 6.849315068493151e-07,
"loss": 1.2487,
"step": 15
},
{
"epoch": 0.009170105456212746,
"grad_norm": 8.936136397262343,
"learning_rate": 9.132420091324201e-07,
"loss": 1.1467,
"step": 20
},
{
"epoch": 0.011462631820265932,
"grad_norm": 3.9419002716272007,
"learning_rate": 1.1415525114155251e-06,
"loss": 1.0321,
"step": 25
},
{
"epoch": 0.013755158184319119,
"grad_norm": 3.2824292809209212,
"learning_rate": 1.3698630136986302e-06,
"loss": 0.9911,
"step": 30
},
{
"epoch": 0.016047684548372305,
"grad_norm": 3.198808731865913,
"learning_rate": 1.5981735159817353e-06,
"loss": 0.9499,
"step": 35
},
{
"epoch": 0.018340210912425492,
"grad_norm": 3.200026153105945,
"learning_rate": 1.8264840182648401e-06,
"loss": 0.9394,
"step": 40
},
{
"epoch": 0.02063273727647868,
"grad_norm": 3.1015042038551264,
"learning_rate": 2.0547945205479454e-06,
"loss": 0.9374,
"step": 45
},
{
"epoch": 0.022925263640531865,
"grad_norm": 3.0638884680066116,
"learning_rate": 2.2831050228310503e-06,
"loss": 0.9366,
"step": 50
},
{
"epoch": 0.02521779000458505,
"grad_norm": 3.1218708697344337,
"learning_rate": 2.511415525114155e-06,
"loss": 0.9072,
"step": 55
},
{
"epoch": 0.027510316368638238,
"grad_norm": 3.030931859384564,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.896,
"step": 60
},
{
"epoch": 0.029802842732691424,
"grad_norm": 3.183215428730836,
"learning_rate": 2.9680365296803653e-06,
"loss": 0.904,
"step": 65
},
{
"epoch": 0.03209536909674461,
"grad_norm": 3.1193991823217884,
"learning_rate": 3.1963470319634706e-06,
"loss": 0.8992,
"step": 70
},
{
"epoch": 0.0343878954607978,
"grad_norm": 3.1562480345048662,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.9008,
"step": 75
},
{
"epoch": 0.036680421824850984,
"grad_norm": 3.1106379275365263,
"learning_rate": 3.6529680365296803e-06,
"loss": 0.8835,
"step": 80
},
{
"epoch": 0.03897294818890417,
"grad_norm": 3.1659334626442455,
"learning_rate": 3.881278538812785e-06,
"loss": 0.8798,
"step": 85
},
{
"epoch": 0.04126547455295736,
"grad_norm": 3.1010027836059533,
"learning_rate": 4.109589041095891e-06,
"loss": 0.879,
"step": 90
},
{
"epoch": 0.04355800091701054,
"grad_norm": 3.3519588401192273,
"learning_rate": 4.337899543378996e-06,
"loss": 0.8615,
"step": 95
},
{
"epoch": 0.04585052728106373,
"grad_norm": 3.049285908948199,
"learning_rate": 4.566210045662101e-06,
"loss": 0.8529,
"step": 100
},
{
"epoch": 0.048143053645116916,
"grad_norm": 3.109756439871898,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.8654,
"step": 105
},
{
"epoch": 0.0504355800091701,
"grad_norm": 3.1513505710159335,
"learning_rate": 5.02283105022831e-06,
"loss": 0.8663,
"step": 110
},
{
"epoch": 0.05272810637322329,
"grad_norm": 3.1767156567086614,
"learning_rate": 5.251141552511416e-06,
"loss": 0.8613,
"step": 115
},
{
"epoch": 0.055020632737276476,
"grad_norm": 3.453537287264967,
"learning_rate": 5.479452054794521e-06,
"loss": 0.8771,
"step": 120
},
{
"epoch": 0.05731315910132966,
"grad_norm": 3.013155684535603,
"learning_rate": 5.7077625570776266e-06,
"loss": 0.8473,
"step": 125
},
{
"epoch": 0.05960568546538285,
"grad_norm": 3.425642520518735,
"learning_rate": 5.936073059360731e-06,
"loss": 0.8521,
"step": 130
},
{
"epoch": 0.061898211829436035,
"grad_norm": 3.031927176672884,
"learning_rate": 6.164383561643836e-06,
"loss": 0.84,
"step": 135
},
{
"epoch": 0.06419073819348922,
"grad_norm": 3.239390421336056,
"learning_rate": 6.392694063926941e-06,
"loss": 0.859,
"step": 140
},
{
"epoch": 0.06648326455754242,
"grad_norm": 3.017820442924467,
"learning_rate": 6.621004566210046e-06,
"loss": 0.86,
"step": 145
},
{
"epoch": 0.0687757909215956,
"grad_norm": 3.0002036905279503,
"learning_rate": 6.849315068493151e-06,
"loss": 0.8525,
"step": 150
},
{
"epoch": 0.07106831728564879,
"grad_norm": 3.1828998491124016,
"learning_rate": 7.077625570776257e-06,
"loss": 0.8433,
"step": 155
},
{
"epoch": 0.07336084364970197,
"grad_norm": 3.087610569097963,
"learning_rate": 7.305936073059361e-06,
"loss": 0.8361,
"step": 160
},
{
"epoch": 0.07565337001375516,
"grad_norm": 3.115099552868115,
"learning_rate": 7.534246575342466e-06,
"loss": 0.8436,
"step": 165
},
{
"epoch": 0.07794589637780834,
"grad_norm": 3.1551201699069282,
"learning_rate": 7.76255707762557e-06,
"loss": 0.8311,
"step": 170
},
{
"epoch": 0.08023842274186153,
"grad_norm": 3.2013023977541617,
"learning_rate": 7.990867579908676e-06,
"loss": 0.8244,
"step": 175
},
{
"epoch": 0.08253094910591471,
"grad_norm": 3.1031180959674716,
"learning_rate": 8.219178082191782e-06,
"loss": 0.8362,
"step": 180
},
{
"epoch": 0.08482347546996791,
"grad_norm": 3.056534274967503,
"learning_rate": 8.447488584474887e-06,
"loss": 0.827,
"step": 185
},
{
"epoch": 0.08711600183402109,
"grad_norm": 2.8738007240926016,
"learning_rate": 8.675799086757991e-06,
"loss": 0.8264,
"step": 190
},
{
"epoch": 0.08940852819807428,
"grad_norm": 2.9833947743009044,
"learning_rate": 8.904109589041097e-06,
"loss": 0.8364,
"step": 195
},
{
"epoch": 0.09170105456212746,
"grad_norm": 3.0590617698737606,
"learning_rate": 9.132420091324201e-06,
"loss": 0.8385,
"step": 200
},
{
"epoch": 0.09399358092618065,
"grad_norm": 2.9544649860589964,
"learning_rate": 9.360730593607307e-06,
"loss": 0.8306,
"step": 205
},
{
"epoch": 0.09628610729023383,
"grad_norm": 3.156467119939513,
"learning_rate": 9.589041095890411e-06,
"loss": 0.812,
"step": 210
},
{
"epoch": 0.09857863365428703,
"grad_norm": 3.241792877196348,
"learning_rate": 9.817351598173517e-06,
"loss": 0.8098,
"step": 215
},
{
"epoch": 0.1008711600183402,
"grad_norm": 3.329896188306964,
"learning_rate": 9.999993590241675e-06,
"loss": 0.8321,
"step": 220
},
{
"epoch": 0.1031636863823934,
"grad_norm": 2.961456684151267,
"learning_rate": 9.999769250425817e-06,
"loss": 0.8296,
"step": 225
},
{
"epoch": 0.10545621274644658,
"grad_norm": 3.0123856993460723,
"learning_rate": 9.999224439127452e-06,
"loss": 0.8223,
"step": 230
},
{
"epoch": 0.10774873911049977,
"grad_norm": 3.1722352404227263,
"learning_rate": 9.998359191267488e-06,
"loss": 0.8183,
"step": 235
},
{
"epoch": 0.11004126547455295,
"grad_norm": 3.339283823835408,
"learning_rate": 9.997173562305937e-06,
"loss": 0.812,
"step": 240
},
{
"epoch": 0.11233379183860615,
"grad_norm": 3.051005936600519,
"learning_rate": 9.995667628238362e-06,
"loss": 0.8159,
"step": 245
},
{
"epoch": 0.11462631820265932,
"grad_norm": 3.621892868476315,
"learning_rate": 9.993841485591e-06,
"loss": 0.8265,
"step": 250
},
{
"epoch": 0.11691884456671252,
"grad_norm": 3.1501195933267727,
"learning_rate": 9.991695251414584e-06,
"loss": 0.7829,
"step": 255
},
{
"epoch": 0.1192113709307657,
"grad_norm": 3.2077051728198436,
"learning_rate": 9.989229063276829e-06,
"loss": 0.8061,
"step": 260
},
{
"epoch": 0.12150389729481889,
"grad_norm": 2.813867856532736,
"learning_rate": 9.986443079253628e-06,
"loss": 0.8088,
"step": 265
},
{
"epoch": 0.12379642365887207,
"grad_norm": 2.953479405448006,
"learning_rate": 9.983337477918904e-06,
"loss": 0.8013,
"step": 270
},
{
"epoch": 0.12608895002292525,
"grad_norm": 2.9765536692485752,
"learning_rate": 9.979912458333179e-06,
"loss": 0.8112,
"step": 275
},
{
"epoch": 0.12838147638697844,
"grad_norm": 2.9261553011693313,
"learning_rate": 9.976168240030804e-06,
"loss": 0.797,
"step": 280
},
{
"epoch": 0.13067400275103164,
"grad_norm": 2.7549890848982668,
"learning_rate": 9.972105063005895e-06,
"loss": 0.8047,
"step": 285
},
{
"epoch": 0.13296652911508483,
"grad_norm": 2.783923747108222,
"learning_rate": 9.96772318769694e-06,
"loss": 0.8045,
"step": 290
},
{
"epoch": 0.13525905547913802,
"grad_norm": 2.922181282361273,
"learning_rate": 9.96302289497012e-06,
"loss": 0.7891,
"step": 295
},
{
"epoch": 0.1375515818431912,
"grad_norm": 2.8387565382348807,
"learning_rate": 9.958004486101293e-06,
"loss": 0.7756,
"step": 300
},
{
"epoch": 0.13984410820724438,
"grad_norm": 2.869327340764152,
"learning_rate": 9.952668282756692e-06,
"loss": 0.8027,
"step": 305
},
{
"epoch": 0.14213663457129758,
"grad_norm": 2.874303723785054,
"learning_rate": 9.947014626972298e-06,
"loss": 0.7826,
"step": 310
},
{
"epoch": 0.14442916093535077,
"grad_norm": 2.737834462358364,
"learning_rate": 9.941043881131928e-06,
"loss": 0.7702,
"step": 315
},
{
"epoch": 0.14672168729940394,
"grad_norm": 2.858629644409334,
"learning_rate": 9.934756427943996e-06,
"loss": 0.7761,
"step": 320
},
{
"epoch": 0.14901421366345713,
"grad_norm": 2.941702373835629,
"learning_rate": 9.92815267041699e-06,
"loss": 0.7778,
"step": 325
},
{
"epoch": 0.15130674002751032,
"grad_norm": 2.832449171435636,
"learning_rate": 9.921233031833639e-06,
"loss": 0.7747,
"step": 330
},
{
"epoch": 0.15359926639156352,
"grad_norm": 2.838327247569131,
"learning_rate": 9.913997955723777e-06,
"loss": 0.7798,
"step": 335
},
{
"epoch": 0.15589179275561668,
"grad_norm": 3.0053878829121357,
"learning_rate": 9.90644790583592e-06,
"loss": 0.7504,
"step": 340
},
{
"epoch": 0.15818431911966988,
"grad_norm": 2.737407601036532,
"learning_rate": 9.898583366107539e-06,
"loss": 0.7655,
"step": 345
},
{
"epoch": 0.16047684548372307,
"grad_norm": 3.0259958169837717,
"learning_rate": 9.890404840634037e-06,
"loss": 0.7582,
"step": 350
},
{
"epoch": 0.16276937184777626,
"grad_norm": 2.804766086619055,
"learning_rate": 9.881912853636445e-06,
"loss": 0.7747,
"step": 355
},
{
"epoch": 0.16506189821182943,
"grad_norm": 2.7915942235581785,
"learning_rate": 9.873107949427815e-06,
"loss": 0.7584,
"step": 360
},
{
"epoch": 0.16735442457588262,
"grad_norm": 2.8708773578370588,
"learning_rate": 9.863990692378333e-06,
"loss": 0.7538,
"step": 365
},
{
"epoch": 0.16964695093993581,
"grad_norm": 2.8372441642155097,
"learning_rate": 9.854561666879148e-06,
"loss": 0.7457,
"step": 370
},
{
"epoch": 0.171939477303989,
"grad_norm": 2.7820083192682197,
"learning_rate": 9.844821477304904e-06,
"loss": 0.775,
"step": 375
},
{
"epoch": 0.17423200366804217,
"grad_norm": 2.6780715561867066,
"learning_rate": 9.834770747975015e-06,
"loss": 0.7442,
"step": 380
},
{
"epoch": 0.17652453003209537,
"grad_norm": 2.7545319149727763,
"learning_rate": 9.824410123113634e-06,
"loss": 0.7416,
"step": 385
},
{
"epoch": 0.17881705639614856,
"grad_norm": 2.6402444423405225,
"learning_rate": 9.813740266808375e-06,
"loss": 0.7362,
"step": 390
},
{
"epoch": 0.18110958276020175,
"grad_norm": 2.730909608534738,
"learning_rate": 9.802761862967731e-06,
"loss": 0.7252,
"step": 395
},
{
"epoch": 0.18340210912425492,
"grad_norm": 2.9284254959639355,
"learning_rate": 9.791475615277248e-06,
"loss": 0.7453,
"step": 400
},
{
"epoch": 0.1856946354883081,
"grad_norm": 2.790088757652803,
"learning_rate": 9.779882247154419e-06,
"loss": 0.7344,
"step": 405
},
{
"epoch": 0.1879871618523613,
"grad_norm": 2.725250925456166,
"learning_rate": 9.76798250170231e-06,
"loss": 0.7246,
"step": 410
},
{
"epoch": 0.1902796882164145,
"grad_norm": 2.667869321574359,
"learning_rate": 9.755777141661937e-06,
"loss": 0.7193,
"step": 415
},
{
"epoch": 0.19257221458046767,
"grad_norm": 2.5119646512097997,
"learning_rate": 9.743266949363368e-06,
"loss": 0.7402,
"step": 420
},
{
"epoch": 0.19486474094452086,
"grad_norm": 2.847215415311532,
"learning_rate": 9.730452726675583e-06,
"loss": 0.7173,
"step": 425
},
{
"epoch": 0.19715726730857405,
"grad_norm": 2.779126735326216,
"learning_rate": 9.717335294955078e-06,
"loss": 0.7157,
"step": 430
},
{
"epoch": 0.19944979367262725,
"grad_norm": 3.4561646981046454,
"learning_rate": 9.703915494993215e-06,
"loss": 0.7312,
"step": 435
},
{
"epoch": 0.2017423200366804,
"grad_norm": 2.7730394910581913,
"learning_rate": 9.690194186962326e-06,
"loss": 0.7335,
"step": 440
},
{
"epoch": 0.2040348464007336,
"grad_norm": 2.859201150645261,
"learning_rate": 9.676172250360583e-06,
"loss": 0.7383,
"step": 445
},
{
"epoch": 0.2063273727647868,
"grad_norm": 2.9209175577350313,
"learning_rate": 9.66185058395563e-06,
"loss": 0.7263,
"step": 450
},
{
"epoch": 0.20861989912884,
"grad_norm": 2.704547531489439,
"learning_rate": 9.647230105726963e-06,
"loss": 0.7143,
"step": 455
},
{
"epoch": 0.21091242549289316,
"grad_norm": 2.670951446360455,
"learning_rate": 9.632311752807097e-06,
"loss": 0.7307,
"step": 460
},
{
"epoch": 0.21320495185694635,
"grad_norm": 3.2268092839390485,
"learning_rate": 9.617096481421498e-06,
"loss": 0.6985,
"step": 465
},
{
"epoch": 0.21549747822099954,
"grad_norm": 2.939723635315935,
"learning_rate": 9.601585266827288e-06,
"loss": 0.7181,
"step": 470
},
{
"epoch": 0.21779000458505274,
"grad_norm": 2.7240300289732082,
"learning_rate": 9.58577910325074e-06,
"loss": 0.7079,
"step": 475
},
{
"epoch": 0.2200825309491059,
"grad_norm": 2.7348057628577815,
"learning_rate": 9.569679003823542e-06,
"loss": 0.7063,
"step": 480
},
{
"epoch": 0.2223750573131591,
"grad_norm": 2.6209148336683894,
"learning_rate": 9.55328600051787e-06,
"loss": 0.7019,
"step": 485
},
{
"epoch": 0.2246675836772123,
"grad_norm": 2.7094717894075093,
"learning_rate": 9.536601144080224e-06,
"loss": 0.6933,
"step": 490
},
{
"epoch": 0.22696011004126548,
"grad_norm": 2.6005478056383393,
"learning_rate": 9.5196255039641e-06,
"loss": 0.7008,
"step": 495
},
{
"epoch": 0.22925263640531865,
"grad_norm": 2.9435017052734933,
"learning_rate": 9.502360168261424e-06,
"loss": 0.7168,
"step": 500
},
{
"epoch": 0.23154516276937184,
"grad_norm": 15.281241231781962,
"learning_rate": 9.48480624363281e-06,
"loss": 0.6968,
"step": 505
},
{
"epoch": 0.23383768913342504,
"grad_norm": 2.803746155734926,
"learning_rate": 9.46696485523664e-06,
"loss": 0.7176,
"step": 510
},
{
"epoch": 0.23613021549747823,
"grad_norm": 2.9572910983459275,
"learning_rate": 9.448837146656924e-06,
"loss": 0.6983,
"step": 515
},
{
"epoch": 0.2384227418615314,
"grad_norm": 2.66575290909559,
"learning_rate": 9.430424279830014e-06,
"loss": 0.679,
"step": 520
},
{
"epoch": 0.2407152682255846,
"grad_norm": 2.6071015601683056,
"learning_rate": 9.411727434970121e-06,
"loss": 0.6796,
"step": 525
},
{
"epoch": 0.24300779458963778,
"grad_norm": 2.6190152299969975,
"learning_rate": 9.392747810493675e-06,
"loss": 0.6922,
"step": 530
},
{
"epoch": 0.24530032095369098,
"grad_norm": 2.9035286162764624,
"learning_rate": 9.373486622942494e-06,
"loss": 0.6881,
"step": 535
},
{
"epoch": 0.24759284731774414,
"grad_norm": 2.722112266367375,
"learning_rate": 9.353945106905822e-06,
"loss": 0.691,
"step": 540
},
{
"epoch": 0.24988537368179733,
"grad_norm": 2.8551591177378173,
"learning_rate": 9.334124514941185e-06,
"loss": 0.6786,
"step": 545
},
{
"epoch": 0.2521779000458505,
"grad_norm": 2.789372421806793,
"learning_rate": 9.314026117494116e-06,
"loss": 0.6965,
"step": 550
},
{
"epoch": 0.2544704264099037,
"grad_norm": 2.943178087845294,
"learning_rate": 9.29365120281671e-06,
"loss": 0.6734,
"step": 555
},
{
"epoch": 0.2567629527739569,
"grad_norm": 2.9269593678262678,
"learning_rate": 9.273001076885059e-06,
"loss": 0.6567,
"step": 560
},
{
"epoch": 0.2590554791380101,
"grad_norm": 2.7577714835234457,
"learning_rate": 9.252077063315545e-06,
"loss": 0.6628,
"step": 565
},
{
"epoch": 0.2613480055020633,
"grad_norm": 2.595587224144848,
"learning_rate": 9.230880503279991e-06,
"loss": 0.6593,
"step": 570
},
{
"epoch": 0.26364053186611647,
"grad_norm": 2.6421320876444425,
"learning_rate": 9.209412755419703e-06,
"loss": 0.6616,
"step": 575
},
{
"epoch": 0.26593305823016966,
"grad_norm": 2.5889083746551487,
"learning_rate": 9.18767519575838e-06,
"loss": 0.6574,
"step": 580
},
{
"epoch": 0.26822558459422285,
"grad_norm": 2.644361824371662,
"learning_rate": 9.165669217613919e-06,
"loss": 0.6631,
"step": 585
},
{
"epoch": 0.27051811095827605,
"grad_norm": 2.7328270481402166,
"learning_rate": 9.143396231509102e-06,
"loss": 0.6591,
"step": 590
},
{
"epoch": 0.2728106373223292,
"grad_norm": 2.6202953814608247,
"learning_rate": 9.12085766508119e-06,
"loss": 0.6465,
"step": 595
},
{
"epoch": 0.2751031636863824,
"grad_norm": 2.688621083531908,
"learning_rate": 9.098054962990415e-06,
"loss": 0.6678,
"step": 600
},
{
"epoch": 0.2773956900504356,
"grad_norm": 2.684577688850206,
"learning_rate": 9.074989586827375e-06,
"loss": 0.6478,
"step": 605
},
{
"epoch": 0.27968821641448877,
"grad_norm": 2.6991742230220708,
"learning_rate": 9.05166301501936e-06,
"loss": 0.6575,
"step": 610
},
{
"epoch": 0.28198074277854196,
"grad_norm": 2.8422733898390353,
"learning_rate": 9.028076742735583e-06,
"loss": 0.6606,
"step": 615
},
{
"epoch": 0.28427326914259515,
"grad_norm": 3.3111069999457174,
"learning_rate": 9.004232281791341e-06,
"loss": 0.6501,
"step": 620
},
{
"epoch": 0.28656579550664835,
"grad_norm": 2.8352207612326676,
"learning_rate": 8.980131160551118e-06,
"loss": 0.6497,
"step": 625
},
{
"epoch": 0.28885832187070154,
"grad_norm": 2.622577509095012,
"learning_rate": 8.955774923830618e-06,
"loss": 0.6265,
"step": 630
},
{
"epoch": 0.2911508482347547,
"grad_norm": 2.6180287881898363,
"learning_rate": 8.931165132797747e-06,
"loss": 0.6397,
"step": 635
},
{
"epoch": 0.29344337459880787,
"grad_norm": 2.7463986227282713,
"learning_rate": 8.906303364872545e-06,
"loss": 0.6668,
"step": 640
},
{
"epoch": 0.29573590096286106,
"grad_norm": 2.6468423935127254,
"learning_rate": 8.881191213626084e-06,
"loss": 0.6393,
"step": 645
},
{
"epoch": 0.29802842732691426,
"grad_norm": 2.6005030935816245,
"learning_rate": 8.855830288678311e-06,
"loss": 0.644,
"step": 650
},
{
"epoch": 0.30032095369096745,
"grad_norm": 2.7192686848560554,
"learning_rate": 8.83022221559489e-06,
"loss": 0.6479,
"step": 655
},
{
"epoch": 0.30261348005502064,
"grad_norm": 2.673457233400223,
"learning_rate": 8.804368635783002e-06,
"loss": 0.6384,
"step": 660
},
{
"epoch": 0.30490600641907384,
"grad_norm": 2.850654385793331,
"learning_rate": 8.778271206386135e-06,
"loss": 0.6456,
"step": 665
},
{
"epoch": 0.30719853278312703,
"grad_norm": 2.6958806241423643,
"learning_rate": 8.751931600177863e-06,
"loss": 0.6025,
"step": 670
},
{
"epoch": 0.30949105914718017,
"grad_norm": 2.764991202053115,
"learning_rate": 8.725351505454631e-06,
"loss": 0.6194,
"step": 675
},
{
"epoch": 0.31178358551123336,
"grad_norm": 2.6590991144561906,
"learning_rate": 8.69853262592754e-06,
"loss": 0.6348,
"step": 680
},
{
"epoch": 0.31407611187528656,
"grad_norm": 2.708732600879308,
"learning_rate": 8.671476680613134e-06,
"loss": 0.6411,
"step": 685
},
{
"epoch": 0.31636863823933975,
"grad_norm": 2.5456418831079457,
"learning_rate": 8.644185403723231e-06,
"loss": 0.6138,
"step": 690
},
{
"epoch": 0.31866116460339294,
"grad_norm": 2.903106819651818,
"learning_rate": 8.616660544553754e-06,
"loss": 0.6237,
"step": 695
},
{
"epoch": 0.32095369096744614,
"grad_norm": 2.7280408027219942,
"learning_rate": 8.588903867372607e-06,
"loss": 0.6138,
"step": 700
},
{
"epoch": 0.32324621733149933,
"grad_norm": 2.886662280669305,
"learning_rate": 8.560917151306594e-06,
"loss": 0.6066,
"step": 705
},
{
"epoch": 0.3255387436955525,
"grad_norm": 2.6016420791711994,
"learning_rate": 8.53270219022738e-06,
"loss": 0.6126,
"step": 710
},
{
"epoch": 0.32783127005960566,
"grad_norm": 2.5696831024854827,
"learning_rate": 8.50426079263651e-06,
"loss": 0.6191,
"step": 715
},
{
"epoch": 0.33012379642365886,
"grad_norm": 2.789642739261612,
"learning_rate": 8.475594781549483e-06,
"loss": 0.6171,
"step": 720
},
{
"epoch": 0.33241632278771205,
"grad_norm": 2.662350967821026,
"learning_rate": 8.446705994378913e-06,
"loss": 0.6262,
"step": 725
},
{
"epoch": 0.33470884915176524,
"grad_norm": 2.749133969632543,
"learning_rate": 8.417596282816742e-06,
"loss": 0.6084,
"step": 730
},
{
"epoch": 0.33700137551581844,
"grad_norm": 2.8389384155162736,
"learning_rate": 8.388267512715565e-06,
"loss": 0.6089,
"step": 735
},
{
"epoch": 0.33929390187987163,
"grad_norm": 2.6423715957870115,
"learning_rate": 8.358721563969027e-06,
"loss": 0.5912,
"step": 740
},
{
"epoch": 0.3415864282439248,
"grad_norm": 2.582427374014035,
"learning_rate": 8.328960330391325e-06,
"loss": 0.6015,
"step": 745
},
{
"epoch": 0.343878954607978,
"grad_norm": 2.5641005198848763,
"learning_rate": 8.298985719595824e-06,
"loss": 0.6127,
"step": 750
},
{
"epoch": 0.34617148097203115,
"grad_norm": 2.573968171901929,
"learning_rate": 8.268799652872786e-06,
"loss": 0.6108,
"step": 755
},
{
"epoch": 0.34846400733608435,
"grad_norm": 2.555840575858041,
"learning_rate": 8.23840406506621e-06,
"loss": 0.6013,
"step": 760
},
{
"epoch": 0.35075653370013754,
"grad_norm": 2.608505400595271,
"learning_rate": 8.207800904449829e-06,
"loss": 0.5868,
"step": 765
},
{
"epoch": 0.35304906006419073,
"grad_norm": 2.564041005915397,
"learning_rate": 8.176992132602221e-06,
"loss": 0.5935,
"step": 770
},
{
"epoch": 0.3553415864282439,
"grad_norm": 2.835188198766609,
"learning_rate": 8.145979724281079e-06,
"loss": 0.577,
"step": 775
},
{
"epoch": 0.3576341127922971,
"grad_norm": 2.624154236961289,
"learning_rate": 8.114765667296628e-06,
"loss": 0.5807,
"step": 780
},
{
"epoch": 0.3599266391563503,
"grad_norm": 2.803920892055745,
"learning_rate": 8.083351962384234e-06,
"loss": 0.5827,
"step": 785
},
{
"epoch": 0.3622191655204035,
"grad_norm": 2.7453769474392438,
"learning_rate": 8.051740623076132e-06,
"loss": 0.5743,
"step": 790
},
{
"epoch": 0.36451169188445665,
"grad_norm": 2.642012832230722,
"learning_rate": 8.019933675572389e-06,
"loss": 0.5924,
"step": 795
},
{
"epoch": 0.36680421824850984,
"grad_norm": 2.5959618878893496,
"learning_rate": 7.987933158611013e-06,
"loss": 0.5765,
"step": 800
},
{
"epoch": 0.36909674461256303,
"grad_norm": 2.6981842811728107,
"learning_rate": 7.95574112333729e-06,
"loss": 0.5636,
"step": 805
},
{
"epoch": 0.3713892709766162,
"grad_norm": 2.7155825019244246,
"learning_rate": 7.923359633172299e-06,
"loss": 0.5676,
"step": 810
},
{
"epoch": 0.3736817973406694,
"grad_norm": 2.722727252289237,
"learning_rate": 7.890790763680658e-06,
"loss": 0.5849,
"step": 815
},
{
"epoch": 0.3759743237047226,
"grad_norm": 2.5941959497564073,
"learning_rate": 7.85803660243749e-06,
"loss": 0.582,
"step": 820
},
{
"epoch": 0.3782668500687758,
"grad_norm": 2.448527666302428,
"learning_rate": 7.8250992488946e-06,
"loss": 0.586,
"step": 825
},
{
"epoch": 0.380559376432829,
"grad_norm": 2.786081596311819,
"learning_rate": 7.791980814245931e-06,
"loss": 0.5547,
"step": 830
},
{
"epoch": 0.38285190279688214,
"grad_norm": 2.6225345564151237,
"learning_rate": 7.758683421292217e-06,
"loss": 0.5562,
"step": 835
},
{
"epoch": 0.38514442916093533,
"grad_norm": 2.495977821656378,
"learning_rate": 7.72520920430493e-06,
"loss": 0.5728,
"step": 840
},
{
"epoch": 0.3874369555249885,
"grad_norm": 2.5523314447232535,
"learning_rate": 7.691560308889478e-06,
"loss": 0.5748,
"step": 845
},
{
"epoch": 0.3897294818890417,
"grad_norm": 2.702511447586494,
"learning_rate": 7.657738891847679e-06,
"loss": 0.5651,
"step": 850
},
{
"epoch": 0.3920220082530949,
"grad_norm": 2.6729070020445533,
"learning_rate": 7.623747121039512e-06,
"loss": 0.5716,
"step": 855
},
{
"epoch": 0.3943145346171481,
"grad_norm": 2.7351708064638665,
"learning_rate": 7.589587175244162e-06,
"loss": 0.565,
"step": 860
},
{
"epoch": 0.3966070609812013,
"grad_norm": 2.5916997954156638,
"learning_rate": 7.555261244020371e-06,
"loss": 0.5691,
"step": 865
},
{
"epoch": 0.3988995873452545,
"grad_norm": 2.4806248685486407,
"learning_rate": 7.520771527566093e-06,
"loss": 0.5672,
"step": 870
},
{
"epoch": 0.40119211370930763,
"grad_norm": 2.691711711440267,
"learning_rate": 7.486120236577464e-06,
"loss": 0.5555,
"step": 875
},
{
"epoch": 0.4034846400733608,
"grad_norm": 2.6506103202422797,
"learning_rate": 7.451309592107104e-06,
"loss": 0.5548,
"step": 880
},
{
"epoch": 0.405777166437414,
"grad_norm": 2.5210545941984983,
"learning_rate": 7.416341825421755e-06,
"loss": 0.573,
"step": 885
},
{
"epoch": 0.4080696928014672,
"grad_norm": 2.7103495153803627,
"learning_rate": 7.381219177859257e-06,
"loss": 0.5428,
"step": 890
},
{
"epoch": 0.4103622191655204,
"grad_norm": 2.5223081344987826,
"learning_rate": 7.345943900684896e-06,
"loss": 0.5605,
"step": 895
},
{
"epoch": 0.4126547455295736,
"grad_norm": 2.5684242617186364,
"learning_rate": 7.310518254947092e-06,
"loss": 0.5432,
"step": 900
},
{
"epoch": 0.4149472718936268,
"grad_norm": 2.8905063764239327,
"learning_rate": 7.274944511332479e-06,
"loss": 0.5355,
"step": 905
},
{
"epoch": 0.41723979825768,
"grad_norm": 2.7288840976281543,
"learning_rate": 7.239224950020359e-06,
"loss": 0.5583,
"step": 910
},
{
"epoch": 0.4195323246217332,
"grad_norm": 2.573090270715344,
"learning_rate": 7.203361860536544e-06,
"loss": 0.5528,
"step": 915
},
{
"epoch": 0.4218248509857863,
"grad_norm": 2.7074335935753897,
"learning_rate": 7.167357541606613e-06,
"loss": 0.5457,
"step": 920
},
{
"epoch": 0.4241173773498395,
"grad_norm": 2.6225623425429614,
"learning_rate": 7.131214301008564e-06,
"loss": 0.5405,
"step": 925
},
{
"epoch": 0.4264099037138927,
"grad_norm": 2.638186367850455,
"learning_rate": 7.094934455424889e-06,
"loss": 0.5457,
"step": 930
},
{
"epoch": 0.4287024300779459,
"grad_norm": 2.663625944879504,
"learning_rate": 7.058520330294087e-06,
"loss": 0.5499,
"step": 935
},
{
"epoch": 0.4309949564419991,
"grad_norm": 2.594656111210185,
"learning_rate": 7.021974259661607e-06,
"loss": 0.5471,
"step": 940
},
{
"epoch": 0.4332874828060523,
"grad_norm": 2.558300587882855,
"learning_rate": 6.985298586030241e-06,
"loss": 0.5465,
"step": 945
},
{
"epoch": 0.4355800091701055,
"grad_norm": 2.6435075817238425,
"learning_rate": 6.948495660209983e-06,
"loss": 0.5331,
"step": 950
},
{
"epoch": 0.43787253553415867,
"grad_norm": 2.494991656905618,
"learning_rate": 6.9115678411673345e-06,
"loss": 0.5371,
"step": 955
},
{
"epoch": 0.4401650618982118,
"grad_norm": 2.4881542600695643,
"learning_rate": 6.8745174958741164e-06,
"loss": 0.5329,
"step": 960
},
{
"epoch": 0.442457588262265,
"grad_norm": 2.552409503690461,
"learning_rate": 6.837346999155743e-06,
"loss": 0.532,
"step": 965
},
{
"epoch": 0.4447501146263182,
"grad_norm": 2.4970182042863445,
"learning_rate": 6.800058733539003e-06,
"loss": 0.5376,
"step": 970
},
{
"epoch": 0.4470426409903714,
"grad_norm": 2.468594629574796,
"learning_rate": 6.762655089099353e-06,
"loss": 0.513,
"step": 975
},
{
"epoch": 0.4493351673544246,
"grad_norm": 2.5797501981324453,
"learning_rate": 6.725138463307714e-06,
"loss": 0.5408,
"step": 980
},
{
"epoch": 0.4516276937184778,
"grad_norm": 2.8482359445979246,
"learning_rate": 6.687511260876799e-06,
"loss": 0.5189,
"step": 985
},
{
"epoch": 0.45392022008253097,
"grad_norm": 2.6612518014120816,
"learning_rate": 6.649775893606982e-06,
"loss": 0.5318,
"step": 990
},
{
"epoch": 0.45621274644658416,
"grad_norm": 2.5372082111080347,
"learning_rate": 6.611934780231704e-06,
"loss": 0.5076,
"step": 995
},
{
"epoch": 0.4585052728106373,
"grad_norm": 2.4460238122171916,
"learning_rate": 6.573990346262445e-06,
"loss": 0.5028,
"step": 1000
},
{
"epoch": 0.4607977991746905,
"grad_norm": 2.5523381259232747,
"learning_rate": 6.535945023833249e-06,
"loss": 0.5188,
"step": 1005
},
{
"epoch": 0.4630903255387437,
"grad_norm": 2.6717883324323104,
"learning_rate": 6.497801251544833e-06,
"loss": 0.5137,
"step": 1010
},
{
"epoch": 0.4653828519027969,
"grad_norm": 2.4441200104866763,
"learning_rate": 6.459561474308278e-06,
"loss": 0.513,
"step": 1015
},
{
"epoch": 0.4676753782668501,
"grad_norm": 2.4626953473958046,
"learning_rate": 6.421228143188325e-06,
"loss": 0.5241,
"step": 1020
},
{
"epoch": 0.46996790463090327,
"grad_norm": 2.414799048761899,
"learning_rate": 6.382803715246254e-06,
"loss": 0.5265,
"step": 1025
},
{
"epoch": 0.47226043099495646,
"grad_norm": 2.661888186403354,
"learning_rate": 6.344290653382408e-06,
"loss": 0.5122,
"step": 1030
},
{
"epoch": 0.47455295735900965,
"grad_norm": 2.705613301623184,
"learning_rate": 6.305691426178316e-06,
"loss": 0.5076,
"step": 1035
},
{
"epoch": 0.4768454837230628,
"grad_norm": 2.5901180556298007,
"learning_rate": 6.267008507738472e-06,
"loss": 0.5309,
"step": 1040
},
{
"epoch": 0.479138010087116,
"grad_norm": 2.5393961483789345,
"learning_rate": 6.228244377531747e-06,
"loss": 0.506,
"step": 1045
},
{
"epoch": 0.4814305364511692,
"grad_norm": 2.5959034041763154,
"learning_rate": 6.189401520232464e-06,
"loss": 0.5065,
"step": 1050
},
{
"epoch": 0.48372306281522237,
"grad_norm": 2.6419168193929963,
"learning_rate": 6.150482425561135e-06,
"loss": 0.5189,
"step": 1055
},
{
"epoch": 0.48601558917927556,
"grad_norm": 2.58024430648069,
"learning_rate": 6.11148958812488e-06,
"loss": 0.5071,
"step": 1060
},
{
"epoch": 0.48830811554332876,
"grad_norm": 2.4501378891077987,
"learning_rate": 6.072425507257528e-06,
"loss": 0.5033,
"step": 1065
},
{
"epoch": 0.49060064190738195,
"grad_norm": 2.783006969507733,
"learning_rate": 6.033292686859414e-06,
"loss": 0.4955,
"step": 1070
},
{
"epoch": 0.49289316827143514,
"grad_norm": 2.428894458608491,
"learning_rate": 5.99409363523689e-06,
"loss": 0.4973,
"step": 1075
},
{
"epoch": 0.4951856946354883,
"grad_norm": 2.7389561374869342,
"learning_rate": 5.9548308649415486e-06,
"loss": 0.5051,
"step": 1080
},
{
"epoch": 0.4974782209995415,
"grad_norm": 2.5456232835838124,
"learning_rate": 5.91550689260917e-06,
"loss": 0.4935,
"step": 1085
},
{
"epoch": 0.49977074736359467,
"grad_norm": 2.6057045786417685,
"learning_rate": 5.876124238798424e-06,
"loss": 0.501,
"step": 1090
},
{
"epoch": 0.5020632737276479,
"grad_norm": 2.4695060680872873,
"learning_rate": 5.836685427829296e-06,
"loss": 0.5032,
"step": 1095
},
{
"epoch": 0.504355800091701,
"grad_norm": 2.3783397469941376,
"learning_rate": 5.797192987621293e-06,
"loss": 0.4985,
"step": 1100
},
{
"epoch": 0.5066483264557542,
"grad_norm": 2.491153548859691,
"learning_rate": 5.7576494495314105e-06,
"loss": 0.5043,
"step": 1105
},
{
"epoch": 0.5089408528198074,
"grad_norm": 2.6062141152111673,
"learning_rate": 5.718057348191874e-06,
"loss": 0.4868,
"step": 1110
},
{
"epoch": 0.5112333791838606,
"grad_norm": 2.5012205713207405,
"learning_rate": 5.678419221347687e-06,
"loss": 0.4979,
"step": 1115
},
{
"epoch": 0.5135259055479138,
"grad_norm": 2.609877005241944,
"learning_rate": 5.638737609693953e-06,
"loss": 0.495,
"step": 1120
},
{
"epoch": 0.515818431911967,
"grad_norm": 2.684672446431491,
"learning_rate": 5.599015056713037e-06,
"loss": 0.4823,
"step": 1125
},
{
"epoch": 0.5181109582760202,
"grad_norm": 2.4771534112729228,
"learning_rate": 5.559254108511531e-06,
"loss": 0.5016,
"step": 1130
},
{
"epoch": 0.5204034846400734,
"grad_norm": 2.46810743209868,
"learning_rate": 5.519457313657056e-06,
"loss": 0.4896,
"step": 1135
},
{
"epoch": 0.5226960110041265,
"grad_norm": 2.5795208204825983,
"learning_rate": 5.479627223014902e-06,
"loss": 0.4886,
"step": 1140
},
{
"epoch": 0.5249885373681797,
"grad_norm": 2.434086073989824,
"learning_rate": 5.439766389584527e-06,
"loss": 0.4865,
"step": 1145
},
{
"epoch": 0.5272810637322329,
"grad_norm": 2.4538097489169934,
"learning_rate": 5.399877368335922e-06,
"loss": 0.4914,
"step": 1150
},
{
"epoch": 0.5295735900962861,
"grad_norm": 2.5415775013932063,
"learning_rate": 5.359962716045836e-06,
"loss": 0.4936,
"step": 1155
},
{
"epoch": 0.5318661164603393,
"grad_norm": 2.56697946552087,
"learning_rate": 5.3200249911338986e-06,
"loss": 0.4894,
"step": 1160
},
{
"epoch": 0.5341586428243925,
"grad_norm": 2.572922499741503,
"learning_rate": 5.280066753498632e-06,
"loss": 0.4794,
"step": 1165
},
{
"epoch": 0.5364511691884457,
"grad_norm": 2.623599926005301,
"learning_rate": 5.240090564353365e-06,
"loss": 0.4959,
"step": 1170
},
{
"epoch": 0.5387436955524989,
"grad_norm": 2.4231120561633324,
"learning_rate": 5.200098986062072e-06,
"loss": 0.4753,
"step": 1175
},
{
"epoch": 0.5410362219165521,
"grad_norm": 2.5196186316057108,
"learning_rate": 5.160094581975127e-06,
"loss": 0.4783,
"step": 1180
},
{
"epoch": 0.5433287482806052,
"grad_norm": 2.527690400984075,
"learning_rate": 5.1200799162650035e-06,
"loss": 0.4916,
"step": 1185
},
{
"epoch": 0.5456212746446584,
"grad_norm": 2.6015322908629415,
"learning_rate": 5.080057553761917e-06,
"loss": 0.4738,
"step": 1190
},
{
"epoch": 0.5479138010087116,
"grad_norm": 2.3467602506879786,
"learning_rate": 5.040030059789426e-06,
"loss": 0.476,
"step": 1195
},
{
"epoch": 0.5502063273727648,
"grad_norm": 2.570425940808593,
"learning_rate": 5e-06,
"loss": 0.4903,
"step": 1200
},
{
"epoch": 0.552498853736818,
"grad_norm": 2.5543989632263284,
"learning_rate": 4.9599699402105755e-06,
"loss": 0.4673,
"step": 1205
},
{
"epoch": 0.5547913801008711,
"grad_norm": 2.5213973685823277,
"learning_rate": 4.919942446238085e-06,
"loss": 0.4693,
"step": 1210
},
{
"epoch": 0.5570839064649243,
"grad_norm": 2.4952425404718075,
"learning_rate": 4.879920083734997e-06,
"loss": 0.4692,
"step": 1215
},
{
"epoch": 0.5593764328289775,
"grad_norm": 2.5419193115674776,
"learning_rate": 4.839905418024875e-06,
"loss": 0.4814,
"step": 1220
},
{
"epoch": 0.5616689591930307,
"grad_norm": 2.558303192571574,
"learning_rate": 4.7999010139379295e-06,
"loss": 0.4698,
"step": 1225
},
{
"epoch": 0.5639614855570839,
"grad_norm": 2.4678859101946315,
"learning_rate": 4.759909435646636e-06,
"loss": 0.4896,
"step": 1230
},
{
"epoch": 0.5662540119211371,
"grad_norm": 2.6716519633665783,
"learning_rate": 4.719933246501369e-06,
"loss": 0.4852,
"step": 1235
},
{
"epoch": 0.5685465382851903,
"grad_norm": 2.4330925797194807,
"learning_rate": 4.679975008866103e-06,
"loss": 0.4554,
"step": 1240
},
{
"epoch": 0.5708390646492435,
"grad_norm": 2.437937005459216,
"learning_rate": 4.640037283954165e-06,
"loss": 0.4598,
"step": 1245
},
{
"epoch": 0.5731315910132967,
"grad_norm": 2.413361545021729,
"learning_rate": 4.6001226316640804e-06,
"loss": 0.4739,
"step": 1250
},
{
"epoch": 0.5754241173773499,
"grad_norm": 2.3552453394422503,
"learning_rate": 4.5602336104154745e-06,
"loss": 0.4646,
"step": 1255
},
{
"epoch": 0.5777166437414031,
"grad_norm": 2.623470049632146,
"learning_rate": 4.520372776985101e-06,
"loss": 0.4579,
"step": 1260
},
{
"epoch": 0.5800091701054562,
"grad_norm": 2.4219278336672874,
"learning_rate": 4.480542686342946e-06,
"loss": 0.4613,
"step": 1265
},
{
"epoch": 0.5823016964695094,
"grad_norm": 2.517369439139374,
"learning_rate": 4.440745891488471e-06,
"loss": 0.4523,
"step": 1270
},
{
"epoch": 0.5845942228335625,
"grad_norm": 2.501700820037027,
"learning_rate": 4.400984943286965e-06,
"loss": 0.4671,
"step": 1275
},
{
"epoch": 0.5868867491976157,
"grad_norm": 2.4011689731614605,
"learning_rate": 4.361262390306049e-06,
"loss": 0.4527,
"step": 1280
},
{
"epoch": 0.5891792755616689,
"grad_norm": 2.5994696717863706,
"learning_rate": 4.321580778652316e-06,
"loss": 0.4493,
"step": 1285
},
{
"epoch": 0.5914718019257221,
"grad_norm": 2.491956972995198,
"learning_rate": 4.2819426518081265e-06,
"loss": 0.456,
"step": 1290
},
{
"epoch": 0.5937643282897753,
"grad_norm": 2.4353572047335996,
"learning_rate": 4.2423505504685894e-06,
"loss": 0.4611,
"step": 1295
},
{
"epoch": 0.5960568546538285,
"grad_norm": 2.4904358458702944,
"learning_rate": 4.202807012378707e-06,
"loss": 0.4546,
"step": 1300
},
{
"epoch": 0.5983493810178817,
"grad_norm": 2.4617619082762636,
"learning_rate": 4.163314572170704e-06,
"loss": 0.458,
"step": 1305
},
{
"epoch": 0.6006419073819349,
"grad_norm": 2.354023280982333,
"learning_rate": 4.123875761201576e-06,
"loss": 0.4433,
"step": 1310
},
{
"epoch": 0.6029344337459881,
"grad_norm": 2.540723557518342,
"learning_rate": 4.08449310739083e-06,
"loss": 0.4484,
"step": 1315
},
{
"epoch": 0.6052269601100413,
"grad_norm": 2.4043887566981446,
"learning_rate": 4.045169135058452e-06,
"loss": 0.4416,
"step": 1320
},
{
"epoch": 0.6075194864740945,
"grad_norm": 2.481355244310724,
"learning_rate": 4.0059063647631105e-06,
"loss": 0.4645,
"step": 1325
},
{
"epoch": 0.6098120128381477,
"grad_norm": 2.499493147862873,
"learning_rate": 3.966707313140587e-06,
"loss": 0.4542,
"step": 1330
},
{
"epoch": 0.6121045392022009,
"grad_norm": 2.5034183191594477,
"learning_rate": 3.927574492742473e-06,
"loss": 0.4465,
"step": 1335
},
{
"epoch": 0.6143970655662541,
"grad_norm": 2.450159706952634,
"learning_rate": 3.888510411875121e-06,
"loss": 0.4451,
"step": 1340
},
{
"epoch": 0.6166895919303071,
"grad_norm": 2.437273107870038,
"learning_rate": 3.849517574438866e-06,
"loss": 0.4393,
"step": 1345
},
{
"epoch": 0.6189821182943603,
"grad_norm": 2.4867270897195164,
"learning_rate": 3.8105984797675364e-06,
"loss": 0.4369,
"step": 1350
},
{
"epoch": 0.6212746446584135,
"grad_norm": 2.4474532182002156,
"learning_rate": 3.771755622468254e-06,
"loss": 0.4459,
"step": 1355
},
{
"epoch": 0.6235671710224667,
"grad_norm": 2.3883568752400737,
"learning_rate": 3.7329914922615283e-06,
"loss": 0.4414,
"step": 1360
},
{
"epoch": 0.6258596973865199,
"grad_norm": 2.323604786191338,
"learning_rate": 3.6943085738216855e-06,
"loss": 0.4294,
"step": 1365
},
{
"epoch": 0.6281522237505731,
"grad_norm": 2.5364327673030553,
"learning_rate": 3.655709346617593e-06,
"loss": 0.4482,
"step": 1370
},
{
"epoch": 0.6304447501146263,
"grad_norm": 2.528211312039227,
"learning_rate": 3.6171962847537466e-06,
"loss": 0.4483,
"step": 1375
},
{
"epoch": 0.6327372764786795,
"grad_norm": 2.4014535334880533,
"learning_rate": 3.5787718568116764e-06,
"loss": 0.4479,
"step": 1380
},
{
"epoch": 0.6350298028427327,
"grad_norm": 2.6961239350559687,
"learning_rate": 3.540438525691723e-06,
"loss": 0.4375,
"step": 1385
},
{
"epoch": 0.6373223292067859,
"grad_norm": 2.4568407427026027,
"learning_rate": 3.502198748455169e-06,
"loss": 0.4461,
"step": 1390
},
{
"epoch": 0.6396148555708391,
"grad_norm": 2.444432290321262,
"learning_rate": 3.464054976166753e-06,
"loss": 0.4409,
"step": 1395
},
{
"epoch": 0.6419073819348923,
"grad_norm": 2.3930367223498927,
"learning_rate": 3.4260096537375553e-06,
"loss": 0.433,
"step": 1400
},
{
"epoch": 0.6441999082989455,
"grad_norm": 2.431394532574176,
"learning_rate": 3.3880652197682974e-06,
"loss": 0.4229,
"step": 1405
},
{
"epoch": 0.6464924346629987,
"grad_norm": 2.434581693659057,
"learning_rate": 3.3502241063930196e-06,
"loss": 0.4389,
"step": 1410
},
{
"epoch": 0.6487849610270519,
"grad_norm": 2.3993499417107156,
"learning_rate": 3.3124887391232026e-06,
"loss": 0.4219,
"step": 1415
},
{
"epoch": 0.651077487391105,
"grad_norm": 2.476740652860741,
"learning_rate": 3.2748615366922864e-06,
"loss": 0.427,
"step": 1420
},
{
"epoch": 0.6533700137551581,
"grad_norm": 2.507048548706466,
"learning_rate": 3.2373449109006476e-06,
"loss": 0.4341,
"step": 1425
},
{
"epoch": 0.6556625401192113,
"grad_norm": 2.418497030941838,
"learning_rate": 3.1999412664609986e-06,
"loss": 0.4329,
"step": 1430
},
{
"epoch": 0.6579550664832645,
"grad_norm": 2.4312888314629144,
"learning_rate": 3.162653000844259e-06,
"loss": 0.4227,
"step": 1435
},
{
"epoch": 0.6602475928473177,
"grad_norm": 2.353877004261892,
"learning_rate": 3.1254825041258852e-06,
"loss": 0.4302,
"step": 1440
},
{
"epoch": 0.6625401192113709,
"grad_norm": 2.381814531488306,
"learning_rate": 3.0884321588326668e-06,
"loss": 0.4376,
"step": 1445
},
{
"epoch": 0.6648326455754241,
"grad_norm": 2.4501307973874287,
"learning_rate": 3.051504339790019e-06,
"loss": 0.4254,
"step": 1450
},
{
"epoch": 0.6671251719394773,
"grad_norm": 2.459251255110059,
"learning_rate": 3.0147014139697596e-06,
"loss": 0.4263,
"step": 1455
},
{
"epoch": 0.6694176983035305,
"grad_norm": 2.5254030222294466,
"learning_rate": 2.978025740338396e-06,
"loss": 0.4195,
"step": 1460
},
{
"epoch": 0.6717102246675837,
"grad_norm": 2.2951603398964235,
"learning_rate": 2.9414796697059155e-06,
"loss": 0.4129,
"step": 1465
},
{
"epoch": 0.6740027510316369,
"grad_norm": 2.364236291272217,
"learning_rate": 2.905065544575114e-06,
"loss": 0.4197,
"step": 1470
},
{
"epoch": 0.6762952773956901,
"grad_norm": 2.4601102682369205,
"learning_rate": 2.8687856989914393e-06,
"loss": 0.4234,
"step": 1475
},
{
"epoch": 0.6785878037597433,
"grad_norm": 2.686432591416178,
"learning_rate": 2.8326424583933878e-06,
"loss": 0.4223,
"step": 1480
},
{
"epoch": 0.6808803301237965,
"grad_norm": 2.3448228852350788,
"learning_rate": 2.796638139463456e-06,
"loss": 0.4149,
"step": 1485
},
{
"epoch": 0.6831728564878496,
"grad_norm": 2.317745266155718,
"learning_rate": 2.7607750499796426e-06,
"loss": 0.4161,
"step": 1490
},
{
"epoch": 0.6854653828519028,
"grad_norm": 2.3719922106424725,
"learning_rate": 2.725055488667522e-06,
"loss": 0.4275,
"step": 1495
},
{
"epoch": 0.687757909215956,
"grad_norm": 2.4553896347366746,
"learning_rate": 2.689481745052908e-06,
"loss": 0.3954,
"step": 1500
},
{
"epoch": 0.6900504355800092,
"grad_norm": 2.471280707724599,
"learning_rate": 2.6540560993151045e-06,
"loss": 0.408,
"step": 1505
},
{
"epoch": 0.6923429619440623,
"grad_norm": 2.375550619652342,
"learning_rate": 2.6187808221407433e-06,
"loss": 0.4091,
"step": 1510
},
{
"epoch": 0.6946354883081155,
"grad_norm": 2.3794291144670865,
"learning_rate": 2.5836581745782474e-06,
"loss": 0.4203,
"step": 1515
},
{
"epoch": 0.6969280146721687,
"grad_norm": 2.3959254909604133,
"learning_rate": 2.5486904078928954e-06,
"loss": 0.4019,
"step": 1520
},
{
"epoch": 0.6992205410362219,
"grad_norm": 2.4572132670378593,
"learning_rate": 2.5138797634225358e-06,
"loss": 0.4025,
"step": 1525
},
{
"epoch": 0.7015130674002751,
"grad_norm": 2.567664513817577,
"learning_rate": 2.4792284724339077e-06,
"loss": 0.4096,
"step": 1530
},
{
"epoch": 0.7038055937643283,
"grad_norm": 2.473854002398598,
"learning_rate": 2.4447387559796306e-06,
"loss": 0.4129,
"step": 1535
},
{
"epoch": 0.7060981201283815,
"grad_norm": 2.2347261984430844,
"learning_rate": 2.410412824755839e-06,
"loss": 0.4147,
"step": 1540
},
{
"epoch": 0.7083906464924347,
"grad_norm": 2.45007211279529,
"learning_rate": 2.3762528789604887e-06,
"loss": 0.4159,
"step": 1545
},
{
"epoch": 0.7106831728564879,
"grad_norm": 2.57319881552059,
"learning_rate": 2.3422611081523215e-06,
"loss": 0.4044,
"step": 1550
},
{
"epoch": 0.712975699220541,
"grad_norm": 2.40694698697041,
"learning_rate": 2.3084396911105233e-06,
"loss": 0.3888,
"step": 1555
},
{
"epoch": 0.7152682255845942,
"grad_norm": 2.6193951641238837,
"learning_rate": 2.274790795695071e-06,
"loss": 0.4186,
"step": 1560
},
{
"epoch": 0.7175607519486474,
"grad_norm": 2.3915420788033686,
"learning_rate": 2.2413165787077844e-06,
"loss": 0.4105,
"step": 1565
},
{
"epoch": 0.7198532783127006,
"grad_norm": 2.4922945082662706,
"learning_rate": 2.20801918575407e-06,
"loss": 0.41,
"step": 1570
},
{
"epoch": 0.7221458046767538,
"grad_norm": 2.361018492853961,
"learning_rate": 2.1749007511054005e-06,
"loss": 0.4075,
"step": 1575
},
{
"epoch": 0.724438331040807,
"grad_norm": 2.453915782459234,
"learning_rate": 2.1419633975625113e-06,
"loss": 0.4123,
"step": 1580
},
{
"epoch": 0.7267308574048602,
"grad_norm": 2.2558599145275458,
"learning_rate": 2.109209236319342e-06,
"loss": 0.3971,
"step": 1585
},
{
"epoch": 0.7290233837689133,
"grad_norm": 2.3942282865574103,
"learning_rate": 2.076640366827703e-06,
"loss": 0.4012,
"step": 1590
},
{
"epoch": 0.7313159101329665,
"grad_norm": 2.4100293351001714,
"learning_rate": 2.04425887666271e-06,
"loss": 0.3926,
"step": 1595
},
{
"epoch": 0.7336084364970197,
"grad_norm": 2.5693096989442927,
"learning_rate": 2.0120668413889877e-06,
"loss": 0.4021,
"step": 1600
},
{
"epoch": 0.7359009628610729,
"grad_norm": 2.513834629858347,
"learning_rate": 1.980066324427613e-06,
"loss": 0.3926,
"step": 1605
},
{
"epoch": 0.7381934892251261,
"grad_norm": 2.500502153829468,
"learning_rate": 1.9482593769238695e-06,
"loss": 0.3932,
"step": 1610
},
{
"epoch": 0.7404860155891793,
"grad_norm": 2.2943690678553827,
"learning_rate": 1.916648037615767e-06,
"loss": 0.3961,
"step": 1615
},
{
"epoch": 0.7427785419532325,
"grad_norm": 2.4947450845729904,
"learning_rate": 1.8852343327033717e-06,
"loss": 0.3918,
"step": 1620
},
{
"epoch": 0.7450710683172856,
"grad_norm": 2.475640064869192,
"learning_rate": 1.854020275718924e-06,
"loss": 0.3953,
"step": 1625
},
{
"epoch": 0.7473635946813388,
"grad_norm": 2.380898479266151,
"learning_rate": 1.8230078673977802e-06,
"loss": 0.3767,
"step": 1630
},
{
"epoch": 0.749656121045392,
"grad_norm": 2.3124836007659444,
"learning_rate": 1.7921990955501705e-06,
"loss": 0.386,
"step": 1635
},
{
"epoch": 0.7519486474094452,
"grad_norm": 2.3942291132445375,
"learning_rate": 1.7615959349337914e-06,
"loss": 0.3964,
"step": 1640
},
{
"epoch": 0.7542411737734984,
"grad_norm": 2.4125792225674614,
"learning_rate": 1.731200347127217e-06,
"loss": 0.3918,
"step": 1645
},
{
"epoch": 0.7565337001375516,
"grad_norm": 2.4570540617910788,
"learning_rate": 1.7010142804041785e-06,
"loss": 0.4012,
"step": 1650
},
{
"epoch": 0.7588262265016048,
"grad_norm": 2.3060832536528006,
"learning_rate": 1.6710396696086768e-06,
"loss": 0.4026,
"step": 1655
},
{
"epoch": 0.761118752865658,
"grad_norm": 2.357410070095031,
"learning_rate": 1.6412784360309753e-06,
"loss": 0.3876,
"step": 1660
},
{
"epoch": 0.7634112792297112,
"grad_norm": 2.5569987658890434,
"learning_rate": 1.611732487284437e-06,
"loss": 0.3875,
"step": 1665
},
{
"epoch": 0.7657038055937643,
"grad_norm": 2.5367416684876805,
"learning_rate": 1.5824037171832595e-06,
"loss": 0.3923,
"step": 1670
},
{
"epoch": 0.7679963319578175,
"grad_norm": 2.370553404803813,
"learning_rate": 1.5532940056210882e-06,
"loss": 0.3916,
"step": 1675
},
{
"epoch": 0.7702888583218707,
"grad_norm": 2.445473374507484,
"learning_rate": 1.524405218450517e-06,
"loss": 0.4005,
"step": 1680
},
{
"epoch": 0.7725813846859239,
"grad_norm": 2.416383451707918,
"learning_rate": 1.4957392073634912e-06,
"loss": 0.385,
"step": 1685
},
{
"epoch": 0.774873911049977,
"grad_norm": 2.4307180782279976,
"learning_rate": 1.4672978097726204e-06,
"loss": 0.3857,
"step": 1690
},
{
"epoch": 0.7771664374140302,
"grad_norm": 2.4572760495599795,
"learning_rate": 1.439082848693406e-06,
"loss": 0.3916,
"step": 1695
},
{
"epoch": 0.7794589637780834,
"grad_norm": 2.408412846059606,
"learning_rate": 1.4110961326273936e-06,
"loss": 0.3908,
"step": 1700
},
{
"epoch": 0.7817514901421366,
"grad_norm": 2.6601098763821596,
"learning_rate": 1.3833394554462477e-06,
"loss": 0.3859,
"step": 1705
},
{
"epoch": 0.7840440165061898,
"grad_norm": 2.520675032421566,
"learning_rate": 1.35581459627677e-06,
"loss": 0.3936,
"step": 1710
},
{
"epoch": 0.786336542870243,
"grad_norm": 2.257467358094596,
"learning_rate": 1.3285233193868663e-06,
"loss": 0.3799,
"step": 1715
},
{
"epoch": 0.7886290692342962,
"grad_norm": 2.327829634660073,
"learning_rate": 1.3014673740724615e-06,
"loss": 0.3876,
"step": 1720
},
{
"epoch": 0.7909215955983494,
"grad_norm": 2.366347981314184,
"learning_rate": 1.2746484945453691e-06,
"loss": 0.3829,
"step": 1725
},
{
"epoch": 0.7932141219624026,
"grad_norm": 2.391058577508851,
"learning_rate": 1.2480683998221365e-06,
"loss": 0.3825,
"step": 1730
},
{
"epoch": 0.7955066483264558,
"grad_norm": 2.470899547865623,
"learning_rate": 1.221728793613865e-06,
"loss": 0.3895,
"step": 1735
},
{
"epoch": 0.797799174690509,
"grad_norm": 2.399551521415764,
"learning_rate": 1.1956313642169974e-06,
"loss": 0.3846,
"step": 1740
},
{
"epoch": 0.8000917010545622,
"grad_norm": 2.463312952219633,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3788,
"step": 1745
},
{
"epoch": 0.8023842274186153,
"grad_norm": 2.4348320894873092,
"learning_rate": 1.1441697113216893e-06,
"loss": 0.3803,
"step": 1750
},
{
"epoch": 0.8046767537826685,
"grad_norm": 2.385545108416876,
"learning_rate": 1.1188087863739173e-06,
"loss": 0.3859,
"step": 1755
},
{
"epoch": 0.8069692801467216,
"grad_norm": 2.4484362721344195,
"learning_rate": 1.0936966351274554e-06,
"loss": 0.3739,
"step": 1760
},
{
"epoch": 0.8092618065107748,
"grad_norm": 2.4361451039130317,
"learning_rate": 1.0688348672022547e-06,
"loss": 0.4012,
"step": 1765
},
{
"epoch": 0.811554332874828,
"grad_norm": 2.5671935693516192,
"learning_rate": 1.0442250761693829e-06,
"loss": 0.3717,
"step": 1770
},
{
"epoch": 0.8138468592388812,
"grad_norm": 2.3910678127476475,
"learning_rate": 1.0198688394488837e-06,
"loss": 0.3824,
"step": 1775
},
{
"epoch": 0.8161393856029344,
"grad_norm": 2.4337476998865237,
"learning_rate": 9.957677182086611e-07,
"loss": 0.3754,
"step": 1780
},
{
"epoch": 0.8184319119669876,
"grad_norm": 2.3930303860053055,
"learning_rate": 9.719232572644189e-07,
"loss": 0.3814,
"step": 1785
},
{
"epoch": 0.8207244383310408,
"grad_norm": 2.4070725664187194,
"learning_rate": 9.483369849806401e-07,
"loss": 0.3681,
"step": 1790
},
{
"epoch": 0.823016964695094,
"grad_norm": 2.4234654890940277,
"learning_rate": 9.250104131726256e-07,
"loss": 0.3748,
"step": 1795
},
{
"epoch": 0.8253094910591472,
"grad_norm": 2.4405075201633486,
"learning_rate": 9.019450370095867e-07,
"loss": 0.3852,
"step": 1800
},
{
"epoch": 0.8276020174232004,
"grad_norm": 2.4157009817816535,
"learning_rate": 8.791423349188111e-07,
"loss": 0.3738,
"step": 1805
},
{
"epoch": 0.8298945437872536,
"grad_norm": 2.3817117068747695,
"learning_rate": 8.566037684908985e-07,
"loss": 0.3774,
"step": 1810
},
{
"epoch": 0.8321870701513068,
"grad_norm": 2.643862606121901,
"learning_rate": 8.343307823860819e-07,
"loss": 0.3747,
"step": 1815
},
{
"epoch": 0.83447959651536,
"grad_norm": 2.415451666660326,
"learning_rate": 8.123248042416209e-07,
"loss": 0.3807,
"step": 1820
},
{
"epoch": 0.8367721228794132,
"grad_norm": 2.367699275816763,
"learning_rate": 7.905872445802976e-07,
"loss": 0.3819,
"step": 1825
},
{
"epoch": 0.8390646492434664,
"grad_norm": 2.401428866906129,
"learning_rate": 7.691194967200099e-07,
"loss": 0.3773,
"step": 1830
},
{
"epoch": 0.8413571756075194,
"grad_norm": 2.3851132017870444,
"learning_rate": 7.47922936684457e-07,
"loss": 0.3848,
"step": 1835
},
{
"epoch": 0.8436497019715726,
"grad_norm": 2.334920050986847,
"learning_rate": 7.269989231149432e-07,
"loss": 0.3646,
"step": 1840
},
{
"epoch": 0.8459422283356258,
"grad_norm": 2.302533584527464,
"learning_rate": 7.063487971832922e-07,
"loss": 0.3719,
"step": 1845
},
{
"epoch": 0.848234754699679,
"grad_norm": 2.4631469089449443,
"learning_rate": 6.85973882505886e-07,
"loss": 0.3951,
"step": 1850
},
{
"epoch": 0.8505272810637322,
"grad_norm": 2.4860937019904426,
"learning_rate": 6.658754850588161e-07,
"loss": 0.3877,
"step": 1855
},
{
"epoch": 0.8528198074277854,
"grad_norm": 2.366824744001058,
"learning_rate": 6.460548930941801e-07,
"loss": 0.3711,
"step": 1860
},
{
"epoch": 0.8551123337918386,
"grad_norm": 2.587488334709295,
"learning_rate": 6.265133770575066e-07,
"loss": 0.366,
"step": 1865
},
{
"epoch": 0.8574048601558918,
"grad_norm": 2.4606917803825072,
"learning_rate": 6.072521895063255e-07,
"loss": 0.3818,
"step": 1870
},
{
"epoch": 0.859697386519945,
"grad_norm": 2.4967563072720576,
"learning_rate": 5.882725650298787e-07,
"loss": 0.3804,
"step": 1875
},
{
"epoch": 0.8619899128839982,
"grad_norm": 2.4902108475668214,
"learning_rate": 5.695757201699875e-07,
"loss": 0.3751,
"step": 1880
},
{
"epoch": 0.8642824392480514,
"grad_norm": 2.3545990508632713,
"learning_rate": 5.511628533430769e-07,
"loss": 0.3887,
"step": 1885
},
{
"epoch": 0.8665749656121046,
"grad_norm": 2.4583864322248363,
"learning_rate": 5.330351447633603e-07,
"loss": 0.3846,
"step": 1890
},
{
"epoch": 0.8688674919761578,
"grad_norm": 2.558178264129578,
"learning_rate": 5.151937563671889e-07,
"loss": 0.3761,
"step": 1895
},
{
"epoch": 0.871160018340211,
"grad_norm": 2.4125538046249133,
"learning_rate": 4.976398317385767e-07,
"loss": 0.3789,
"step": 1900
},
{
"epoch": 0.8734525447042641,
"grad_norm": 2.5261586438137718,
"learning_rate": 4.803744960358992e-07,
"loss": 0.3692,
"step": 1905
},
{
"epoch": 0.8757450710683173,
"grad_norm": 2.5343814063203913,
"learning_rate": 4.633988559197761e-07,
"loss": 0.3741,
"step": 1910
},
{
"epoch": 0.8780375974323704,
"grad_norm": 2.5455270767430305,
"learning_rate": 4.4671399948213233e-07,
"loss": 0.3742,
"step": 1915
},
{
"epoch": 0.8803301237964236,
"grad_norm": 2.4299267640638442,
"learning_rate": 4.3032099617645874e-07,
"loss": 0.3793,
"step": 1920
},
{
"epoch": 0.8826226501604768,
"grad_norm": 2.5350282869807215,
"learning_rate": 4.1422089674926113e-07,
"loss": 0.3708,
"step": 1925
},
{
"epoch": 0.88491517652453,
"grad_norm": 2.4052098639642745,
"learning_rate": 3.984147331727128e-07,
"loss": 0.3815,
"step": 1930
},
{
"epoch": 0.8872077028885832,
"grad_norm": 2.440029806154777,
"learning_rate": 3.829035185785035e-07,
"loss": 0.3559,
"step": 1935
},
{
"epoch": 0.8895002292526364,
"grad_norm": 2.4757422584836783,
"learning_rate": 3.676882471929044e-07,
"loss": 0.3724,
"step": 1940
},
{
"epoch": 0.8917927556166896,
"grad_norm": 2.405181037542438,
"learning_rate": 3.527698942730384e-07,
"loss": 0.3678,
"step": 1945
},
{
"epoch": 0.8940852819807428,
"grad_norm": 2.477077740628022,
"learning_rate": 3.3814941604437155e-07,
"loss": 0.3696,
"step": 1950
},
{
"epoch": 0.896377808344796,
"grad_norm": 2.594645970360135,
"learning_rate": 3.2382774963941823e-07,
"loss": 0.3689,
"step": 1955
},
{
"epoch": 0.8986703347088492,
"grad_norm": 2.4996924524050526,
"learning_rate": 3.0980581303767576e-07,
"loss": 0.3641,
"step": 1960
},
{
"epoch": 0.9009628610729024,
"grad_norm": 2.5351766412057364,
"learning_rate": 2.9608450500678566e-07,
"loss": 0.3736,
"step": 1965
},
{
"epoch": 0.9032553874369555,
"grad_norm": 2.4812985119515374,
"learning_rate": 2.826647050449216e-07,
"loss": 0.3652,
"step": 1970
},
{
"epoch": 0.9055479138010087,
"grad_norm": 2.4498300583099506,
"learning_rate": 2.69547273324417e-07,
"loss": 0.3653,
"step": 1975
},
{
"epoch": 0.9078404401650619,
"grad_norm": 2.546961383266402,
"learning_rate": 2.5673305063663335e-07,
"loss": 0.3723,
"step": 1980
},
{
"epoch": 0.9101329665291151,
"grad_norm": 2.34777660611532,
"learning_rate": 2.442228583380646e-07,
"loss": 0.3596,
"step": 1985
},
{
"epoch": 0.9124254928931683,
"grad_norm": 2.410870301241545,
"learning_rate": 2.3201749829769083e-07,
"loss": 0.3783,
"step": 1990
},
{
"epoch": 0.9147180192572214,
"grad_norm": 2.519326020963244,
"learning_rate": 2.201177528455828e-07,
"loss": 0.3739,
"step": 1995
},
{
"epoch": 0.9170105456212746,
"grad_norm": 2.4872058403028574,
"learning_rate": 2.085243847227525e-07,
"loss": 0.3768,
"step": 2000
},
{
"epoch": 0.9193030719853278,
"grad_norm": 2.4175176965392544,
"learning_rate": 1.9723813703227013e-07,
"loss": 0.3794,
"step": 2005
},
{
"epoch": 0.921595598349381,
"grad_norm": 2.514035461894725,
"learning_rate": 1.8625973319162605e-07,
"loss": 0.3656,
"step": 2010
},
{
"epoch": 0.9238881247134342,
"grad_norm": 2.4532676789082166,
"learning_rate": 1.7558987688636675e-07,
"loss": 0.361,
"step": 2015
},
{
"epoch": 0.9261806510774874,
"grad_norm": 2.580005311393483,
"learning_rate": 1.652292520249865e-07,
"loss": 0.369,
"step": 2020
},
{
"epoch": 0.9284731774415406,
"grad_norm": 2.359368965829793,
"learning_rate": 1.5517852269509692e-07,
"loss": 0.3571,
"step": 2025
},
{
"epoch": 0.9307657038055938,
"grad_norm": 2.4993672807867178,
"learning_rate": 1.4543833312085365e-07,
"loss": 0.3588,
"step": 2030
},
{
"epoch": 0.933058230169647,
"grad_norm": 2.41149322411576,
"learning_rate": 1.360093076216673e-07,
"loss": 0.3705,
"step": 2035
},
{
"epoch": 0.9353507565337001,
"grad_norm": 2.474736948512413,
"learning_rate": 1.2689205057218602e-07,
"loss": 0.361,
"step": 2040
},
{
"epoch": 0.9376432828977533,
"grad_norm": 2.3336360044904736,
"learning_rate": 1.1808714636355634e-07,
"loss": 0.3568,
"step": 2045
},
{
"epoch": 0.9399358092618065,
"grad_norm": 2.566200023951429,
"learning_rate": 1.0959515936596387e-07,
"loss": 0.3783,
"step": 2050
},
{
"epoch": 0.9422283356258597,
"grad_norm": 2.5160190954507264,
"learning_rate": 1.014166338924627e-07,
"loss": 0.372,
"step": 2055
},
{
"epoch": 0.9445208619899129,
"grad_norm": 2.509256348018165,
"learning_rate": 9.355209416408051e-08,
"loss": 0.3853,
"step": 2060
},
{
"epoch": 0.9468133883539661,
"grad_norm": 2.5224442995349152,
"learning_rate": 8.600204427622438e-08,
"loss": 0.365,
"step": 2065
},
{
"epoch": 0.9491059147180193,
"grad_norm": 2.4001792608745602,
"learning_rate": 7.876696816636276e-08,
"loss": 0.3736,
"step": 2070
},
{
"epoch": 0.9513984410820725,
"grad_norm": 2.4422332203602553,
"learning_rate": 7.184732958301078e-08,
"loss": 0.3651,
"step": 2075
},
{
"epoch": 0.9536909674461256,
"grad_norm": 2.471890892444078,
"learning_rate": 6.524357205600518e-08,
"loss": 0.3624,
"step": 2080
},
{
"epoch": 0.9559834938101788,
"grad_norm": 2.523417346804641,
"learning_rate": 5.895611886807317e-08,
"loss": 0.369,
"step": 2085
},
{
"epoch": 0.958276020174232,
"grad_norm": 2.4584360575665776,
"learning_rate": 5.2985373027702455e-08,
"loss": 0.363,
"step": 2090
},
{
"epoch": 0.9605685465382852,
"grad_norm": 2.467603595232153,
"learning_rate": 4.733171724330854e-08,
"loss": 0.3814,
"step": 2095
},
{
"epoch": 0.9628610729023384,
"grad_norm": 2.5238201533198072,
"learning_rate": 4.19955138987066e-08,
"loss": 0.369,
"step": 2100
},
{
"epoch": 0.9651535992663915,
"grad_norm": 2.5600424647957807,
"learning_rate": 3.697710502988006e-08,
"loss": 0.3652,
"step": 2105
},
{
"epoch": 0.9674461256304447,
"grad_norm": 2.475992842961113,
"learning_rate": 3.2276812303060346e-08,
"loss": 0.3741,
"step": 2110
},
{
"epoch": 0.9697386519944979,
"grad_norm": 2.4735410644370606,
"learning_rate": 2.7894936994106724e-08,
"loss": 0.3571,
"step": 2115
},
{
"epoch": 0.9720311783585511,
"grad_norm": 2.384962513457078,
"learning_rate": 2.383175996919673e-08,
"loss": 0.3654,
"step": 2120
},
{
"epoch": 0.9743237047226043,
"grad_norm": 2.4369560907719414,
"learning_rate": 2.008754166682225e-08,
"loss": 0.3614,
"step": 2125
},
{
"epoch": 0.9766162310866575,
"grad_norm": 2.334334624814976,
"learning_rate": 1.6662522081097308e-08,
"loss": 0.3598,
"step": 2130
},
{
"epoch": 0.9789087574507107,
"grad_norm": 2.515966550970349,
"learning_rate": 1.3556920746373714e-08,
"loss": 0.3539,
"step": 2135
},
{
"epoch": 0.9812012838147639,
"grad_norm": 2.4578356166282704,
"learning_rate": 1.0770936723171199e-08,
"loss": 0.3684,
"step": 2140
},
{
"epoch": 0.9834938101788171,
"grad_norm": 2.534561019356648,
"learning_rate": 8.304748585417077e-09,
"loss": 0.3629,
"step": 2145
},
{
"epoch": 0.9857863365428703,
"grad_norm": 2.4815228224834254,
"learning_rate": 6.158514409000393e-09,
"loss": 0.3617,
"step": 2150
},
{
"epoch": 0.9880788629069235,
"grad_norm": 2.520302407708297,
"learning_rate": 4.332371761638921e-09,
"loss": 0.3716,
"step": 2155
},
{
"epoch": 0.9903713892709766,
"grad_norm": 2.939805778253569,
"learning_rate": 2.8264376940634332e-09,
"loss": 0.3685,
"step": 2160
},
{
"epoch": 0.9926639156350298,
"grad_norm": 2.6736093020039484,
"learning_rate": 1.640808732513155e-09,
"loss": 0.3724,
"step": 2165
},
{
"epoch": 0.994956441999083,
"grad_norm": 2.3884833213363144,
"learning_rate": 7.755608725490415e-10,
"loss": 0.354,
"step": 2170
},
{
"epoch": 0.9972489683631361,
"grad_norm": 2.378189457774983,
"learning_rate": 2.307495741843413e-10,
"loss": 0.356,
"step": 2175
},
{
"epoch": 0.9995414947271893,
"grad_norm": 2.543796138070999,
"learning_rate": 6.4097583263311725e-12,
"loss": 0.3664,
"step": 2180
},
{
"epoch": 1.0,
"eval_runtime": 2.6844,
"eval_samples_per_second": 3.725,
"eval_steps_per_second": 1.118,
"step": 2181
},
{
"epoch": 1.0,
"step": 2181,
"total_flos": 228328514519040.0,
"train_loss": 0.0,
"train_runtime": 0.0089,
"train_samples_per_second": 3927873.864,
"train_steps_per_second": 245597.686
}
],
"logging_steps": 5,
"max_steps": 2181,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 228328514519040.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}