{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.751165371809101,
"eval_steps": 141,
"global_step": 423,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017758046614872365,
"grad_norm": 0.40501952171325684,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1387,
"step": 1
},
{
"epoch": 0.0017758046614872365,
"eval_loss": 1.4082584381103516,
"eval_runtime": 167.7664,
"eval_samples_per_second": 5.657,
"eval_steps_per_second": 1.419,
"step": 1
},
{
"epoch": 0.003551609322974473,
"grad_norm": 0.491682767868042,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2151,
"step": 2
},
{
"epoch": 0.005327413984461709,
"grad_norm": 0.49752455949783325,
"learning_rate": 1.2e-05,
"loss": 1.1941,
"step": 3
},
{
"epoch": 0.007103218645948946,
"grad_norm": 0.5617953538894653,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.2472,
"step": 4
},
{
"epoch": 0.008879023307436182,
"grad_norm": 0.646000862121582,
"learning_rate": 2e-05,
"loss": 1.2767,
"step": 5
},
{
"epoch": 0.010654827968923418,
"grad_norm": 0.6190630197525024,
"learning_rate": 2.4e-05,
"loss": 1.2839,
"step": 6
},
{
"epoch": 0.012430632630410655,
"grad_norm": 0.6891798973083496,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.2914,
"step": 7
},
{
"epoch": 0.014206437291897892,
"grad_norm": 0.6742885708808899,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.3001,
"step": 8
},
{
"epoch": 0.01598224195338513,
"grad_norm": 0.693493664264679,
"learning_rate": 3.6e-05,
"loss": 1.2673,
"step": 9
},
{
"epoch": 0.017758046614872364,
"grad_norm": 0.7951493859291077,
"learning_rate": 4e-05,
"loss": 1.3314,
"step": 10
},
{
"epoch": 0.0195338512763596,
"grad_norm": 0.7866435050964355,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.2703,
"step": 11
},
{
"epoch": 0.021309655937846835,
"grad_norm": 0.7218112349510193,
"learning_rate": 4.8e-05,
"loss": 1.2542,
"step": 12
},
{
"epoch": 0.023085460599334074,
"grad_norm": 0.6838662028312683,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.2432,
"step": 13
},
{
"epoch": 0.02486126526082131,
"grad_norm": 0.6592800617218018,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.2374,
"step": 14
},
{
"epoch": 0.026637069922308545,
"grad_norm": 0.513134241104126,
"learning_rate": 6e-05,
"loss": 1.246,
"step": 15
},
{
"epoch": 0.028412874583795784,
"grad_norm": 0.5785119533538818,
"learning_rate": 6.400000000000001e-05,
"loss": 1.169,
"step": 16
},
{
"epoch": 0.03018867924528302,
"grad_norm": 0.6144536733627319,
"learning_rate": 6.800000000000001e-05,
"loss": 1.1532,
"step": 17
},
{
"epoch": 0.03196448390677026,
"grad_norm": 0.674633800983429,
"learning_rate": 7.2e-05,
"loss": 1.1175,
"step": 18
},
{
"epoch": 0.03374028856825749,
"grad_norm": 0.5997682809829712,
"learning_rate": 7.6e-05,
"loss": 1.092,
"step": 19
},
{
"epoch": 0.03551609322974473,
"grad_norm": 0.5651845335960388,
"learning_rate": 8e-05,
"loss": 1.0543,
"step": 20
},
{
"epoch": 0.03729189789123197,
"grad_norm": 0.562713623046875,
"learning_rate": 8.4e-05,
"loss": 1.0377,
"step": 21
},
{
"epoch": 0.0390677025527192,
"grad_norm": 0.5826591849327087,
"learning_rate": 8.800000000000001e-05,
"loss": 1.0178,
"step": 22
},
{
"epoch": 0.04084350721420644,
"grad_norm": 0.5972415208816528,
"learning_rate": 9.200000000000001e-05,
"loss": 0.9916,
"step": 23
},
{
"epoch": 0.04261931187569367,
"grad_norm": 0.6266159415245056,
"learning_rate": 9.6e-05,
"loss": 1.0026,
"step": 24
},
{
"epoch": 0.04439511653718091,
"grad_norm": 0.7757481932640076,
"learning_rate": 0.0001,
"loss": 1.0245,
"step": 25
},
{
"epoch": 0.04617092119866815,
"grad_norm": 0.6832363605499268,
"learning_rate": 0.00010400000000000001,
"loss": 0.9408,
"step": 26
},
{
"epoch": 0.04794672586015538,
"grad_norm": 1.2983894348144531,
"learning_rate": 0.00010800000000000001,
"loss": 0.9228,
"step": 27
},
{
"epoch": 0.04972253052164262,
"grad_norm": 0.9382829666137695,
"learning_rate": 0.00011200000000000001,
"loss": 0.9605,
"step": 28
},
{
"epoch": 0.05149833518312986,
"grad_norm": 0.5051376819610596,
"learning_rate": 0.000116,
"loss": 0.9769,
"step": 29
},
{
"epoch": 0.05327413984461709,
"grad_norm": 0.40853050351142883,
"learning_rate": 0.00012,
"loss": 0.8912,
"step": 30
},
{
"epoch": 0.05504994450610433,
"grad_norm": 0.4261438846588135,
"learning_rate": 0.000124,
"loss": 0.9438,
"step": 31
},
{
"epoch": 0.05682574916759157,
"grad_norm": 0.44900333881378174,
"learning_rate": 0.00012800000000000002,
"loss": 0.9589,
"step": 32
},
{
"epoch": 0.0586015538290788,
"grad_norm": 0.4262010157108307,
"learning_rate": 0.000132,
"loss": 0.8775,
"step": 33
},
{
"epoch": 0.06037735849056604,
"grad_norm": 0.40672022104263306,
"learning_rate": 0.00013600000000000003,
"loss": 0.8956,
"step": 34
},
{
"epoch": 0.06215316315205328,
"grad_norm": 0.39336153864860535,
"learning_rate": 0.00014,
"loss": 0.866,
"step": 35
},
{
"epoch": 0.06392896781354052,
"grad_norm": 0.40699368715286255,
"learning_rate": 0.000144,
"loss": 0.8612,
"step": 36
},
{
"epoch": 0.06570477247502775,
"grad_norm": 0.438643217086792,
"learning_rate": 0.000148,
"loss": 0.922,
"step": 37
},
{
"epoch": 0.06748057713651498,
"grad_norm": 0.45053544640541077,
"learning_rate": 0.000152,
"loss": 0.8681,
"step": 38
},
{
"epoch": 0.06925638179800223,
"grad_norm": 0.4289852976799011,
"learning_rate": 0.00015600000000000002,
"loss": 0.9071,
"step": 39
},
{
"epoch": 0.07103218645948946,
"grad_norm": 0.4101032316684723,
"learning_rate": 0.00016,
"loss": 0.8692,
"step": 40
},
{
"epoch": 0.07280799112097669,
"grad_norm": 0.418319433927536,
"learning_rate": 0.000164,
"loss": 0.8654,
"step": 41
},
{
"epoch": 0.07458379578246394,
"grad_norm": 0.41637811064720154,
"learning_rate": 0.000168,
"loss": 0.8348,
"step": 42
},
{
"epoch": 0.07635960044395117,
"grad_norm": 0.40830302238464355,
"learning_rate": 0.000172,
"loss": 0.8928,
"step": 43
},
{
"epoch": 0.0781354051054384,
"grad_norm": 0.4163912236690521,
"learning_rate": 0.00017600000000000002,
"loss": 0.8793,
"step": 44
},
{
"epoch": 0.07991120976692564,
"grad_norm": 0.4240954518318176,
"learning_rate": 0.00018,
"loss": 0.9029,
"step": 45
},
{
"epoch": 0.08168701442841288,
"grad_norm": 0.48420408368110657,
"learning_rate": 0.00018400000000000003,
"loss": 0.8632,
"step": 46
},
{
"epoch": 0.08346281908990011,
"grad_norm": 0.5267483592033386,
"learning_rate": 0.000188,
"loss": 0.8575,
"step": 47
},
{
"epoch": 0.08523862375138734,
"grad_norm": 0.4947332441806793,
"learning_rate": 0.000192,
"loss": 0.9051,
"step": 48
},
{
"epoch": 0.08701442841287459,
"grad_norm": 0.5025691986083984,
"learning_rate": 0.000196,
"loss": 0.9145,
"step": 49
},
{
"epoch": 0.08879023307436182,
"grad_norm": 0.5430313944816589,
"learning_rate": 0.0002,
"loss": 0.8954,
"step": 50
},
{
"epoch": 0.09056603773584905,
"grad_norm": 0.45721662044525146,
"learning_rate": 0.00019999812486015523,
"loss": 0.9655,
"step": 51
},
{
"epoch": 0.0923418423973363,
"grad_norm": 0.4364672899246216,
"learning_rate": 0.00019999249951094388,
"loss": 0.9318,
"step": 52
},
{
"epoch": 0.09411764705882353,
"grad_norm": 0.38933759927749634,
"learning_rate": 0.00019998312416333227,
"loss": 0.8963,
"step": 53
},
{
"epoch": 0.09589345172031076,
"grad_norm": 0.35572728514671326,
"learning_rate": 0.0001999699991689222,
"loss": 0.9073,
"step": 54
},
{
"epoch": 0.097669256381798,
"grad_norm": 0.3042948544025421,
"learning_rate": 0.00019995312501993765,
"loss": 0.8751,
"step": 55
},
{
"epoch": 0.09944506104328524,
"grad_norm": 0.32266151905059814,
"learning_rate": 0.00019993250234920636,
"loss": 0.8493,
"step": 56
},
{
"epoch": 0.10122086570477247,
"grad_norm": 0.31894031167030334,
"learning_rate": 0.00019990813193013625,
"loss": 0.8512,
"step": 57
},
{
"epoch": 0.10299667036625972,
"grad_norm": 0.33073991537094116,
"learning_rate": 0.0001998800146766861,
"loss": 0.8424,
"step": 58
},
{
"epoch": 0.10477247502774695,
"grad_norm": 0.32064828276634216,
"learning_rate": 0.00019984815164333163,
"loss": 0.8698,
"step": 59
},
{
"epoch": 0.10654827968923418,
"grad_norm": 0.3364376425743103,
"learning_rate": 0.00019981254402502566,
"loss": 0.8525,
"step": 60
},
{
"epoch": 0.10832408435072143,
"grad_norm": 0.31403639912605286,
"learning_rate": 0.0001997731931571535,
"loss": 0.8309,
"step": 61
},
{
"epoch": 0.11009988901220866,
"grad_norm": 0.3375100791454315,
"learning_rate": 0.00019973010051548275,
"loss": 0.8573,
"step": 62
},
{
"epoch": 0.11187569367369589,
"grad_norm": 0.3584939241409302,
"learning_rate": 0.00019968326771610797,
"loss": 0.8479,
"step": 63
},
{
"epoch": 0.11365149833518313,
"grad_norm": 0.35480472445487976,
"learning_rate": 0.00019963269651539017,
"loss": 0.845,
"step": 64
},
{
"epoch": 0.11542730299667037,
"grad_norm": 0.33250972628593445,
"learning_rate": 0.00019957838880989078,
"loss": 0.8438,
"step": 65
},
{
"epoch": 0.1172031076581576,
"grad_norm": 0.39302438497543335,
"learning_rate": 0.00019952034663630062,
"loss": 0.8391,
"step": 66
},
{
"epoch": 0.11897891231964484,
"grad_norm": 0.3517158031463623,
"learning_rate": 0.00019945857217136363,
"loss": 0.7966,
"step": 67
},
{
"epoch": 0.12075471698113208,
"grad_norm": 0.38860467076301575,
"learning_rate": 0.00019939306773179497,
"loss": 0.8279,
"step": 68
},
{
"epoch": 0.12253052164261931,
"grad_norm": 0.3762984573841095,
"learning_rate": 0.00019932383577419432,
"loss": 0.7848,
"step": 69
},
{
"epoch": 0.12430632630410655,
"grad_norm": 0.4535103440284729,
"learning_rate": 0.00019925087889495374,
"loss": 0.8,
"step": 70
},
{
"epoch": 0.12608213096559379,
"grad_norm": 0.4869844317436218,
"learning_rate": 0.00019917419983016025,
"loss": 0.8442,
"step": 71
},
{
"epoch": 0.12785793562708103,
"grad_norm": 0.4379689395427704,
"learning_rate": 0.00019909380145549324,
"loss": 0.8353,
"step": 72
},
{
"epoch": 0.12963374028856825,
"grad_norm": 0.39510270953178406,
"learning_rate": 0.00019900968678611666,
"loss": 0.8538,
"step": 73
},
{
"epoch": 0.1314095449500555,
"grad_norm": 0.4764181971549988,
"learning_rate": 0.00019892185897656578,
"loss": 0.8509,
"step": 74
},
{
"epoch": 0.13318534961154274,
"grad_norm": 0.5591267347335815,
"learning_rate": 0.00019883032132062925,
"loss": 0.8661,
"step": 75
},
{
"epoch": 0.13496115427302996,
"grad_norm": 0.41077056527137756,
"learning_rate": 0.00019873507725122504,
"loss": 0.9418,
"step": 76
},
{
"epoch": 0.1367369589345172,
"grad_norm": 0.393622487783432,
"learning_rate": 0.00019863613034027224,
"loss": 0.926,
"step": 77
},
{
"epoch": 0.13851276359600445,
"grad_norm": 0.36414071917533875,
"learning_rate": 0.00019853348429855672,
"loss": 0.8649,
"step": 78
},
{
"epoch": 0.14028856825749167,
"grad_norm": 0.3100601136684418,
"learning_rate": 0.00019842714297559213,
"loss": 0.9114,
"step": 79
},
{
"epoch": 0.14206437291897892,
"grad_norm": 0.29151105880737305,
"learning_rate": 0.0001983171103594755,
"loss": 0.8681,
"step": 80
},
{
"epoch": 0.14384017758046616,
"grad_norm": 0.28398221731185913,
"learning_rate": 0.0001982033905767377,
"loss": 0.8515,
"step": 81
},
{
"epoch": 0.14561598224195338,
"grad_norm": 0.2883840799331665,
"learning_rate": 0.00019808598789218865,
"loss": 0.8569,
"step": 82
},
{
"epoch": 0.14739178690344062,
"grad_norm": 0.29812031984329224,
"learning_rate": 0.0001979649067087574,
"loss": 0.8529,
"step": 83
},
{
"epoch": 0.14916759156492787,
"grad_norm": 0.3074108958244324,
"learning_rate": 0.00019784015156732693,
"loss": 0.8771,
"step": 84
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.3601110279560089,
"learning_rate": 0.000197711727146564,
"loss": 0.8609,
"step": 85
},
{
"epoch": 0.15271920088790233,
"grad_norm": 0.3126521110534668,
"learning_rate": 0.00019757963826274357,
"loss": 0.8162,
"step": 86
},
{
"epoch": 0.15449500554938958,
"grad_norm": 0.3152073323726654,
"learning_rate": 0.00019744388986956822,
"loss": 0.7661,
"step": 87
},
{
"epoch": 0.1562708102108768,
"grad_norm": 0.33570149540901184,
"learning_rate": 0.00019730448705798239,
"loss": 0.8179,
"step": 88
},
{
"epoch": 0.15804661487236404,
"grad_norm": 0.33989331126213074,
"learning_rate": 0.0001971614350559814,
"loss": 0.8288,
"step": 89
},
{
"epoch": 0.1598224195338513,
"grad_norm": 0.3292713761329651,
"learning_rate": 0.0001970147392284154,
"loss": 0.8415,
"step": 90
},
{
"epoch": 0.1615982241953385,
"grad_norm": 0.3394547700881958,
"learning_rate": 0.00019686440507678824,
"loss": 0.8232,
"step": 91
},
{
"epoch": 0.16337402885682575,
"grad_norm": 0.3370296061038971,
"learning_rate": 0.0001967104382390511,
"loss": 0.7771,
"step": 92
},
{
"epoch": 0.16514983351831297,
"grad_norm": 0.3798193633556366,
"learning_rate": 0.00019655284448939094,
"loss": 0.789,
"step": 93
},
{
"epoch": 0.16692563817980022,
"grad_norm": 0.3790013790130615,
"learning_rate": 0.00019639162973801426,
"loss": 0.8153,
"step": 94
},
{
"epoch": 0.16870144284128746,
"grad_norm": 0.42274704575538635,
"learning_rate": 0.00019622680003092503,
"loss": 0.8012,
"step": 95
},
{
"epoch": 0.17047724750277468,
"grad_norm": 0.4776620864868164,
"learning_rate": 0.0001960583615496984,
"loss": 0.8132,
"step": 96
},
{
"epoch": 0.17225305216426193,
"grad_norm": 0.4170360565185547,
"learning_rate": 0.00019588632061124837,
"loss": 0.8139,
"step": 97
},
{
"epoch": 0.17402885682574917,
"grad_norm": 0.47097012400627136,
"learning_rate": 0.00019571068366759143,
"loss": 0.7711,
"step": 98
},
{
"epoch": 0.1758046614872364,
"grad_norm": 0.8176291584968567,
"learning_rate": 0.00019553145730560415,
"loss": 0.7906,
"step": 99
},
{
"epoch": 0.17758046614872364,
"grad_norm": 0.7204902172088623,
"learning_rate": 0.0001953486482467764,
"loss": 0.9088,
"step": 100
},
{
"epoch": 0.17935627081021088,
"grad_norm": 0.3952767252922058,
"learning_rate": 0.0001951622633469592,
"loss": 0.9362,
"step": 101
},
{
"epoch": 0.1811320754716981,
"grad_norm": 0.3742019534111023,
"learning_rate": 0.00019497230959610756,
"loss": 0.933,
"step": 102
},
{
"epoch": 0.18290788013318535,
"grad_norm": 0.3385975658893585,
"learning_rate": 0.00019477879411801844,
"loss": 0.9028,
"step": 103
},
{
"epoch": 0.1846836847946726,
"grad_norm": 0.2950561046600342,
"learning_rate": 0.00019458172417006347,
"loss": 0.8245,
"step": 104
},
{
"epoch": 0.1864594894561598,
"grad_norm": 0.30859696865081787,
"learning_rate": 0.00019438110714291694,
"loss": 0.8771,
"step": 105
},
{
"epoch": 0.18823529411764706,
"grad_norm": 0.3490929901599884,
"learning_rate": 0.00019417695056027844,
"loss": 0.8565,
"step": 106
},
{
"epoch": 0.1900110987791343,
"grad_norm": 0.31133994460105896,
"learning_rate": 0.00019396926207859084,
"loss": 0.8734,
"step": 107
},
{
"epoch": 0.19178690344062152,
"grad_norm": 0.2884789705276489,
"learning_rate": 0.00019375804948675306,
"loss": 0.8645,
"step": 108
},
{
"epoch": 0.19356270810210877,
"grad_norm": 0.2969193160533905,
"learning_rate": 0.0001935433207058281,
"loss": 0.8751,
"step": 109
},
{
"epoch": 0.195338512763596,
"grad_norm": 0.41810932755470276,
"learning_rate": 0.0001933250837887457,
"loss": 0.8037,
"step": 110
},
{
"epoch": 0.19711431742508323,
"grad_norm": 0.3271716833114624,
"learning_rate": 0.00019310334692000075,
"loss": 0.7814,
"step": 111
},
{
"epoch": 0.19889012208657048,
"grad_norm": 0.4146140515804291,
"learning_rate": 0.00019287811841534595,
"loss": 0.8425,
"step": 112
},
{
"epoch": 0.20066592674805772,
"grad_norm": 0.3369704484939575,
"learning_rate": 0.00019264940672148018,
"loss": 0.8301,
"step": 113
},
{
"epoch": 0.20244173140954494,
"grad_norm": 0.32731175422668457,
"learning_rate": 0.00019241722041573166,
"loss": 0.7964,
"step": 114
},
{
"epoch": 0.20421753607103219,
"grad_norm": 0.3840983510017395,
"learning_rate": 0.0001921815682057362,
"loss": 0.7864,
"step": 115
},
{
"epoch": 0.20599334073251943,
"grad_norm": 0.37049344182014465,
"learning_rate": 0.0001919424589291108,
"loss": 0.8086,
"step": 116
},
{
"epoch": 0.20776914539400665,
"grad_norm": 0.380991131067276,
"learning_rate": 0.0001916999015531221,
"loss": 0.8039,
"step": 117
},
{
"epoch": 0.2095449500554939,
"grad_norm": 0.3884637653827667,
"learning_rate": 0.00019145390517435012,
"loss": 0.7693,
"step": 118
},
{
"epoch": 0.21132075471698114,
"grad_norm": 0.39195218682289124,
"learning_rate": 0.00019120447901834706,
"loss": 0.8139,
"step": 119
},
{
"epoch": 0.21309655937846836,
"grad_norm": 0.41479626297950745,
"learning_rate": 0.00019095163243929142,
"loss": 0.7714,
"step": 120
},
{
"epoch": 0.2148723640399556,
"grad_norm": 0.3856278657913208,
"learning_rate": 0.0001906953749196371,
"loss": 0.8198,
"step": 121
},
{
"epoch": 0.21664816870144285,
"grad_norm": 0.3706349730491638,
"learning_rate": 0.00019043571606975777,
"loss": 0.7106,
"step": 122
},
{
"epoch": 0.21842397336293007,
"grad_norm": 0.5981292724609375,
"learning_rate": 0.00019017266562758659,
"loss": 0.8005,
"step": 123
},
{
"epoch": 0.22019977802441731,
"grad_norm": 0.4480712115764618,
"learning_rate": 0.00018990623345825083,
"loss": 0.8167,
"step": 124
},
{
"epoch": 0.22197558268590456,
"grad_norm": 0.9817702770233154,
"learning_rate": 0.00018963642955370201,
"loss": 0.8555,
"step": 125
},
{
"epoch": 0.22375138734739178,
"grad_norm": 0.4110267460346222,
"learning_rate": 0.00018936326403234125,
"loss": 0.9069,
"step": 126
},
{
"epoch": 0.22552719200887902,
"grad_norm": 0.36051687598228455,
"learning_rate": 0.00018908674713863952,
"loss": 0.8783,
"step": 127
},
{
"epoch": 0.22730299667036627,
"grad_norm": 0.34053486585617065,
"learning_rate": 0.00018880688924275378,
"loss": 0.8563,
"step": 128
},
{
"epoch": 0.2290788013318535,
"grad_norm": 0.30984926223754883,
"learning_rate": 0.0001885237008401378,
"loss": 0.8434,
"step": 129
},
{
"epoch": 0.23085460599334073,
"grad_norm": 0.3125753700733185,
"learning_rate": 0.0001882371925511488,
"loss": 0.831,
"step": 130
},
{
"epoch": 0.23263041065482798,
"grad_norm": 0.3113706409931183,
"learning_rate": 0.0001879473751206489,
"loss": 0.8659,
"step": 131
},
{
"epoch": 0.2344062153163152,
"grad_norm": 0.2837103605270386,
"learning_rate": 0.00018765425941760238,
"loss": 0.812,
"step": 132
},
{
"epoch": 0.23618201997780244,
"grad_norm": 0.2814521789550781,
"learning_rate": 0.00018735785643466784,
"loss": 0.8116,
"step": 133
},
{
"epoch": 0.2379578246392897,
"grad_norm": 0.2922544777393341,
"learning_rate": 0.00018705817728778624,
"loss": 0.8305,
"step": 134
},
{
"epoch": 0.2397336293007769,
"grad_norm": 0.3140820860862732,
"learning_rate": 0.00018675523321576371,
"loss": 0.7882,
"step": 135
},
{
"epoch": 0.24150943396226415,
"grad_norm": 0.29498058557510376,
"learning_rate": 0.00018644903557985025,
"loss": 0.8226,
"step": 136
},
{
"epoch": 0.2432852386237514,
"grad_norm": 0.3298538625240326,
"learning_rate": 0.00018613959586331362,
"loss": 0.7867,
"step": 137
},
{
"epoch": 0.24506104328523862,
"grad_norm": 0.3474237024784088,
"learning_rate": 0.00018582692567100867,
"loss": 0.7876,
"step": 138
},
{
"epoch": 0.24683684794672586,
"grad_norm": 0.3735051155090332,
"learning_rate": 0.00018551103672894206,
"loss": 0.818,
"step": 139
},
{
"epoch": 0.2486126526082131,
"grad_norm": 0.3931002914905548,
"learning_rate": 0.00018519194088383273,
"loss": 0.7896,
"step": 140
},
{
"epoch": 0.2503884572697003,
"grad_norm": 0.36460694670677185,
"learning_rate": 0.00018486965010266725,
"loss": 0.8105,
"step": 141
},
{
"epoch": 0.2503884572697003,
"eval_loss": 0.8086357712745667,
"eval_runtime": 159.8215,
"eval_samples_per_second": 5.938,
"eval_steps_per_second": 1.489,
"step": 141
},
{
"epoch": 0.25216426193118757,
"grad_norm": 0.3713844120502472,
"learning_rate": 0.0001845441764722514,
"loss": 0.7688,
"step": 142
},
{
"epoch": 0.2539400665926748,
"grad_norm": 0.352450430393219,
"learning_rate": 0.00018421553219875658,
"loss": 0.7769,
"step": 143
},
{
"epoch": 0.25571587125416206,
"grad_norm": 0.3609173893928528,
"learning_rate": 0.00018388372960726228,
"loss": 0.7718,
"step": 144
},
{
"epoch": 0.25749167591564925,
"grad_norm": 0.36195874214172363,
"learning_rate": 0.00018354878114129367,
"loss": 0.7375,
"step": 145
},
{
"epoch": 0.2592674805771365,
"grad_norm": 0.3802485466003418,
"learning_rate": 0.00018321069936235503,
"loss": 0.7778,
"step": 146
},
{
"epoch": 0.26104328523862375,
"grad_norm": 0.38449469208717346,
"learning_rate": 0.00018286949694945866,
"loss": 0.7458,
"step": 147
},
{
"epoch": 0.262819089900111,
"grad_norm": 0.3975572884082794,
"learning_rate": 0.00018252518669864936,
"loss": 0.7367,
"step": 148
},
{
"epoch": 0.26459489456159824,
"grad_norm": 0.49581316113471985,
"learning_rate": 0.0001821777815225245,
"loss": 0.7948,
"step": 149
},
{
"epoch": 0.2663706992230855,
"grad_norm": 0.5556712746620178,
"learning_rate": 0.00018182729444974992,
"loss": 0.8143,
"step": 150
},
{
"epoch": 0.2681465038845727,
"grad_norm": 0.3207700848579407,
"learning_rate": 0.00018147373862457107,
"loss": 0.8578,
"step": 151
},
{
"epoch": 0.2699223085460599,
"grad_norm": 0.3484250605106354,
"learning_rate": 0.00018111712730632022,
"loss": 0.8757,
"step": 152
},
{
"epoch": 0.27169811320754716,
"grad_norm": 0.33792024850845337,
"learning_rate": 0.0001807574738689193,
"loss": 0.8464,
"step": 153
},
{
"epoch": 0.2734739178690344,
"grad_norm": 0.3430371582508087,
"learning_rate": 0.000180394791800378,
"loss": 0.8607,
"step": 154
},
{
"epoch": 0.27524972253052166,
"grad_norm": 0.3120534420013428,
"learning_rate": 0.00018002909470228842,
"loss": 0.8392,
"step": 155
},
{
"epoch": 0.2770255271920089,
"grad_norm": 0.3126620054244995,
"learning_rate": 0.00017966039628931446,
"loss": 0.8191,
"step": 156
},
{
"epoch": 0.2788013318534961,
"grad_norm": 0.32269468903541565,
"learning_rate": 0.00017928871038867784,
"loss": 0.8164,
"step": 157
},
{
"epoch": 0.28057713651498334,
"grad_norm": 0.3052617907524109,
"learning_rate": 0.00017891405093963938,
"loss": 0.8268,
"step": 158
},
{
"epoch": 0.2823529411764706,
"grad_norm": 0.29926028847694397,
"learning_rate": 0.00017853643199297633,
"loss": 0.7847,
"step": 159
},
{
"epoch": 0.28412874583795783,
"grad_norm": 0.2997240722179413,
"learning_rate": 0.00017815586771045535,
"loss": 0.8143,
"step": 160
},
{
"epoch": 0.2859045504994451,
"grad_norm": 0.29772111773490906,
"learning_rate": 0.0001777723723643014,
"loss": 0.7412,
"step": 161
},
{
"epoch": 0.2876803551609323,
"grad_norm": 0.3138352632522583,
"learning_rate": 0.0001773859603366626,
"loss": 0.7747,
"step": 162
},
{
"epoch": 0.2894561598224195,
"grad_norm": 0.32726818323135376,
"learning_rate": 0.00017699664611907072,
"loss": 0.8123,
"step": 163
},
{
"epoch": 0.29123196448390676,
"grad_norm": 0.3244825005531311,
"learning_rate": 0.0001766044443118978,
"loss": 0.7705,
"step": 164
},
{
"epoch": 0.293007769145394,
"grad_norm": 0.35875847935676575,
"learning_rate": 0.00017620936962380856,
"loss": 0.7881,
"step": 165
},
{
"epoch": 0.29478357380688125,
"grad_norm": 0.36488401889801025,
"learning_rate": 0.00017581143687120875,
"loss": 0.7956,
"step": 166
},
{
"epoch": 0.2965593784683685,
"grad_norm": 0.33817097544670105,
"learning_rate": 0.00017541066097768963,
"loss": 0.7719,
"step": 167
},
{
"epoch": 0.29833518312985574,
"grad_norm": 0.36390411853790283,
"learning_rate": 0.0001750070569734681,
"loss": 0.8172,
"step": 168
},
{
"epoch": 0.30011098779134293,
"grad_norm": 0.34076422452926636,
"learning_rate": 0.00017460063999482316,
"loss": 0.7419,
"step": 169
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.39437592029571533,
"learning_rate": 0.00017419142528352817,
"loss": 0.7519,
"step": 170
},
{
"epoch": 0.3036625971143174,
"grad_norm": 0.4019312560558319,
"learning_rate": 0.00017377942818627942,
"loss": 0.7944,
"step": 171
},
{
"epoch": 0.30543840177580467,
"grad_norm": 0.40751898288726807,
"learning_rate": 0.00017336466415412028,
"loss": 0.7827,
"step": 172
},
{
"epoch": 0.3072142064372919,
"grad_norm": 0.4780448079109192,
"learning_rate": 0.0001729471487418621,
"loss": 0.7872,
"step": 173
},
{
"epoch": 0.30899001109877916,
"grad_norm": 0.40511685609817505,
"learning_rate": 0.0001725268976075005,
"loss": 0.7642,
"step": 174
},
{
"epoch": 0.31076581576026635,
"grad_norm": 0.5618127584457397,
"learning_rate": 0.0001721039265116285,
"loss": 0.872,
"step": 175
},
{
"epoch": 0.3125416204217536,
"grad_norm": 0.294917494058609,
"learning_rate": 0.00017167825131684513,
"loss": 0.8545,
"step": 176
},
{
"epoch": 0.31431742508324084,
"grad_norm": 0.3281805217266083,
"learning_rate": 0.00017124988798716083,
"loss": 0.8404,
"step": 177
},
{
"epoch": 0.3160932297447281,
"grad_norm": 0.33336278796195984,
"learning_rate": 0.00017081885258739846,
"loss": 0.8495,
"step": 178
},
{
"epoch": 0.31786903440621533,
"grad_norm": 0.3366440236568451,
"learning_rate": 0.00017038516128259115,
"loss": 0.8659,
"step": 179
},
{
"epoch": 0.3196448390677026,
"grad_norm": 0.32397955656051636,
"learning_rate": 0.00016994883033737582,
"loss": 0.8292,
"step": 180
},
{
"epoch": 0.32142064372918977,
"grad_norm": 0.2874945402145386,
"learning_rate": 0.00016950987611538324,
"loss": 0.7949,
"step": 181
},
{
"epoch": 0.323196448390677,
"grad_norm": 0.3074096143245697,
"learning_rate": 0.00016906831507862443,
"loss": 0.8076,
"step": 182
},
{
"epoch": 0.32497225305216426,
"grad_norm": 0.30116966366767883,
"learning_rate": 0.0001686241637868734,
"loss": 0.8058,
"step": 183
},
{
"epoch": 0.3267480577136515,
"grad_norm": 0.3052218556404114,
"learning_rate": 0.00016817743889704565,
"loss": 0.8067,
"step": 184
},
{
"epoch": 0.32852386237513875,
"grad_norm": 0.3073555827140808,
"learning_rate": 0.00016772815716257412,
"loss": 0.8496,
"step": 185
},
{
"epoch": 0.33029966703662594,
"grad_norm": 0.289145290851593,
"learning_rate": 0.0001672763354327804,
"loss": 0.7362,
"step": 186
},
{
"epoch": 0.3320754716981132,
"grad_norm": 0.31561294198036194,
"learning_rate": 0.00016682199065224307,
"loss": 0.802,
"step": 187
},
{
"epoch": 0.33385127635960044,
"grad_norm": 0.2900339365005493,
"learning_rate": 0.00016636513986016213,
"loss": 0.7432,
"step": 188
},
{
"epoch": 0.3356270810210877,
"grad_norm": 0.3267146646976471,
"learning_rate": 0.0001659058001897201,
"loss": 0.7771,
"step": 189
},
{
"epoch": 0.3374028856825749,
"grad_norm": 0.3258307874202728,
"learning_rate": 0.00016544398886743933,
"loss": 0.7345,
"step": 190
},
{
"epoch": 0.3391786903440622,
"grad_norm": 0.32989659905433655,
"learning_rate": 0.000164979723212536,
"loss": 0.7383,
"step": 191
},
{
"epoch": 0.34095449500554936,
"grad_norm": 0.3265599310398102,
"learning_rate": 0.00016451302063627066,
"loss": 0.6977,
"step": 192
},
{
"epoch": 0.3427302996670366,
"grad_norm": 0.39376598596572876,
"learning_rate": 0.00016404389864129533,
"loss": 0.7851,
"step": 193
},
{
"epoch": 0.34450610432852385,
"grad_norm": 0.40358301997184753,
"learning_rate": 0.00016357237482099684,
"loss": 0.7928,
"step": 194
},
{
"epoch": 0.3462819089900111,
"grad_norm": 0.3747034966945648,
"learning_rate": 0.00016309846685883726,
"loss": 0.7751,
"step": 195
},
{
"epoch": 0.34805771365149835,
"grad_norm": 0.4160248041152954,
"learning_rate": 0.00016262219252769064,
"loss": 0.8035,
"step": 196
},
{
"epoch": 0.3498335183129856,
"grad_norm": 0.39067476987838745,
"learning_rate": 0.00016214356968917648,
"loss": 0.6726,
"step": 197
},
{
"epoch": 0.3516093229744728,
"grad_norm": 0.4980023205280304,
"learning_rate": 0.00016166261629298995,
"loss": 0.7917,
"step": 198
},
{
"epoch": 0.35338512763596003,
"grad_norm": 0.4774058163166046,
"learning_rate": 0.0001611793503762285,
"loss": 0.7599,
"step": 199
},
{
"epoch": 0.3551609322974473,
"grad_norm": 0.5196167230606079,
"learning_rate": 0.00016069379006271566,
"loss": 0.7608,
"step": 200
},
{
"epoch": 0.3569367369589345,
"grad_norm": 0.2735799551010132,
"learning_rate": 0.00016020595356232135,
"loss": 0.8588,
"step": 201
},
{
"epoch": 0.35871254162042177,
"grad_norm": 0.30770814418792725,
"learning_rate": 0.00015971585917027862,
"loss": 0.8222,
"step": 202
},
{
"epoch": 0.360488346281909,
"grad_norm": 0.317123144865036,
"learning_rate": 0.00015922352526649803,
"loss": 0.7941,
"step": 203
},
{
"epoch": 0.3622641509433962,
"grad_norm": 0.32672154903411865,
"learning_rate": 0.00015872897031487791,
"loss": 0.867,
"step": 204
},
{
"epoch": 0.36403995560488345,
"grad_norm": 0.3169744610786438,
"learning_rate": 0.00015823221286261215,
"loss": 0.8781,
"step": 205
},
{
"epoch": 0.3658157602663707,
"grad_norm": 0.30588722229003906,
"learning_rate": 0.00015773327153949465,
"loss": 0.7827,
"step": 206
},
{
"epoch": 0.36759156492785794,
"grad_norm": 0.3179618716239929,
"learning_rate": 0.0001572321650572205,
"loss": 0.8178,
"step": 207
},
{
"epoch": 0.3693673695893452,
"grad_norm": 0.3094286322593689,
"learning_rate": 0.00015672891220868432,
"loss": 0.7966,
"step": 208
},
{
"epoch": 0.37114317425083243,
"grad_norm": 0.31584280729293823,
"learning_rate": 0.00015622353186727544,
"loss": 0.7982,
"step": 209
},
{
"epoch": 0.3729189789123196,
"grad_norm": 0.29120850563049316,
"learning_rate": 0.0001557160429861702,
"loss": 0.7789,
"step": 210
},
{
"epoch": 0.37469478357380687,
"grad_norm": 0.29743698239326477,
"learning_rate": 0.000155206464597621,
"loss": 0.7799,
"step": 211
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.31440189480781555,
"learning_rate": 0.00015469481581224272,
"loss": 0.7661,
"step": 212
},
{
"epoch": 0.37824639289678136,
"grad_norm": 0.3395606279373169,
"learning_rate": 0.00015418111581829574,
"loss": 0.7657,
"step": 213
},
{
"epoch": 0.3800221975582686,
"grad_norm": 0.31749066710472107,
"learning_rate": 0.0001536653838809667,
"loss": 0.7913,
"step": 214
},
{
"epoch": 0.38179800221975585,
"grad_norm": 0.3586166501045227,
"learning_rate": 0.0001531476393416456,
"loss": 0.7774,
"step": 215
},
{
"epoch": 0.38357380688124304,
"grad_norm": 0.32895100116729736,
"learning_rate": 0.0001526279016172008,
"loss": 0.7882,
"step": 216
},
{
"epoch": 0.3853496115427303,
"grad_norm": 0.3541489839553833,
"learning_rate": 0.00015210619019925066,
"loss": 0.7708,
"step": 217
},
{
"epoch": 0.38712541620421753,
"grad_norm": 0.3232908546924591,
"learning_rate": 0.00015158252465343242,
"loss": 0.7238,
"step": 218
},
{
"epoch": 0.3889012208657048,
"grad_norm": 0.36565467715263367,
"learning_rate": 0.00015105692461866874,
"loss": 0.7685,
"step": 219
},
{
"epoch": 0.390677025527192,
"grad_norm": 0.3799486756324768,
"learning_rate": 0.000150529409806431,
"loss": 0.7296,
"step": 220
},
{
"epoch": 0.39245283018867927,
"grad_norm": 0.4193985164165497,
"learning_rate": 0.00015000000000000001,
"loss": 0.7731,
"step": 221
},
{
"epoch": 0.39422863485016646,
"grad_norm": 0.4226386845111847,
"learning_rate": 0.00014946871505372425,
"loss": 0.8048,
"step": 222
},
{
"epoch": 0.3960044395116537,
"grad_norm": 0.40805166959762573,
"learning_rate": 0.00014893557489227517,
"loss": 0.7389,
"step": 223
},
{
"epoch": 0.39778024417314095,
"grad_norm": 0.5135468244552612,
"learning_rate": 0.0001484005995098999,
"loss": 0.779,
"step": 224
},
{
"epoch": 0.3995560488346282,
"grad_norm": 0.6674650311470032,
"learning_rate": 0.0001478638089696716,
"loss": 0.82,
"step": 225
},
{
"epoch": 0.40133185349611544,
"grad_norm": 0.3206911087036133,
"learning_rate": 0.00014732522340273684,
"loss": 0.8985,
"step": 226
},
{
"epoch": 0.4031076581576027,
"grad_norm": 0.33583980798721313,
"learning_rate": 0.0001467848630075608,
"loss": 0.8171,
"step": 227
},
{
"epoch": 0.4048834628190899,
"grad_norm": 0.3324304223060608,
"learning_rate": 0.00014624274804916958,
"loss": 0.8531,
"step": 228
},
{
"epoch": 0.4066592674805771,
"grad_norm": 0.32210710644721985,
"learning_rate": 0.00014569889885839037,
"loss": 0.8349,
"step": 229
},
{
"epoch": 0.40843507214206437,
"grad_norm": 0.30829885601997375,
"learning_rate": 0.00014515333583108896,
"loss": 0.8176,
"step": 230
},
{
"epoch": 0.4102108768035516,
"grad_norm": 0.31730225682258606,
"learning_rate": 0.00014460607942740468,
"loss": 0.8109,
"step": 231
},
{
"epoch": 0.41198668146503886,
"grad_norm": 0.32128164172172546,
"learning_rate": 0.00014405715017098335,
"loss": 0.8049,
"step": 232
},
{
"epoch": 0.4137624861265261,
"grad_norm": 0.32257241010665894,
"learning_rate": 0.00014350656864820733,
"loss": 0.79,
"step": 233
},
{
"epoch": 0.4155382907880133,
"grad_norm": 0.29663363099098206,
"learning_rate": 0.0001429543555074237,
"loss": 0.7606,
"step": 234
},
{
"epoch": 0.41731409544950054,
"grad_norm": 0.3175968527793884,
"learning_rate": 0.00014240053145816967,
"loss": 0.8093,
"step": 235
},
{
"epoch": 0.4190899001109878,
"grad_norm": 0.30839797854423523,
"learning_rate": 0.00014184511727039612,
"loss": 0.8033,
"step": 236
},
{
"epoch": 0.42086570477247504,
"grad_norm": 0.32169485092163086,
"learning_rate": 0.0001412881337736885,
"loss": 0.7583,
"step": 237
},
{
"epoch": 0.4226415094339623,
"grad_norm": 0.3165202736854553,
"learning_rate": 0.00014072960185648577,
"loss": 0.7864,
"step": 238
},
{
"epoch": 0.4244173140954495,
"grad_norm": 0.3507262170314789,
"learning_rate": 0.00014016954246529696,
"loss": 0.8196,
"step": 239
},
{
"epoch": 0.4261931187569367,
"grad_norm": 0.3330634534358978,
"learning_rate": 0.0001396079766039157,
"loss": 0.7356,
"step": 240
},
{
"epoch": 0.42796892341842396,
"grad_norm": 0.3456502854824066,
"learning_rate": 0.00013904492533263244,
"loss": 0.7636,
"step": 241
},
{
"epoch": 0.4297447280799112,
"grad_norm": 0.3290559649467468,
"learning_rate": 0.00013848040976744457,
"loss": 0.6921,
"step": 242
},
{
"epoch": 0.43152053274139845,
"grad_norm": 0.34343284368515015,
"learning_rate": 0.00013791445107926478,
"loss": 0.7661,
"step": 243
},
{
"epoch": 0.4332963374028857,
"grad_norm": 0.34806933999061584,
"learning_rate": 0.00013734707049312673,
"loss": 0.7266,
"step": 244
},
{
"epoch": 0.43507214206437295,
"grad_norm": 0.3577682375907898,
"learning_rate": 0.00013677828928738934,
"loss": 0.7337,
"step": 245
},
{
"epoch": 0.43684794672586014,
"grad_norm": 0.37708649039268494,
"learning_rate": 0.00013620812879293863,
"loss": 0.6949,
"step": 246
},
{
"epoch": 0.4386237513873474,
"grad_norm": 0.3661216199398041,
"learning_rate": 0.00013563661039238785,
"loss": 0.7049,
"step": 247
},
{
"epoch": 0.44039955604883463,
"grad_norm": 0.4453539550304413,
"learning_rate": 0.00013506375551927547,
"loss": 0.7957,
"step": 248
},
{
"epoch": 0.4421753607103219,
"grad_norm": 0.46171826124191284,
"learning_rate": 0.00013448958565726144,
"loss": 0.7175,
"step": 249
},
{
"epoch": 0.4439511653718091,
"grad_norm": 0.6314205527305603,
"learning_rate": 0.00013391412233932149,
"loss": 0.8853,
"step": 250
},
{
"epoch": 0.4457269700332963,
"grad_norm": 0.29680782556533813,
"learning_rate": 0.00013333738714693956,
"loss": 0.8789,
"step": 251
},
{
"epoch": 0.44750277469478356,
"grad_norm": 0.30771735310554504,
"learning_rate": 0.00013275940170929843,
"loss": 0.8126,
"step": 252
},
{
"epoch": 0.4492785793562708,
"grad_norm": 0.3242880403995514,
"learning_rate": 0.00013218018770246858,
"loss": 0.7787,
"step": 253
},
{
"epoch": 0.45105438401775805,
"grad_norm": 0.33549076318740845,
"learning_rate": 0.00013159976684859527,
"loss": 0.8113,
"step": 254
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.34281155467033386,
"learning_rate": 0.00013101816091508388,
"loss": 0.8371,
"step": 255
},
{
"epoch": 0.45460599334073254,
"grad_norm": 0.3422442078590393,
"learning_rate": 0.0001304353917137836,
"loss": 0.8362,
"step": 256
},
{
"epoch": 0.45638179800221973,
"grad_norm": 0.3019155263900757,
"learning_rate": 0.00012985148110016947,
"loss": 0.7317,
"step": 257
},
{
"epoch": 0.458157602663707,
"grad_norm": 0.32793429493904114,
"learning_rate": 0.0001292664509725226,
"loss": 0.7861,
"step": 258
},
{
"epoch": 0.4599334073251942,
"grad_norm": 0.32433855533599854,
"learning_rate": 0.00012868032327110904,
"loss": 0.7708,
"step": 259
},
{
"epoch": 0.46170921198668147,
"grad_norm": 0.31858816742897034,
"learning_rate": 0.00012809311997735696,
"loss": 0.7754,
"step": 260
},
{
"epoch": 0.4634850166481687,
"grad_norm": 0.3172609210014343,
"learning_rate": 0.00012750486311303218,
"loss": 0.7839,
"step": 261
},
{
"epoch": 0.46526082130965596,
"grad_norm": 0.2951931953430176,
"learning_rate": 0.00012691557473941243,
"loss": 0.7261,
"step": 262
},
{
"epoch": 0.46703662597114315,
"grad_norm": 0.31385374069213867,
"learning_rate": 0.00012632527695645993,
"loss": 0.8221,
"step": 263
},
{
"epoch": 0.4688124306326304,
"grad_norm": 0.31157392263412476,
"learning_rate": 0.0001257339919019925,
"loss": 0.7711,
"step": 264
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.32580870389938354,
"learning_rate": 0.00012514174175085345,
"loss": 0.7592,
"step": 265
},
{
"epoch": 0.4723640399556049,
"grad_norm": 0.33285781741142273,
"learning_rate": 0.00012454854871407994,
"loss": 0.7349,
"step": 266
},
{
"epoch": 0.47413984461709213,
"grad_norm": 0.3179035186767578,
"learning_rate": 0.0001239544350380699,
"loss": 0.7338,
"step": 267
},
{
"epoch": 0.4759156492785794,
"grad_norm": 0.31393003463745117,
"learning_rate": 0.00012335942300374788,
"loss": 0.7088,
"step": 268
},
{
"epoch": 0.47769145394006657,
"grad_norm": 0.33285436034202576,
"learning_rate": 0.00012276353492572935,
"loss": 0.7069,
"step": 269
},
{
"epoch": 0.4794672586015538,
"grad_norm": 0.38329485058784485,
"learning_rate": 0.00012216679315148386,
"loss": 0.7093,
"step": 270
},
{
"epoch": 0.48124306326304106,
"grad_norm": 0.3584016263484955,
"learning_rate": 0.00012156922006049702,
"loss": 0.7513,
"step": 271
},
{
"epoch": 0.4830188679245283,
"grad_norm": 0.3995126187801361,
"learning_rate": 0.00012097083806343103,
"loss": 0.7384,
"step": 272
},
{
"epoch": 0.48479467258601555,
"grad_norm": 0.4097007215023041,
"learning_rate": 0.00012037166960128443,
"loss": 0.7794,
"step": 273
},
{
"epoch": 0.4865704772475028,
"grad_norm": 0.4780315160751343,
"learning_rate": 0.00011977173714455034,
"loss": 0.7437,
"step": 274
},
{
"epoch": 0.48834628190899,
"grad_norm": 0.5396427512168884,
"learning_rate": 0.00011917106319237386,
"loss": 0.7542,
"step": 275
},
{
"epoch": 0.49012208657047723,
"grad_norm": 0.29439178109169006,
"learning_rate": 0.00011856967027170818,
"loss": 0.8389,
"step": 276
},
{
"epoch": 0.4918978912319645,
"grad_norm": 0.3243663012981415,
"learning_rate": 0.00011796758093646989,
"loss": 0.8767,
"step": 277
},
{
"epoch": 0.4936736958934517,
"grad_norm": 0.342454195022583,
"learning_rate": 0.00011736481776669306,
"loss": 0.8538,
"step": 278
},
{
"epoch": 0.49544950055493897,
"grad_norm": 0.30882903933525085,
"learning_rate": 0.00011676140336768236,
"loss": 0.7766,
"step": 279
},
{
"epoch": 0.4972253052164262,
"grad_norm": 0.3247200548648834,
"learning_rate": 0.00011615736036916549,
"loss": 0.8268,
"step": 280
},
{
"epoch": 0.4990011098779134,
"grad_norm": 0.3077162504196167,
"learning_rate": 0.00011555271142444433,
"loss": 0.7786,
"step": 281
},
{
"epoch": 0.5007769145394007,
"grad_norm": 0.3300260603427887,
"learning_rate": 0.00011494747920954545,
"loss": 0.7853,
"step": 282
},
{
"epoch": 0.5007769145394007,
"eval_loss": 0.7658749222755432,
"eval_runtime": 158.4653,
"eval_samples_per_second": 5.989,
"eval_steps_per_second": 1.502,
"step": 282
},
{
"epoch": 0.502552719200888,
"grad_norm": 0.331061989068985,
"learning_rate": 0.00011434168642236964,
"loss": 0.8114,
"step": 283
},
{
"epoch": 0.5043285238623751,
"grad_norm": 0.3186919689178467,
"learning_rate": 0.00011373535578184082,
"loss": 0.7872,
"step": 284
},
{
"epoch": 0.5061043285238623,
"grad_norm": 0.3114188611507416,
"learning_rate": 0.00011312851002705383,
"loss": 0.7311,
"step": 285
},
{
"epoch": 0.5078801331853496,
"grad_norm": 0.3148879408836365,
"learning_rate": 0.00011252117191642175,
"loss": 0.7311,
"step": 286
},
{
"epoch": 0.5096559378468368,
"grad_norm": 0.3390887379646301,
"learning_rate": 0.00011191336422682237,
"loss": 0.7773,
"step": 287
},
{
"epoch": 0.5114317425083241,
"grad_norm": 0.31982842087745667,
"learning_rate": 0.00011130510975274409,
"loss": 0.7474,
"step": 288
},
{
"epoch": 0.5132075471698113,
"grad_norm": 0.31643104553222656,
"learning_rate": 0.00011069643130543084,
"loss": 0.7375,
"step": 289
},
{
"epoch": 0.5149833518312985,
"grad_norm": 0.33758479356765747,
"learning_rate": 0.00011008735171202684,
"loss": 0.7411,
"step": 290
},
{
"epoch": 0.5167591564927858,
"grad_norm": 0.324556440114975,
"learning_rate": 0.00010947789381472035,
"loss": 0.7235,
"step": 291
},
{
"epoch": 0.518534961154273,
"grad_norm": 0.3768496513366699,
"learning_rate": 0.00010886808046988717,
"loss": 0.7618,
"step": 292
},
{
"epoch": 0.5203107658157603,
"grad_norm": 0.34034618735313416,
"learning_rate": 0.00010825793454723325,
"loss": 0.7426,
"step": 293
},
{
"epoch": 0.5220865704772475,
"grad_norm": 0.3409979045391083,
"learning_rate": 0.00010764747892893723,
"loss": 0.7327,
"step": 294
},
{
"epoch": 0.5238623751387348,
"grad_norm": 0.35839787125587463,
"learning_rate": 0.00010703673650879218,
"loss": 0.7057,
"step": 295
},
{
"epoch": 0.525638179800222,
"grad_norm": 0.3807874023914337,
"learning_rate": 0.00010642573019134703,
"loss": 0.7225,
"step": 296
},
{
"epoch": 0.5274139844617092,
"grad_norm": 0.4682140648365021,
"learning_rate": 0.00010581448289104758,
"loss": 0.715,
"step": 297
},
{
"epoch": 0.5291897891231965,
"grad_norm": 0.4261273145675659,
"learning_rate": 0.00010520301753137724,
"loss": 0.7239,
"step": 298
},
{
"epoch": 0.5309655937846837,
"grad_norm": 0.4854682981967926,
"learning_rate": 0.00010459135704399718,
"loss": 0.7304,
"step": 299
},
{
"epoch": 0.532741398446171,
"grad_norm": 0.6740989685058594,
"learning_rate": 0.00010397952436788642,
"loss": 0.8604,
"step": 300
},
{
"epoch": 0.5345172031076582,
"grad_norm": 0.2903907299041748,
"learning_rate": 0.00010336754244848157,
"loss": 0.8551,
"step": 301
},
{
"epoch": 0.5362930077691453,
"grad_norm": 0.28648582100868225,
"learning_rate": 0.00010275543423681621,
"loss": 0.7958,
"step": 302
},
{
"epoch": 0.5380688124306326,
"grad_norm": 0.33123767375946045,
"learning_rate": 0.00010214322268866032,
"loss": 0.7853,
"step": 303
},
{
"epoch": 0.5398446170921198,
"grad_norm": 0.31327784061431885,
"learning_rate": 0.00010153093076365923,
"loss": 0.7856,
"step": 304
},
{
"epoch": 0.5416204217536071,
"grad_norm": 0.3101854622364044,
"learning_rate": 0.00010091858142447265,
"loss": 0.7694,
"step": 305
},
{
"epoch": 0.5433962264150943,
"grad_norm": 0.3217926621437073,
"learning_rate": 0.00010030619763591347,
"loss": 0.7899,
"step": 306
},
{
"epoch": 0.5451720310765816,
"grad_norm": 0.33827194571495056,
"learning_rate": 9.969380236408656e-05,
"loss": 0.8088,
"step": 307
},
{
"epoch": 0.5469478357380688,
"grad_norm": 0.32632124423980713,
"learning_rate": 9.908141857552737e-05,
"loss": 0.769,
"step": 308
},
{
"epoch": 0.548723640399556,
"grad_norm": 0.3152617812156677,
"learning_rate": 9.846906923634079e-05,
"loss": 0.7804,
"step": 309
},
{
"epoch": 0.5504994450610433,
"grad_norm": 0.33337536454200745,
"learning_rate": 9.78567773113397e-05,
"loss": 0.7379,
"step": 310
},
{
"epoch": 0.5522752497225305,
"grad_norm": 0.3020349144935608,
"learning_rate": 9.724456576318381e-05,
"loss": 0.7146,
"step": 311
},
{
"epoch": 0.5540510543840178,
"grad_norm": 0.34656378626823425,
"learning_rate": 9.663245755151846e-05,
"loss": 0.7437,
"step": 312
},
{
"epoch": 0.555826859045505,
"grad_norm": 0.3417186737060547,
"learning_rate": 9.602047563211359e-05,
"loss": 0.7472,
"step": 313
},
{
"epoch": 0.5576026637069922,
"grad_norm": 0.34442222118377686,
"learning_rate": 9.540864295600283e-05,
"loss": 0.7426,
"step": 314
},
{
"epoch": 0.5593784683684795,
"grad_norm": 0.3521478772163391,
"learning_rate": 9.479698246862276e-05,
"loss": 0.7522,
"step": 315
},
{
"epoch": 0.5611542730299667,
"grad_norm": 0.3358227014541626,
"learning_rate": 9.418551710895243e-05,
"loss": 0.7454,
"step": 316
},
{
"epoch": 0.562930077691454,
"grad_norm": 0.343226820230484,
"learning_rate": 9.357426980865301e-05,
"loss": 0.7341,
"step": 317
},
{
"epoch": 0.5647058823529412,
"grad_norm": 0.3432699739933014,
"learning_rate": 9.296326349120785e-05,
"loss": 0.6836,
"step": 318
},
{
"epoch": 0.5664816870144284,
"grad_norm": 0.3710852265357971,
"learning_rate": 9.235252107106279e-05,
"loss": 0.6961,
"step": 319
},
{
"epoch": 0.5682574916759157,
"grad_norm": 0.351094514131546,
"learning_rate": 9.174206545276677e-05,
"loss": 0.6668,
"step": 320
},
{
"epoch": 0.5700332963374029,
"grad_norm": 0.4484163224697113,
"learning_rate": 9.113191953011287e-05,
"loss": 0.7427,
"step": 321
},
{
"epoch": 0.5718091009988902,
"grad_norm": 0.44636109471321106,
"learning_rate": 9.052210618527966e-05,
"loss": 0.8119,
"step": 322
},
{
"epoch": 0.5735849056603773,
"grad_norm": 0.43749314546585083,
"learning_rate": 8.991264828797319e-05,
"loss": 0.7846,
"step": 323
},
{
"epoch": 0.5753607103218646,
"grad_norm": 0.4471510350704193,
"learning_rate": 8.930356869456919e-05,
"loss": 0.7215,
"step": 324
},
{
"epoch": 0.5771365149833518,
"grad_norm": 0.5141078233718872,
"learning_rate": 8.869489024725595e-05,
"loss": 0.7492,
"step": 325
},
{
"epoch": 0.578912319644839,
"grad_norm": 0.2640296518802643,
"learning_rate": 8.808663577317764e-05,
"loss": 0.8625,
"step": 326
},
{
"epoch": 0.5806881243063263,
"grad_norm": 0.28867048025131226,
"learning_rate": 8.747882808357828e-05,
"loss": 0.8352,
"step": 327
},
{
"epoch": 0.5824639289678135,
"grad_norm": 0.2925030589103699,
"learning_rate": 8.687148997294621e-05,
"loss": 0.8091,
"step": 328
},
{
"epoch": 0.5842397336293008,
"grad_norm": 0.28383681178092957,
"learning_rate": 8.626464421815919e-05,
"loss": 0.784,
"step": 329
},
{
"epoch": 0.586015538290788,
"grad_norm": 0.3055633306503296,
"learning_rate": 8.565831357763039e-05,
"loss": 0.79,
"step": 330
},
{
"epoch": 0.5877913429522752,
"grad_norm": 0.30299943685531616,
"learning_rate": 8.505252079045458e-05,
"loss": 0.8105,
"step": 331
},
{
"epoch": 0.5895671476137625,
"grad_norm": 0.3154890239238739,
"learning_rate": 8.444728857555572e-05,
"loss": 0.7664,
"step": 332
},
{
"epoch": 0.5913429522752497,
"grad_norm": 0.31844133138656616,
"learning_rate": 8.384263963083453e-05,
"loss": 0.7709,
"step": 333
},
{
"epoch": 0.593118756936737,
"grad_norm": 0.31844353675842285,
"learning_rate": 8.323859663231768e-05,
"loss": 0.7426,
"step": 334
},
{
"epoch": 0.5948945615982242,
"grad_norm": 0.31527841091156006,
"learning_rate": 8.263518223330697e-05,
"loss": 0.7441,
"step": 335
},
{
"epoch": 0.5966703662597115,
"grad_norm": 0.32145699858665466,
"learning_rate": 8.203241906353014e-05,
"loss": 0.7333,
"step": 336
},
{
"epoch": 0.5984461709211987,
"grad_norm": 0.3175109922885895,
"learning_rate": 8.143032972829183e-05,
"loss": 0.7488,
"step": 337
},
{
"epoch": 0.6002219755826859,
"grad_norm": 0.3342651128768921,
"learning_rate": 8.082893680762619e-05,
"loss": 0.7265,
"step": 338
},
{
"epoch": 0.6019977802441732,
"grad_norm": 0.339743971824646,
"learning_rate": 8.022826285544968e-05,
"loss": 0.7005,
"step": 339
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.35757359862327576,
"learning_rate": 7.96283303987156e-05,
"loss": 0.7806,
"step": 340
},
{
"epoch": 0.6055493895671477,
"grad_norm": 0.4024328291416168,
"learning_rate": 7.902916193656898e-05,
"loss": 0.6895,
"step": 341
},
{
"epoch": 0.6073251942286348,
"grad_norm": 0.3628247380256653,
"learning_rate": 7.843077993950302e-05,
"loss": 0.7285,
"step": 342
},
{
"epoch": 0.609100998890122,
"grad_norm": 0.3793889582157135,
"learning_rate": 7.783320684851614e-05,
"loss": 0.729,
"step": 343
},
{
"epoch": 0.6108768035516093,
"grad_norm": 0.37614578008651733,
"learning_rate": 7.72364650742707e-05,
"loss": 0.6869,
"step": 344
},
{
"epoch": 0.6126526082130965,
"grad_norm": 0.3737132251262665,
"learning_rate": 7.664057699625214e-05,
"loss": 0.7373,
"step": 345
},
{
"epoch": 0.6144284128745838,
"grad_norm": 0.40523961186408997,
"learning_rate": 7.604556496193015e-05,
"loss": 0.729,
"step": 346
},
{
"epoch": 0.616204217536071,
"grad_norm": 0.3903469145298004,
"learning_rate": 7.54514512859201e-05,
"loss": 0.7063,
"step": 347
},
{
"epoch": 0.6179800221975583,
"grad_norm": 0.43782973289489746,
"learning_rate": 7.485825824914659e-05,
"loss": 0.6763,
"step": 348
},
{
"epoch": 0.6197558268590455,
"grad_norm": 0.4907206594944,
"learning_rate": 7.426600809800752e-05,
"loss": 0.7405,
"step": 349
},
{
"epoch": 0.6215316315205327,
"grad_norm": 0.5378274917602539,
"learning_rate": 7.36747230435401e-05,
"loss": 0.7417,
"step": 350
},
{
"epoch": 0.62330743618202,
"grad_norm": 0.266481876373291,
"learning_rate": 7.308442526058756e-05,
"loss": 0.8434,
"step": 351
},
{
"epoch": 0.6250832408435072,
"grad_norm": 0.28670433163642883,
"learning_rate": 7.249513688696786e-05,
"loss": 0.8049,
"step": 352
},
{
"epoch": 0.6268590455049945,
"grad_norm": 0.29961690306663513,
"learning_rate": 7.190688002264308e-05,
"loss": 0.762,
"step": 353
},
{
"epoch": 0.6286348501664817,
"grad_norm": 0.2873949706554413,
"learning_rate": 7.131967672889101e-05,
"loss": 0.7389,
"step": 354
},
{
"epoch": 0.6304106548279689,
"grad_norm": 0.3315136730670929,
"learning_rate": 7.073354902747741e-05,
"loss": 0.7719,
"step": 355
},
{
"epoch": 0.6321864594894562,
"grad_norm": 0.31057095527648926,
"learning_rate": 7.014851889983057e-05,
"loss": 0.7407,
"step": 356
},
{
"epoch": 0.6339622641509434,
"grad_norm": 0.345838725566864,
"learning_rate": 6.95646082862164e-05,
"loss": 0.7838,
"step": 357
},
{
"epoch": 0.6357380688124307,
"grad_norm": 0.31915196776390076,
"learning_rate": 6.898183908491617e-05,
"loss": 0.7591,
"step": 358
},
{
"epoch": 0.6375138734739179,
"grad_norm": 0.3124110698699951,
"learning_rate": 6.840023315140475e-05,
"loss": 0.7222,
"step": 359
},
{
"epoch": 0.6392896781354052,
"grad_norm": 0.3307512104511261,
"learning_rate": 6.781981229753145e-05,
"loss": 0.7472,
"step": 360
},
{
"epoch": 0.6410654827968923,
"grad_norm": 0.3425205945968628,
"learning_rate": 6.724059829070158e-05,
"loss": 0.764,
"step": 361
},
{
"epoch": 0.6428412874583795,
"grad_norm": 0.33861225843429565,
"learning_rate": 6.666261285306047e-05,
"loss": 0.7396,
"step": 362
},
{
"epoch": 0.6446170921198668,
"grad_norm": 0.3248923420906067,
"learning_rate": 6.608587766067852e-05,
"loss": 0.7158,
"step": 363
},
{
"epoch": 0.646392896781354,
"grad_norm": 0.349185049533844,
"learning_rate": 6.551041434273861e-05,
"loss": 0.7415,
"step": 364
},
{
"epoch": 0.6481687014428413,
"grad_norm": 0.33934569358825684,
"learning_rate": 6.493624448072457e-05,
"loss": 0.744,
"step": 365
},
{
"epoch": 0.6499445061043285,
"grad_norm": 0.3628052771091461,
"learning_rate": 6.43633896076122e-05,
"loss": 0.7328,
"step": 366
},
{
"epoch": 0.6517203107658157,
"grad_norm": 0.348979115486145,
"learning_rate": 6.379187120706138e-05,
"loss": 0.6755,
"step": 367
},
{
"epoch": 0.653496115427303,
"grad_norm": 0.38474076986312866,
"learning_rate": 6.322171071261071e-05,
"loss": 0.711,
"step": 368
},
{
"epoch": 0.6552719200887902,
"grad_norm": 0.34556257724761963,
"learning_rate": 6.26529295068733e-05,
"loss": 0.6995,
"step": 369
},
{
"epoch": 0.6570477247502775,
"grad_norm": 0.4337230622768402,
"learning_rate": 6.208554892073528e-05,
"loss": 0.7412,
"step": 370
},
{
"epoch": 0.6588235294117647,
"grad_norm": 0.37804853916168213,
"learning_rate": 6.151959023255545e-05,
"loss": 0.6724,
"step": 371
},
{
"epoch": 0.6605993340732519,
"grad_norm": 0.40870919823646545,
"learning_rate": 6.095507466736763e-05,
"loss": 0.7243,
"step": 372
},
{
"epoch": 0.6623751387347392,
"grad_norm": 0.45504140853881836,
"learning_rate": 6.039202339608432e-05,
"loss": 0.7373,
"step": 373
},
{
"epoch": 0.6641509433962264,
"grad_norm": 0.46973538398742676,
"learning_rate": 5.983045753470308e-05,
"loss": 0.7101,
"step": 374
},
{
"epoch": 0.6659267480577137,
"grad_norm": 0.5572993755340576,
"learning_rate": 5.927039814351426e-05,
"loss": 0.7393,
"step": 375
},
{
"epoch": 0.6677025527192009,
"grad_norm": 0.2691468596458435,
"learning_rate": 5.8711866226311553e-05,
"loss": 0.8102,
"step": 376
},
{
"epoch": 0.6694783573806882,
"grad_norm": 0.2898322641849518,
"learning_rate": 5.8154882729603876e-05,
"loss": 0.7968,
"step": 377
},
{
"epoch": 0.6712541620421754,
"grad_norm": 0.3048444092273712,
"learning_rate": 5.7599468541830356e-05,
"loss": 0.775,
"step": 378
},
{
"epoch": 0.6730299667036626,
"grad_norm": 0.3111611604690552,
"learning_rate": 5.7045644492576346e-05,
"loss": 0.7742,
"step": 379
},
{
"epoch": 0.6748057713651499,
"grad_norm": 0.31889772415161133,
"learning_rate": 5.64934313517927e-05,
"loss": 0.7304,
"step": 380
},
{
"epoch": 0.676581576026637,
"grad_norm": 0.3219664692878723,
"learning_rate": 5.5942849829016695e-05,
"loss": 0.7679,
"step": 381
},
{
"epoch": 0.6783573806881243,
"grad_norm": 0.30955034494400024,
"learning_rate": 5.5393920572595356e-05,
"loss": 0.7443,
"step": 382
},
{
"epoch": 0.6801331853496115,
"grad_norm": 0.344043105840683,
"learning_rate": 5.484666416891109e-05,
"loss": 0.7272,
"step": 383
},
{
"epoch": 0.6819089900110987,
"grad_norm": 0.33895599842071533,
"learning_rate": 5.430110114160964e-05,
"loss": 0.7585,
"step": 384
},
{
"epoch": 0.683684794672586,
"grad_norm": 0.37816834449768066,
"learning_rate": 5.375725195083046e-05,
"loss": 0.7749,
"step": 385
},
{
"epoch": 0.6854605993340732,
"grad_norm": 0.3477395176887512,
"learning_rate": 5.321513699243924e-05,
"loss": 0.7022,
"step": 386
},
{
"epoch": 0.6872364039955605,
"grad_norm": 0.3380398154258728,
"learning_rate": 5.2674776597263186e-05,
"loss": 0.7266,
"step": 387
},
{
"epoch": 0.6890122086570477,
"grad_norm": 0.35505762696266174,
"learning_rate": 5.2136191030328455e-05,
"loss": 0.7411,
"step": 388
},
{
"epoch": 0.690788013318535,
"grad_norm": 0.38739171624183655,
"learning_rate": 5.159940049010015e-05,
"loss": 0.7666,
"step": 389
},
{
"epoch": 0.6925638179800222,
"grad_norm": 0.38473132252693176,
"learning_rate": 5.106442510772489e-05,
"loss": 0.7038,
"step": 390
},
{
"epoch": 0.6943396226415094,
"grad_norm": 0.37635302543640137,
"learning_rate": 5.0531284946275784e-05,
"loss": 0.7488,
"step": 391
},
{
"epoch": 0.6961154273029967,
"grad_norm": 0.37422046065330505,
"learning_rate": 5.000000000000002e-05,
"loss": 0.693,
"step": 392
},
{
"epoch": 0.6978912319644839,
"grad_norm": 0.3987278342247009,
"learning_rate": 4.9470590193569044e-05,
"loss": 0.6965,
"step": 393
},
{
"epoch": 0.6996670366259712,
"grad_norm": 0.34372609853744507,
"learning_rate": 4.894307538133129e-05,
"loss": 0.6632,
"step": 394
},
{
"epoch": 0.7014428412874584,
"grad_norm": 0.4215118885040283,
"learning_rate": 4.841747534656763e-05,
"loss": 0.7081,
"step": 395
},
{
"epoch": 0.7032186459489456,
"grad_norm": 0.4211183488368988,
"learning_rate": 4.7893809800749403e-05,
"loss": 0.687,
"step": 396
},
{
"epoch": 0.7049944506104329,
"grad_norm": 0.44248080253601074,
"learning_rate": 4.737209838279922e-05,
"loss": 0.7118,
"step": 397
},
{
"epoch": 0.7067702552719201,
"grad_norm": 0.38100606203079224,
"learning_rate": 4.685236065835443e-05,
"loss": 0.6259,
"step": 398
},
{
"epoch": 0.7085460599334074,
"grad_norm": 0.46482354402542114,
"learning_rate": 4.6334616119033356e-05,
"loss": 0.6668,
"step": 399
},
{
"epoch": 0.7103218645948945,
"grad_norm": 0.5484885573387146,
"learning_rate": 4.5818884181704294e-05,
"loss": 0.7973,
"step": 400
},
{
"epoch": 0.7120976692563818,
"grad_norm": 0.2660059928894043,
"learning_rate": 4.530518418775733e-05,
"loss": 0.7845,
"step": 401
},
{
"epoch": 0.713873473917869,
"grad_norm": 0.30005505681037903,
"learning_rate": 4.479353540237903e-05,
"loss": 0.8141,
"step": 402
},
{
"epoch": 0.7156492785793562,
"grad_norm": 0.3031437397003174,
"learning_rate": 4.4283957013829846e-05,
"loss": 0.7505,
"step": 403
},
{
"epoch": 0.7174250832408435,
"grad_norm": 0.3152884542942047,
"learning_rate": 4.3776468132724604e-05,
"loss": 0.8191,
"step": 404
},
{
"epoch": 0.7192008879023307,
"grad_norm": 0.3122805058956146,
"learning_rate": 4.3271087791315734e-05,
"loss": 0.7732,
"step": 405
},
{
"epoch": 0.720976692563818,
"grad_norm": 0.3241139054298401,
"learning_rate": 4.276783494277954e-05,
"loss": 0.7652,
"step": 406
},
{
"epoch": 0.7227524972253052,
"grad_norm": 0.3523857295513153,
"learning_rate": 4.2266728460505375e-05,
"loss": 0.7923,
"step": 407
},
{
"epoch": 0.7245283018867924,
"grad_norm": 0.3518478274345398,
"learning_rate": 4.176778713738787e-05,
"loss": 0.8046,
"step": 408
},
{
"epoch": 0.7263041065482797,
"grad_norm": 0.35740435123443604,
"learning_rate": 4.127102968512214e-05,
"loss": 0.741,
"step": 409
},
{
"epoch": 0.7280799112097669,
"grad_norm": 0.3561273217201233,
"learning_rate": 4.077647473350201e-05,
"loss": 0.7304,
"step": 410
},
{
"epoch": 0.7298557158712542,
"grad_norm": 0.3595544397830963,
"learning_rate": 4.028414082972141e-05,
"loss": 0.7601,
"step": 411
},
{
"epoch": 0.7316315205327414,
"grad_norm": 0.38603028655052185,
"learning_rate": 3.97940464376787e-05,
"loss": 0.768,
"step": 412
},
{
"epoch": 0.7334073251942287,
"grad_norm": 0.347781240940094,
"learning_rate": 3.9306209937284346e-05,
"loss": 0.7255,
"step": 413
},
{
"epoch": 0.7351831298557159,
"grad_norm": 0.3760242462158203,
"learning_rate": 3.882064962377154e-05,
"loss": 0.7371,
"step": 414
},
{
"epoch": 0.7369589345172031,
"grad_norm": 0.359371542930603,
"learning_rate": 3.83373837070101e-05,
"loss": 0.7422,
"step": 415
},
{
"epoch": 0.7387347391786904,
"grad_norm": 0.3574449419975281,
"learning_rate": 3.7856430310823545e-05,
"loss": 0.6915,
"step": 416
},
{
"epoch": 0.7405105438401776,
"grad_norm": 0.3730245530605316,
"learning_rate": 3.737780747230941e-05,
"loss": 0.7309,
"step": 417
},
{
"epoch": 0.7422863485016649,
"grad_norm": 0.36496400833129883,
"learning_rate": 3.69015331411628e-05,
"loss": 0.7245,
"step": 418
},
{
"epoch": 0.744062153163152,
"grad_norm": 0.3593985140323639,
"learning_rate": 3.642762517900322e-05,
"loss": 0.6389,
"step": 419
},
{
"epoch": 0.7458379578246392,
"grad_norm": 0.3603939116001129,
"learning_rate": 3.595610135870472e-05,
"loss": 0.703,
"step": 420
},
{
"epoch": 0.7476137624861265,
"grad_norm": 0.397124320268631,
"learning_rate": 3.548697936372937e-05,
"loss": 0.7265,
"step": 421
},
{
"epoch": 0.7493895671476137,
"grad_norm": 0.4071907103061676,
"learning_rate": 3.5020276787464056e-05,
"loss": 0.6752,
"step": 422
},
{
"epoch": 0.751165371809101,
"grad_norm": 0.3834024965763092,
"learning_rate": 3.455601113256073e-05,
"loss": 0.6297,
"step": 423
},
{
"epoch": 0.751165371809101,
"eval_loss": 0.7374839186668396,
"eval_runtime": 156.6123,
"eval_samples_per_second": 6.06,
"eval_steps_per_second": 1.52,
"step": 423
}
],
"logging_steps": 1,
"max_steps": 563,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 141,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.571234948741857e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}