{
"best_metric": 0.587491512298584,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.03306741619476708,
"eval_steps": 25,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001653370809738354,
"grad_norm": 1.1458706855773926,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.9686,
"step": 1
},
{
"epoch": 0.0001653370809738354,
"eval_loss": 1.4532192945480347,
"eval_runtime": 1.8909,
"eval_samples_per_second": 26.443,
"eval_steps_per_second": 3.702,
"step": 1
},
{
"epoch": 0.0003306741619476708,
"grad_norm": 1.2934821844100952,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.1419,
"step": 2
},
{
"epoch": 0.0004960112429215062,
"grad_norm": 1.3526453971862793,
"learning_rate": 8.999999999999999e-05,
"loss": 1.1456,
"step": 3
},
{
"epoch": 0.0006613483238953416,
"grad_norm": 1.4010525941848755,
"learning_rate": 0.00011999999999999999,
"loss": 1.2078,
"step": 4
},
{
"epoch": 0.000826685404869177,
"grad_norm": 1.1631656885147095,
"learning_rate": 0.00015,
"loss": 1.0343,
"step": 5
},
{
"epoch": 0.0009920224858430125,
"grad_norm": 0.9487802982330322,
"learning_rate": 0.00017999999999999998,
"loss": 1.0424,
"step": 6
},
{
"epoch": 0.0011573595668168477,
"grad_norm": 0.6479355096817017,
"learning_rate": 0.00020999999999999998,
"loss": 0.9562,
"step": 7
},
{
"epoch": 0.0013226966477906832,
"grad_norm": 1.1392258405685425,
"learning_rate": 0.00023999999999999998,
"loss": 0.8981,
"step": 8
},
{
"epoch": 0.0014880337287645187,
"grad_norm": 1.2654004096984863,
"learning_rate": 0.00027,
"loss": 0.9577,
"step": 9
},
{
"epoch": 0.001653370809738354,
"grad_norm": 0.9595345258712769,
"learning_rate": 0.0003,
"loss": 0.8777,
"step": 10
},
{
"epoch": 0.0018187078907121895,
"grad_norm": 0.5911861062049866,
"learning_rate": 0.0002999794957488703,
"loss": 0.8749,
"step": 11
},
{
"epoch": 0.001984044971686025,
"grad_norm": 0.46297988295555115,
"learning_rate": 0.0002999179886011389,
"loss": 0.7729,
"step": 12
},
{
"epoch": 0.0021493820526598604,
"grad_norm": 0.5073149800300598,
"learning_rate": 0.0002998154953722457,
"loss": 0.8461,
"step": 13
},
{
"epoch": 0.0023147191336336955,
"grad_norm": 0.5063093900680542,
"learning_rate": 0.00029967204408281613,
"loss": 0.7949,
"step": 14
},
{
"epoch": 0.002480056214607531,
"grad_norm": 0.4964233934879303,
"learning_rate": 0.00029948767395100045,
"loss": 0.8145,
"step": 15
},
{
"epoch": 0.0026453932955813665,
"grad_norm": 0.46869364380836487,
"learning_rate": 0.0002992624353817517,
"loss": 0.7973,
"step": 16
},
{
"epoch": 0.002810730376555202,
"grad_norm": 0.48064351081848145,
"learning_rate": 0.0002989963899530457,
"loss": 0.7762,
"step": 17
},
{
"epoch": 0.0029760674575290374,
"grad_norm": 0.4655352532863617,
"learning_rate": 0.00029868961039904624,
"loss": 0.8017,
"step": 18
},
{
"epoch": 0.003141404538502873,
"grad_norm": 0.4899359345436096,
"learning_rate": 0.00029834218059022024,
"loss": 0.7672,
"step": 19
},
{
"epoch": 0.003306741619476708,
"grad_norm": 0.49928662180900574,
"learning_rate": 0.00029795419551040833,
"loss": 0.7774,
"step": 20
},
{
"epoch": 0.0034720787004505435,
"grad_norm": 0.5411067008972168,
"learning_rate": 0.00029752576123085736,
"loss": 0.7915,
"step": 21
},
{
"epoch": 0.003637415781424379,
"grad_norm": 0.49483177065849304,
"learning_rate": 0.0002970569948812214,
"loss": 0.81,
"step": 22
},
{
"epoch": 0.0038027528623982144,
"grad_norm": 0.477321594953537,
"learning_rate": 0.0002965480246175399,
"loss": 0.7135,
"step": 23
},
{
"epoch": 0.00396808994337205,
"grad_norm": 0.5247819423675537,
"learning_rate": 0.0002959989895872009,
"loss": 0.7725,
"step": 24
},
{
"epoch": 0.004133427024345885,
"grad_norm": 0.5035298466682434,
"learning_rate": 0.0002954100398908995,
"loss": 0.6796,
"step": 25
},
{
"epoch": 0.004133427024345885,
"eval_loss": 0.740797221660614,
"eval_runtime": 1.8714,
"eval_samples_per_second": 26.719,
"eval_steps_per_second": 3.741,
"step": 25
},
{
"epoch": 0.004298764105319721,
"grad_norm": 0.5274375081062317,
"learning_rate": 0.0002947813365416023,
"loss": 0.7499,
"step": 26
},
{
"epoch": 0.004464101186293556,
"grad_norm": 0.5270937085151672,
"learning_rate": 0.0002941130514205272,
"loss": 0.72,
"step": 27
},
{
"epoch": 0.004629438267267391,
"grad_norm": 0.5450596213340759,
"learning_rate": 0.0002934053672301536,
"loss": 0.7604,
"step": 28
},
{
"epoch": 0.004794775348241227,
"grad_norm": 0.5546329021453857,
"learning_rate": 0.00029265847744427303,
"loss": 0.7516,
"step": 29
},
{
"epoch": 0.004960112429215062,
"grad_norm": 0.5378047823905945,
"learning_rate": 0.00029187258625509513,
"loss": 0.7325,
"step": 30
},
{
"epoch": 0.005125449510188898,
"grad_norm": 0.5384945869445801,
"learning_rate": 0.00029104790851742417,
"loss": 0.7204,
"step": 31
},
{
"epoch": 0.005290786591162733,
"grad_norm": 0.564714252948761,
"learning_rate": 0.0002901846696899191,
"loss": 0.7475,
"step": 32
},
{
"epoch": 0.005456123672136569,
"grad_norm": 0.5964205861091614,
"learning_rate": 0.00028928310577345606,
"loss": 0.7079,
"step": 33
},
{
"epoch": 0.005621460753110404,
"grad_norm": 0.5732771754264832,
"learning_rate": 0.0002883434632466077,
"loss": 0.6735,
"step": 34
},
{
"epoch": 0.005786797834084239,
"grad_norm": 0.5388650298118591,
"learning_rate": 0.00028736599899825856,
"loss": 0.6702,
"step": 35
},
{
"epoch": 0.005952134915058075,
"grad_norm": 0.5706294775009155,
"learning_rate": 0.00028635098025737434,
"loss": 0.6716,
"step": 36
},
{
"epoch": 0.00611747199603191,
"grad_norm": 0.5575750470161438,
"learning_rate": 0.00028529868451994384,
"loss": 0.6672,
"step": 37
},
{
"epoch": 0.006282809077005746,
"grad_norm": 0.6226919889450073,
"learning_rate": 0.0002842093994731145,
"loss": 0.669,
"step": 38
},
{
"epoch": 0.006448146157979581,
"grad_norm": 0.6209577322006226,
"learning_rate": 0.00028308342291654174,
"loss": 0.6994,
"step": 39
},
{
"epoch": 0.006613483238953416,
"grad_norm": 0.5951119065284729,
"learning_rate": 0.00028192106268097334,
"loss": 0.6644,
"step": 40
},
{
"epoch": 0.006778820319927252,
"grad_norm": 0.6036494374275208,
"learning_rate": 0.00028072263654409154,
"loss": 0.6463,
"step": 41
},
{
"epoch": 0.006944157400901087,
"grad_norm": 0.6368991136550903,
"learning_rate": 0.0002794884721436361,
"loss": 0.63,
"step": 42
},
{
"epoch": 0.007109494481874923,
"grad_norm": 0.6688757538795471,
"learning_rate": 0.00027821890688783083,
"loss": 0.6476,
"step": 43
},
{
"epoch": 0.007274831562848758,
"grad_norm": 0.6986092329025269,
"learning_rate": 0.0002769142878631403,
"loss": 0.7047,
"step": 44
},
{
"epoch": 0.007440168643822593,
"grad_norm": 0.7855607867240906,
"learning_rate": 0.00027557497173937923,
"loss": 0.6652,
"step": 45
},
{
"epoch": 0.007605505724796429,
"grad_norm": 0.7658830881118774,
"learning_rate": 0.000274201324672203,
"loss": 0.705,
"step": 46
},
{
"epoch": 0.007770842805770264,
"grad_norm": 0.7558281421661377,
"learning_rate": 0.00027279372220300385,
"loss": 0.6567,
"step": 47
},
{
"epoch": 0.0079361798867441,
"grad_norm": 0.8528456687927246,
"learning_rate": 0.0002713525491562421,
"loss": 0.6641,
"step": 48
},
{
"epoch": 0.008101516967717935,
"grad_norm": 0.8852099180221558,
"learning_rate": 0.00026987819953423867,
"loss": 0.6517,
"step": 49
},
{
"epoch": 0.00826685404869177,
"grad_norm": 1.2807631492614746,
"learning_rate": 0.00026837107640945905,
"loss": 0.7514,
"step": 50
},
{
"epoch": 0.00826685404869177,
"eval_loss": 0.7300238013267517,
"eval_runtime": 1.8747,
"eval_samples_per_second": 26.671,
"eval_steps_per_second": 3.734,
"step": 50
},
{
"epoch": 0.008432191129665605,
"grad_norm": 0.662638783454895,
"learning_rate": 0.0002668315918143169,
"loss": 0.5879,
"step": 51
},
{
"epoch": 0.008597528210639442,
"grad_norm": 0.6220346689224243,
"learning_rate": 0.00026526016662852886,
"loss": 0.6805,
"step": 52
},
{
"epoch": 0.008762865291613277,
"grad_norm": 0.4488891065120697,
"learning_rate": 0.00026365723046405023,
"loss": 0.6268,
"step": 53
},
{
"epoch": 0.008928202372587112,
"grad_norm": 0.45153406262397766,
"learning_rate": 0.0002620232215476231,
"loss": 0.6505,
"step": 54
},
{
"epoch": 0.009093539453560947,
"grad_norm": 0.4724684953689575,
"learning_rate": 0.0002603585866009697,
"loss": 0.618,
"step": 55
},
{
"epoch": 0.009258876534534782,
"grad_norm": 0.44388478994369507,
"learning_rate": 0.00025866378071866334,
"loss": 0.628,
"step": 56
},
{
"epoch": 0.009424213615508619,
"grad_norm": 0.4252602159976959,
"learning_rate": 0.00025693926724370956,
"loss": 0.6648,
"step": 57
},
{
"epoch": 0.009589550696482454,
"grad_norm": 0.44454148411750793,
"learning_rate": 0.00025518551764087326,
"loss": 0.7274,
"step": 58
},
{
"epoch": 0.009754887777456289,
"grad_norm": 0.4203338325023651,
"learning_rate": 0.00025340301136778483,
"loss": 0.6249,
"step": 59
},
{
"epoch": 0.009920224858430124,
"grad_norm": 0.4334009885787964,
"learning_rate": 0.00025159223574386114,
"loss": 0.626,
"step": 60
},
{
"epoch": 0.010085561939403959,
"grad_norm": 0.4594862163066864,
"learning_rate": 0.0002497536858170772,
"loss": 0.7358,
"step": 61
},
{
"epoch": 0.010250899020377796,
"grad_norm": 0.4198598563671112,
"learning_rate": 0.00024788786422862526,
"loss": 0.6522,
"step": 62
},
{
"epoch": 0.01041623610135163,
"grad_norm": 0.47395503520965576,
"learning_rate": 0.00024599528107549745,
"loss": 0.6878,
"step": 63
},
{
"epoch": 0.010581573182325466,
"grad_norm": 0.42716339230537415,
"learning_rate": 0.00024407645377103054,
"loss": 0.662,
"step": 64
},
{
"epoch": 0.010746910263299301,
"grad_norm": 0.40592852234840393,
"learning_rate": 0.00024213190690345018,
"loss": 0.692,
"step": 65
},
{
"epoch": 0.010912247344273138,
"grad_norm": 0.4136260747909546,
"learning_rate": 0.00024016217209245374,
"loss": 0.6462,
"step": 66
},
{
"epoch": 0.011077584425246973,
"grad_norm": 0.43495914340019226,
"learning_rate": 0.00023816778784387094,
"loss": 0.672,
"step": 67
},
{
"epoch": 0.011242921506220808,
"grad_norm": 0.4204349219799042,
"learning_rate": 0.0002361492994024415,
"loss": 0.6677,
"step": 68
},
{
"epoch": 0.011408258587194643,
"grad_norm": 0.41397807002067566,
"learning_rate": 0.0002341072586027509,
"loss": 0.6314,
"step": 69
},
{
"epoch": 0.011573595668168478,
"grad_norm": 0.4293558895587921,
"learning_rate": 0.00023204222371836405,
"loss": 0.6423,
"step": 70
},
{
"epoch": 0.011738932749142315,
"grad_norm": 0.41890963912010193,
"learning_rate": 0.00022995475930919905,
"loss": 0.5675,
"step": 71
},
{
"epoch": 0.01190426983011615,
"grad_norm": 0.4755418002605438,
"learning_rate": 0.00022784543606718227,
"loss": 0.7091,
"step": 72
},
{
"epoch": 0.012069606911089985,
"grad_norm": 0.4745832085609436,
"learning_rate": 0.00022571483066022657,
"loss": 0.6212,
"step": 73
},
{
"epoch": 0.01223494399206382,
"grad_norm": 0.48046061396598816,
"learning_rate": 0.0002235635255745762,
"loss": 0.6447,
"step": 74
},
{
"epoch": 0.012400281073037655,
"grad_norm": 0.48077794909477234,
"learning_rate": 0.00022139210895556104,
"loss": 0.6251,
"step": 75
},
{
"epoch": 0.012400281073037655,
"eval_loss": 0.6494519114494324,
"eval_runtime": 1.8961,
"eval_samples_per_second": 26.37,
"eval_steps_per_second": 3.692,
"step": 75
},
{
"epoch": 0.012565618154011492,
"grad_norm": 0.518615186214447,
"learning_rate": 0.00021920117444680317,
"loss": 0.6074,
"step": 76
},
{
"epoch": 0.012730955234985327,
"grad_norm": 0.5402657985687256,
"learning_rate": 0.00021699132102792097,
"loss": 0.6575,
"step": 77
},
{
"epoch": 0.012896292315959162,
"grad_norm": 0.5036848783493042,
"learning_rate": 0.0002147631528507739,
"loss": 0.6517,
"step": 78
},
{
"epoch": 0.013061629396932997,
"grad_norm": 0.517012894153595,
"learning_rate": 0.00021251727907429355,
"loss": 0.6246,
"step": 79
},
{
"epoch": 0.013226966477906832,
"grad_norm": 0.4987259805202484,
"learning_rate": 0.0002102543136979454,
"loss": 0.5903,
"step": 80
},
{
"epoch": 0.013392303558880669,
"grad_norm": 0.5765737295150757,
"learning_rate": 0.0002079748753938678,
"loss": 0.6795,
"step": 81
},
{
"epoch": 0.013557640639854504,
"grad_norm": 0.5101305246353149,
"learning_rate": 0.0002056795873377331,
"loss": 0.5953,
"step": 82
},
{
"epoch": 0.013722977720828339,
"grad_norm": 0.5157580375671387,
"learning_rate": 0.00020336907703837748,
"loss": 0.6183,
"step": 83
},
{
"epoch": 0.013888314801802174,
"grad_norm": 0.5230023860931396,
"learning_rate": 0.00020104397616624645,
"loss": 0.6494,
"step": 84
},
{
"epoch": 0.014053651882776009,
"grad_norm": 0.6248013973236084,
"learning_rate": 0.00019870492038070252,
"loss": 0.6383,
"step": 85
},
{
"epoch": 0.014218988963749846,
"grad_norm": 0.5421463847160339,
"learning_rate": 0.0001963525491562421,
"loss": 0.5986,
"step": 86
},
{
"epoch": 0.01438432604472368,
"grad_norm": 0.5650635957717896,
"learning_rate": 0.0001939875056076697,
"loss": 0.6032,
"step": 87
},
{
"epoch": 0.014549663125697516,
"grad_norm": 0.5500821471214294,
"learning_rate": 0.00019161043631427666,
"loss": 0.5598,
"step": 88
},
{
"epoch": 0.01471500020667135,
"grad_norm": 0.6109346151351929,
"learning_rate": 0.00018922199114307294,
"loss": 0.696,
"step": 89
},
{
"epoch": 0.014880337287645186,
"grad_norm": 0.5851466059684753,
"learning_rate": 0.00018682282307111987,
"loss": 0.5489,
"step": 90
},
{
"epoch": 0.015045674368619023,
"grad_norm": 0.585314929485321,
"learning_rate": 0.00018441358800701273,
"loss": 0.5473,
"step": 91
},
{
"epoch": 0.015211011449592858,
"grad_norm": 0.6997946500778198,
"learning_rate": 0.00018199494461156203,
"loss": 0.6722,
"step": 92
},
{
"epoch": 0.015376348530566693,
"grad_norm": 0.6760852336883545,
"learning_rate": 0.000179567554117722,
"loss": 0.5968,
"step": 93
},
{
"epoch": 0.015541685611540528,
"grad_norm": 0.6954542994499207,
"learning_rate": 0.00017713208014981648,
"loss": 0.6346,
"step": 94
},
{
"epoch": 0.015707022692514363,
"grad_norm": 0.706315279006958,
"learning_rate": 0.00017468918854211007,
"loss": 0.6147,
"step": 95
},
{
"epoch": 0.0158723597734882,
"grad_norm": 0.7285186052322388,
"learning_rate": 0.00017223954715677627,
"loss": 0.6042,
"step": 96
},
{
"epoch": 0.016037696854462033,
"grad_norm": 0.7623412013053894,
"learning_rate": 0.00016978382570131034,
"loss": 0.645,
"step": 97
},
{
"epoch": 0.01620303393543587,
"grad_norm": 0.8401342630386353,
"learning_rate": 0.00016732269554543794,
"loss": 0.596,
"step": 98
},
{
"epoch": 0.016368371016409707,
"grad_norm": 0.8520634174346924,
"learning_rate": 0.00016485682953756942,
"loss": 0.5435,
"step": 99
},
{
"epoch": 0.01653370809738354,
"grad_norm": 1.2710648775100708,
"learning_rate": 0.00016238690182084986,
"loss": 0.6776,
"step": 100
},
{
"epoch": 0.01653370809738354,
"eval_loss": 0.6591371893882751,
"eval_runtime": 1.8728,
"eval_samples_per_second": 26.698,
"eval_steps_per_second": 3.738,
"step": 100
},
{
"epoch": 0.016699045178357377,
"grad_norm": 0.5595121383666992,
"learning_rate": 0.0001599135876488549,
"loss": 0.529,
"step": 101
},
{
"epoch": 0.01686438225933121,
"grad_norm": 0.6090410947799683,
"learning_rate": 0.00015743756320098332,
"loss": 0.6318,
"step": 102
},
{
"epoch": 0.017029719340305047,
"grad_norm": 0.5462281703948975,
"learning_rate": 0.0001549595053975962,
"loss": 0.6279,
"step": 103
},
{
"epoch": 0.017195056421278884,
"grad_norm": 0.470628559589386,
"learning_rate": 0.00015248009171495378,
"loss": 0.6353,
"step": 104
},
{
"epoch": 0.017360393502252717,
"grad_norm": 0.40791985392570496,
"learning_rate": 0.00015,
"loss": 0.6027,
"step": 105
},
{
"epoch": 0.017525730583226554,
"grad_norm": 0.42699339985847473,
"learning_rate": 0.00014751990828504622,
"loss": 0.6589,
"step": 106
},
{
"epoch": 0.017691067664200387,
"grad_norm": 0.42080384492874146,
"learning_rate": 0.00014504049460240375,
"loss": 0.6261,
"step": 107
},
{
"epoch": 0.017856404745174224,
"grad_norm": 0.42046162486076355,
"learning_rate": 0.00014256243679901663,
"loss": 0.6235,
"step": 108
},
{
"epoch": 0.01802174182614806,
"grad_norm": 0.42919886112213135,
"learning_rate": 0.00014008641235114508,
"loss": 0.615,
"step": 109
},
{
"epoch": 0.018187078907121894,
"grad_norm": 0.41335728764533997,
"learning_rate": 0.00013761309817915014,
"loss": 0.6216,
"step": 110
},
{
"epoch": 0.01835241598809573,
"grad_norm": 0.407696396112442,
"learning_rate": 0.00013514317046243058,
"loss": 0.6429,
"step": 111
},
{
"epoch": 0.018517753069069564,
"grad_norm": 0.4250294864177704,
"learning_rate": 0.00013267730445456208,
"loss": 0.6084,
"step": 112
},
{
"epoch": 0.0186830901500434,
"grad_norm": 0.40817221999168396,
"learning_rate": 0.00013021617429868963,
"loss": 0.6541,
"step": 113
},
{
"epoch": 0.018848427231017238,
"grad_norm": 0.43035033345222473,
"learning_rate": 0.00012776045284322368,
"loss": 0.6516,
"step": 114
},
{
"epoch": 0.01901376431199107,
"grad_norm": 0.40453994274139404,
"learning_rate": 0.00012531081145788987,
"loss": 0.6025,
"step": 115
},
{
"epoch": 0.019179101392964908,
"grad_norm": 0.39926087856292725,
"learning_rate": 0.00012286791985018355,
"loss": 0.5548,
"step": 116
},
{
"epoch": 0.01934443847393874,
"grad_norm": 0.43014204502105713,
"learning_rate": 0.00012043244588227796,
"loss": 0.5593,
"step": 117
},
{
"epoch": 0.019509775554912578,
"grad_norm": 0.4401710629463196,
"learning_rate": 0.00011800505538843798,
"loss": 0.6202,
"step": 118
},
{
"epoch": 0.019675112635886415,
"grad_norm": 0.4146181344985962,
"learning_rate": 0.00011558641199298727,
"loss": 0.5496,
"step": 119
},
{
"epoch": 0.019840449716860248,
"grad_norm": 0.4258963167667389,
"learning_rate": 0.00011317717692888012,
"loss": 0.5361,
"step": 120
},
{
"epoch": 0.020005786797834085,
"grad_norm": 0.45607423782348633,
"learning_rate": 0.00011077800885692702,
"loss": 0.5844,
"step": 121
},
{
"epoch": 0.020171123878807918,
"grad_norm": 0.445334255695343,
"learning_rate": 0.00010838956368572334,
"loss": 0.5698,
"step": 122
},
{
"epoch": 0.020336460959781755,
"grad_norm": 0.4584890604019165,
"learning_rate": 0.0001060124943923303,
"loss": 0.5761,
"step": 123
},
{
"epoch": 0.02050179804075559,
"grad_norm": 0.4687618911266327,
"learning_rate": 0.0001036474508437579,
"loss": 0.6092,
"step": 124
},
{
"epoch": 0.020667135121729425,
"grad_norm": 0.4850897192955017,
"learning_rate": 0.00010129507961929748,
"loss": 0.6443,
"step": 125
},
{
"epoch": 0.020667135121729425,
"eval_loss": 0.6090381145477295,
"eval_runtime": 1.8935,
"eval_samples_per_second": 26.406,
"eval_steps_per_second": 3.697,
"step": 125
},
{
"epoch": 0.02083247220270326,
"grad_norm": 0.474062979221344,
"learning_rate": 9.895602383375353e-05,
"loss": 0.6096,
"step": 126
},
{
"epoch": 0.0209978092836771,
"grad_norm": 0.5057529211044312,
"learning_rate": 9.663092296162251e-05,
"loss": 0.6463,
"step": 127
},
{
"epoch": 0.02116314636465093,
"grad_norm": 0.4857181906700134,
"learning_rate": 9.432041266226686e-05,
"loss": 0.5901,
"step": 128
},
{
"epoch": 0.02132848344562477,
"grad_norm": 0.48144418001174927,
"learning_rate": 9.202512460613219e-05,
"loss": 0.5829,
"step": 129
},
{
"epoch": 0.021493820526598602,
"grad_norm": 0.500487744808197,
"learning_rate": 8.97456863020546e-05,
"loss": 0.4955,
"step": 130
},
{
"epoch": 0.02165915760757244,
"grad_norm": 0.5146012902259827,
"learning_rate": 8.748272092570646e-05,
"loss": 0.6286,
"step": 131
},
{
"epoch": 0.021824494688546275,
"grad_norm": 0.48560625314712524,
"learning_rate": 8.523684714922608e-05,
"loss": 0.5786,
"step": 132
},
{
"epoch": 0.02198983176952011,
"grad_norm": 0.4935201108455658,
"learning_rate": 8.300867897207903e-05,
"loss": 0.5155,
"step": 133
},
{
"epoch": 0.022155168850493945,
"grad_norm": 0.5251325964927673,
"learning_rate": 8.079882555319684e-05,
"loss": 0.618,
"step": 134
},
{
"epoch": 0.02232050593146778,
"grad_norm": 0.5414552092552185,
"learning_rate": 7.860789104443896e-05,
"loss": 0.5664,
"step": 135
},
{
"epoch": 0.022485843012441616,
"grad_norm": 0.5745555758476257,
"learning_rate": 7.643647442542382e-05,
"loss": 0.597,
"step": 136
},
{
"epoch": 0.022651180093415452,
"grad_norm": 0.5867097973823547,
"learning_rate": 7.428516933977347e-05,
"loss": 0.6112,
"step": 137
},
{
"epoch": 0.022816517174389286,
"grad_norm": 0.617051362991333,
"learning_rate": 7.215456393281776e-05,
"loss": 0.5667,
"step": 138
},
{
"epoch": 0.022981854255363122,
"grad_norm": 0.6218559741973877,
"learning_rate": 7.004524069080096e-05,
"loss": 0.5878,
"step": 139
},
{
"epoch": 0.023147191336336956,
"grad_norm": 0.5842517614364624,
"learning_rate": 6.795777628163599e-05,
"loss": 0.5362,
"step": 140
},
{
"epoch": 0.023312528417310793,
"grad_norm": 0.6078557968139648,
"learning_rate": 6.58927413972491e-05,
"loss": 0.561,
"step": 141
},
{
"epoch": 0.02347786549828463,
"grad_norm": 0.6673625111579895,
"learning_rate": 6.385070059755846e-05,
"loss": 0.5567,
"step": 142
},
{
"epoch": 0.023643202579258463,
"grad_norm": 0.6668835282325745,
"learning_rate": 6.183221215612904e-05,
"loss": 0.5856,
"step": 143
},
{
"epoch": 0.0238085396602323,
"grad_norm": 0.6619709730148315,
"learning_rate": 5.983782790754623e-05,
"loss": 0.5695,
"step": 144
},
{
"epoch": 0.023973876741206133,
"grad_norm": 0.7347633838653564,
"learning_rate": 5.786809309654982e-05,
"loss": 0.6575,
"step": 145
},
{
"epoch": 0.02413921382217997,
"grad_norm": 0.681242823600769,
"learning_rate": 5.592354622896944e-05,
"loss": 0.4653,
"step": 146
},
{
"epoch": 0.024304550903153806,
"grad_norm": 0.6918179988861084,
"learning_rate": 5.40047189245025e-05,
"loss": 0.4623,
"step": 147
},
{
"epoch": 0.02446988798412764,
"grad_norm": 0.8781294226646423,
"learning_rate": 5.211213577137469e-05,
"loss": 0.6373,
"step": 148
},
{
"epoch": 0.024635225065101476,
"grad_norm": 0.8795979619026184,
"learning_rate": 5.024631418292274e-05,
"loss": 0.5014,
"step": 149
},
{
"epoch": 0.02480056214607531,
"grad_norm": 1.1492856740951538,
"learning_rate": 4.840776425613886e-05,
"loss": 0.5911,
"step": 150
},
{
"epoch": 0.02480056214607531,
"eval_loss": 0.6090312004089355,
"eval_runtime": 1.8777,
"eval_samples_per_second": 26.628,
"eval_steps_per_second": 3.728,
"step": 150
},
{
"epoch": 0.024965899227049147,
"grad_norm": 0.3518829047679901,
"learning_rate": 4.659698863221513e-05,
"loss": 0.4504,
"step": 151
},
{
"epoch": 0.025131236308022983,
"grad_norm": 0.3778611719608307,
"learning_rate": 4.481448235912671e-05,
"loss": 0.5729,
"step": 152
},
{
"epoch": 0.025296573388996817,
"grad_norm": 0.4172021448612213,
"learning_rate": 4.306073275629044e-05,
"loss": 0.5852,
"step": 153
},
{
"epoch": 0.025461910469970653,
"grad_norm": 0.4507042169570923,
"learning_rate": 4.133621928133665e-05,
"loss": 0.6136,
"step": 154
},
{
"epoch": 0.025627247550944487,
"grad_norm": 0.45023030042648315,
"learning_rate": 3.964141339903026e-05,
"loss": 0.6059,
"step": 155
},
{
"epoch": 0.025792584631918324,
"grad_norm": 0.4398917257785797,
"learning_rate": 3.797677845237696e-05,
"loss": 0.611,
"step": 156
},
{
"epoch": 0.02595792171289216,
"grad_norm": 0.426236629486084,
"learning_rate": 3.634276953594982e-05,
"loss": 0.5538,
"step": 157
},
{
"epoch": 0.026123258793865994,
"grad_norm": 0.4280710816383362,
"learning_rate": 3.473983337147118e-05,
"loss": 0.6121,
"step": 158
},
{
"epoch": 0.02628859587483983,
"grad_norm": 0.39926227927207947,
"learning_rate": 3.316840818568315e-05,
"loss": 0.5853,
"step": 159
},
{
"epoch": 0.026453932955813664,
"grad_norm": 0.3878289759159088,
"learning_rate": 3.162892359054098e-05,
"loss": 0.5152,
"step": 160
},
{
"epoch": 0.0266192700367875,
"grad_norm": 0.42500802874565125,
"learning_rate": 3.0121800465761293e-05,
"loss": 0.6141,
"step": 161
},
{
"epoch": 0.026784607117761337,
"grad_norm": 0.42845767736434937,
"learning_rate": 2.8647450843757897e-05,
"loss": 0.5855,
"step": 162
},
{
"epoch": 0.02694994419873517,
"grad_norm": 0.4272685647010803,
"learning_rate": 2.7206277796996144e-05,
"loss": 0.5931,
"step": 163
},
{
"epoch": 0.027115281279709007,
"grad_norm": 0.44554266333580017,
"learning_rate": 2.5798675327796993e-05,
"loss": 0.6541,
"step": 164
},
{
"epoch": 0.02728061836068284,
"grad_norm": 0.44582295417785645,
"learning_rate": 2.4425028260620715e-05,
"loss": 0.63,
"step": 165
},
{
"epoch": 0.027445955441656678,
"grad_norm": 0.4189260005950928,
"learning_rate": 2.3085712136859668e-05,
"loss": 0.5798,
"step": 166
},
{
"epoch": 0.027611292522630514,
"grad_norm": 0.41501569747924805,
"learning_rate": 2.178109311216913e-05,
"loss": 0.5838,
"step": 167
},
{
"epoch": 0.027776629603604348,
"grad_norm": 0.4392796754837036,
"learning_rate": 2.0511527856363912e-05,
"loss": 0.5914,
"step": 168
},
{
"epoch": 0.027941966684578184,
"grad_norm": 0.4092908501625061,
"learning_rate": 1.927736345590839e-05,
"loss": 0.5392,
"step": 169
},
{
"epoch": 0.028107303765552018,
"grad_norm": 0.4480851888656616,
"learning_rate": 1.8078937319026654e-05,
"loss": 0.5967,
"step": 170
},
{
"epoch": 0.028272640846525855,
"grad_norm": 0.45969000458717346,
"learning_rate": 1.6916577083458228e-05,
"loss": 0.6781,
"step": 171
},
{
"epoch": 0.02843797792749969,
"grad_norm": 0.4783402979373932,
"learning_rate": 1.579060052688548e-05,
"loss": 0.6911,
"step": 172
},
{
"epoch": 0.028603315008473525,
"grad_norm": 0.4612955152988434,
"learning_rate": 1.4701315480056164e-05,
"loss": 0.5574,
"step": 173
},
{
"epoch": 0.02876865208944736,
"grad_norm": 0.45882025361061096,
"learning_rate": 1.3649019742625623e-05,
"loss": 0.5463,
"step": 174
},
{
"epoch": 0.028933989170421195,
"grad_norm": 0.4618784189224243,
"learning_rate": 1.2634001001741373e-05,
"loss": 0.5609,
"step": 175
},
{
"epoch": 0.028933989170421195,
"eval_loss": 0.5873807668685913,
"eval_runtime": 1.8942,
"eval_samples_per_second": 26.397,
"eval_steps_per_second": 3.696,
"step": 175
},
{
"epoch": 0.02909932625139503,
"grad_norm": 0.4690948724746704,
"learning_rate": 1.1656536753392287e-05,
"loss": 0.5743,
"step": 176
},
{
"epoch": 0.02926466333236887,
"grad_norm": 0.4852280914783478,
"learning_rate": 1.0716894226543953e-05,
"loss": 0.6567,
"step": 177
},
{
"epoch": 0.0294300004133427,
"grad_norm": 0.47134000062942505,
"learning_rate": 9.815330310080887e-06,
"loss": 0.5788,
"step": 178
},
{
"epoch": 0.02959533749431654,
"grad_norm": 0.4749037027359009,
"learning_rate": 8.952091482575824e-06,
"loss": 0.5837,
"step": 179
},
{
"epoch": 0.029760674575290372,
"grad_norm": 0.48075568675994873,
"learning_rate": 8.127413744904804e-06,
"loss": 0.5929,
"step": 180
},
{
"epoch": 0.02992601165626421,
"grad_norm": 0.49417707324028015,
"learning_rate": 7.34152255572697e-06,
"loss": 0.5795,
"step": 181
},
{
"epoch": 0.030091348737238045,
"grad_norm": 0.49787455797195435,
"learning_rate": 6.594632769846353e-06,
"loss": 0.5481,
"step": 182
},
{
"epoch": 0.03025668581821188,
"grad_norm": 0.5144615173339844,
"learning_rate": 5.886948579472778e-06,
"loss": 0.5726,
"step": 183
},
{
"epoch": 0.030422022899185715,
"grad_norm": 0.5090218186378479,
"learning_rate": 5.218663458397715e-06,
"loss": 0.5759,
"step": 184
},
{
"epoch": 0.03058735998015955,
"grad_norm": 0.499025821685791,
"learning_rate": 4.589960109100444e-06,
"loss": 0.5272,
"step": 185
},
{
"epoch": 0.030752697061133386,
"grad_norm": 0.5264498591423035,
"learning_rate": 4.001010412799138e-06,
"loss": 0.5944,
"step": 186
},
{
"epoch": 0.030918034142107222,
"grad_norm": 0.5010021328926086,
"learning_rate": 3.451975382460109e-06,
"loss": 0.5202,
"step": 187
},
{
"epoch": 0.031083371223081056,
"grad_norm": 0.5727500319480896,
"learning_rate": 2.9430051187785962e-06,
"loss": 0.6056,
"step": 188
},
{
"epoch": 0.031248708304054892,
"grad_norm": 0.5564864873886108,
"learning_rate": 2.4742387691426445e-06,
"loss": 0.5835,
"step": 189
},
{
"epoch": 0.031414045385028726,
"grad_norm": 0.5777730941772461,
"learning_rate": 2.0458044895916513e-06,
"loss": 0.5449,
"step": 190
},
{
"epoch": 0.031579382466002566,
"grad_norm": 0.6091128587722778,
"learning_rate": 1.6578194097797258e-06,
"loss": 0.6131,
"step": 191
},
{
"epoch": 0.0317447195469764,
"grad_norm": 0.596784770488739,
"learning_rate": 1.3103896009537207e-06,
"loss": 0.5658,
"step": 192
},
{
"epoch": 0.03191005662795023,
"grad_norm": 0.6246776580810547,
"learning_rate": 1.0036100469542786e-06,
"loss": 0.5655,
"step": 193
},
{
"epoch": 0.032075393708924066,
"grad_norm": 0.6081706285476685,
"learning_rate": 7.375646182482875e-07,
"loss": 0.5007,
"step": 194
},
{
"epoch": 0.032240730789897906,
"grad_norm": 0.6714624166488647,
"learning_rate": 5.123260489995229e-07,
"loss": 0.5301,
"step": 195
},
{
"epoch": 0.03240606787087174,
"grad_norm": 0.731858491897583,
"learning_rate": 3.2795591718381975e-07,
"loss": 0.5636,
"step": 196
},
{
"epoch": 0.03257140495184557,
"grad_norm": 0.7109096050262451,
"learning_rate": 1.8450462775428942e-07,
"loss": 0.5681,
"step": 197
},
{
"epoch": 0.03273674203281941,
"grad_norm": 0.6945204138755798,
"learning_rate": 8.201139886109264e-08,
"loss": 0.4689,
"step": 198
},
{
"epoch": 0.032902079113793246,
"grad_norm": 0.8688917756080627,
"learning_rate": 2.0504251129649374e-08,
"loss": 0.5341,
"step": 199
},
{
"epoch": 0.03306741619476708,
"grad_norm": 1.1077585220336914,
"learning_rate": 0.0,
"loss": 0.6469,
"step": 200
},
{
"epoch": 0.03306741619476708,
"eval_loss": 0.587491512298584,
"eval_runtime": 1.8735,
"eval_samples_per_second": 26.688,
"eval_steps_per_second": 3.736,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.466616566153216e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}