{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 6341,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004731114966093676,
"grad_norm": 0.9585382342338562,
"learning_rate": 7.235790156711095e-05,
"loss": 1.19,
"step": 30
},
{
"epoch": 0.009462229932187352,
"grad_norm": 0.9438452124595642,
"learning_rate": 8.817139967814685e-05,
"loss": 1.0589,
"step": 60
},
{
"epoch": 0.014193344898281028,
"grad_norm": 0.9442492723464966,
"learning_rate": 9.722413360750843e-05,
"loss": 1.0764,
"step": 90
},
{
"epoch": 0.018924459864374705,
"grad_norm": 0.8840267658233643,
"learning_rate": 9.994621104255655e-05,
"loss": 1.0847,
"step": 120
},
{
"epoch": 0.02365557483046838,
"grad_norm": 0.8207218050956726,
"learning_rate": 9.985445340927068e-05,
"loss": 1.0912,
"step": 150
},
{
"epoch": 0.028386689796562056,
"grad_norm": 0.8883314728736877,
"learning_rate": 9.975953171966461e-05,
"loss": 1.0608,
"step": 180
},
{
"epoch": 0.03154076644062451,
"eval_loss": 1.2097724676132202,
"eval_runtime": 3.756,
"eval_samples_per_second": 26.89,
"eval_steps_per_second": 3.461,
"step": 200
},
{
"epoch": 0.03311780476265573,
"grad_norm": 0.7577874064445496,
"learning_rate": 9.966461003005853e-05,
"loss": 1.0802,
"step": 210
},
{
"epoch": 0.03784891972874941,
"grad_norm": 1.4911932945251465,
"learning_rate": 9.956968834045246e-05,
"loss": 1.0397,
"step": 240
},
{
"epoch": 0.04258003469484308,
"grad_norm": 0.8236317038536072,
"learning_rate": 9.947476665084638e-05,
"loss": 1.0575,
"step": 270
},
{
"epoch": 0.04731114966093676,
"grad_norm": 0.7883521318435669,
"learning_rate": 9.937984496124031e-05,
"loss": 1.0369,
"step": 300
},
{
"epoch": 0.05204226462703044,
"grad_norm": 0.7798565626144409,
"learning_rate": 9.928492327163424e-05,
"loss": 1.0354,
"step": 330
},
{
"epoch": 0.05677337959312411,
"grad_norm": 0.7784315943717957,
"learning_rate": 9.919000158202817e-05,
"loss": 1.0341,
"step": 360
},
{
"epoch": 0.06150449455921779,
"grad_norm": 0.836300790309906,
"learning_rate": 9.909507989242209e-05,
"loss": 1.0272,
"step": 390
},
{
"epoch": 0.06308153288124901,
"eval_loss": 1.1889104843139648,
"eval_runtime": 3.7553,
"eval_samples_per_second": 26.895,
"eval_steps_per_second": 3.462,
"step": 400
},
{
"epoch": 0.06623560952531146,
"grad_norm": 0.7245925664901733,
"learning_rate": 9.900015820281602e-05,
"loss": 1.0256,
"step": 420
},
{
"epoch": 0.07096672449140515,
"grad_norm": 0.8321049213409424,
"learning_rate": 9.890523651320994e-05,
"loss": 1.0332,
"step": 450
},
{
"epoch": 0.07569783945749882,
"grad_norm": 0.7657173275947571,
"learning_rate": 9.881031482360387e-05,
"loss": 1.0221,
"step": 480
},
{
"epoch": 0.08042895442359249,
"grad_norm": 0.7464463114738464,
"learning_rate": 9.871539313399779e-05,
"loss": 0.9911,
"step": 510
},
{
"epoch": 0.08516006938968616,
"grad_norm": 0.7290617227554321,
"learning_rate": 9.862047144439172e-05,
"loss": 1.0258,
"step": 540
},
{
"epoch": 0.08989118435577985,
"grad_norm": 0.7311350703239441,
"learning_rate": 9.852554975478564e-05,
"loss": 1.0165,
"step": 570
},
{
"epoch": 0.09462229932187352,
"grad_norm": 0.8087915182113647,
"learning_rate": 9.843062806517957e-05,
"loss": 0.9716,
"step": 600
},
{
"epoch": 0.09462229932187352,
"eval_loss": 1.1471492052078247,
"eval_runtime": 3.7536,
"eval_samples_per_second": 26.907,
"eval_steps_per_second": 3.463,
"step": 600
},
{
"epoch": 0.09935341428796719,
"grad_norm": 0.7442970275878906,
"learning_rate": 9.833570637557348e-05,
"loss": 0.9747,
"step": 630
},
{
"epoch": 0.10408452925406088,
"grad_norm": 0.9510965347290039,
"learning_rate": 9.824078468596742e-05,
"loss": 0.9582,
"step": 660
},
{
"epoch": 0.10881564422015455,
"grad_norm": 0.6995567083358765,
"learning_rate": 9.814586299636133e-05,
"loss": 1.0118,
"step": 690
},
{
"epoch": 0.11354675918624822,
"grad_norm": 0.9319436550140381,
"learning_rate": 9.805094130675526e-05,
"loss": 0.9815,
"step": 720
},
{
"epoch": 0.11827787415234191,
"grad_norm": 0.7033783793449402,
"learning_rate": 9.795601961714918e-05,
"loss": 0.9738,
"step": 750
},
{
"epoch": 0.12300898911843558,
"grad_norm": 0.6606217622756958,
"learning_rate": 9.786109792754311e-05,
"loss": 0.961,
"step": 780
},
{
"epoch": 0.12616306576249803,
"eval_loss": 1.125948190689087,
"eval_runtime": 3.7557,
"eval_samples_per_second": 26.892,
"eval_steps_per_second": 3.461,
"step": 800
},
{
"epoch": 0.12774010408452927,
"grad_norm": 0.9087960124015808,
"learning_rate": 9.776617623793703e-05,
"loss": 0.9734,
"step": 810
},
{
"epoch": 0.13247121905062292,
"grad_norm": 0.7387025952339172,
"learning_rate": 9.767125454833097e-05,
"loss": 0.9605,
"step": 840
},
{
"epoch": 0.1372023340167166,
"grad_norm": 0.7939543724060059,
"learning_rate": 9.757633285872489e-05,
"loss": 0.952,
"step": 870
},
{
"epoch": 0.1419334489828103,
"grad_norm": 1.1417864561080933,
"learning_rate": 9.748141116911882e-05,
"loss": 0.9113,
"step": 900
},
{
"epoch": 0.14666456394890395,
"grad_norm": 0.7591778635978699,
"learning_rate": 9.738648947951274e-05,
"loss": 0.9565,
"step": 930
},
{
"epoch": 0.15139567891499764,
"grad_norm": 0.759545087814331,
"learning_rate": 9.729156778990667e-05,
"loss": 0.9401,
"step": 960
},
{
"epoch": 0.1561267938810913,
"grad_norm": 0.700552761554718,
"learning_rate": 9.719664610030059e-05,
"loss": 0.9447,
"step": 990
},
{
"epoch": 0.15770383220312253,
"eval_loss": 1.0677810907363892,
"eval_runtime": 3.7551,
"eval_samples_per_second": 26.897,
"eval_steps_per_second": 3.462,
"step": 1000
},
{
"epoch": 0.16085790884718498,
"grad_norm": 0.6673519015312195,
"learning_rate": 9.710172441069452e-05,
"loss": 0.8919,
"step": 1020
},
{
"epoch": 0.16558902381327867,
"grad_norm": 0.8046931028366089,
"learning_rate": 9.700680272108844e-05,
"loss": 0.9136,
"step": 1050
},
{
"epoch": 0.17032013877937233,
"grad_norm": 0.7277413606643677,
"learning_rate": 9.691188103148237e-05,
"loss": 0.9001,
"step": 1080
},
{
"epoch": 0.175051253745466,
"grad_norm": 0.661359429359436,
"learning_rate": 9.681695934187629e-05,
"loss": 0.9119,
"step": 1110
},
{
"epoch": 0.1797823687115597,
"grad_norm": 0.7349006533622742,
"learning_rate": 9.672203765227022e-05,
"loss": 0.8825,
"step": 1140
},
{
"epoch": 0.18451348367765336,
"grad_norm": 0.7114729285240173,
"learning_rate": 9.662711596266414e-05,
"loss": 0.8872,
"step": 1170
},
{
"epoch": 0.18924459864374704,
"grad_norm": 0.6496574282646179,
"learning_rate": 9.653219427305807e-05,
"loss": 0.8809,
"step": 1200
},
{
"epoch": 0.18924459864374704,
"eval_loss": 1.0253973007202148,
"eval_runtime": 3.7532,
"eval_samples_per_second": 26.91,
"eval_steps_per_second": 3.464,
"step": 1200
},
{
"epoch": 0.19397571360984073,
"grad_norm": 0.6576619744300842,
"learning_rate": 9.643727258345198e-05,
"loss": 0.876,
"step": 1230
},
{
"epoch": 0.19870682857593439,
"grad_norm": 0.666749119758606,
"learning_rate": 9.634235089384591e-05,
"loss": 0.8877,
"step": 1260
},
{
"epoch": 0.20343794354202807,
"grad_norm": 0.7769750952720642,
"learning_rate": 9.624742920423983e-05,
"loss": 0.8894,
"step": 1290
},
{
"epoch": 0.20816905850812176,
"grad_norm": 0.6562801599502563,
"learning_rate": 9.615250751463376e-05,
"loss": 0.8912,
"step": 1320
},
{
"epoch": 0.21290017347421542,
"grad_norm": 0.6531364917755127,
"learning_rate": 9.605758582502768e-05,
"loss": 0.875,
"step": 1350
},
{
"epoch": 0.2176312884403091,
"grad_norm": 0.6414660811424255,
"learning_rate": 9.596266413542163e-05,
"loss": 0.8721,
"step": 1380
},
{
"epoch": 0.22078536508437155,
"eval_loss": 1.0128834247589111,
"eval_runtime": 3.7539,
"eval_samples_per_second": 26.906,
"eval_steps_per_second": 3.463,
"step": 1400
},
{
"epoch": 0.2223624034064028,
"grad_norm": 0.8413099646568298,
"learning_rate": 9.586774244581554e-05,
"loss": 0.8807,
"step": 1410
},
{
"epoch": 0.22709351837249644,
"grad_norm": 0.6748294830322266,
"learning_rate": 9.577282075620947e-05,
"loss": 0.8245,
"step": 1440
},
{
"epoch": 0.23182463333859013,
"grad_norm": 0.7067525386810303,
"learning_rate": 9.567789906660339e-05,
"loss": 0.8767,
"step": 1470
},
{
"epoch": 0.23655574830468382,
"grad_norm": 1.074791431427002,
"learning_rate": 9.558297737699732e-05,
"loss": 0.8856,
"step": 1500
},
{
"epoch": 0.24128686327077747,
"grad_norm": 0.7461240887641907,
"learning_rate": 9.548805568739124e-05,
"loss": 0.8759,
"step": 1530
},
{
"epoch": 0.24601797823687116,
"grad_norm": 0.6231616139411926,
"learning_rate": 9.539313399778517e-05,
"loss": 0.837,
"step": 1560
},
{
"epoch": 0.25074909320296485,
"grad_norm": 0.7053641080856323,
"learning_rate": 9.529821230817909e-05,
"loss": 0.8763,
"step": 1590
},
{
"epoch": 0.25232613152499606,
"eval_loss": 0.9505324959754944,
"eval_runtime": 3.7563,
"eval_samples_per_second": 26.888,
"eval_steps_per_second": 3.461,
"step": 1600
},
{
"epoch": 0.25548020816905853,
"grad_norm": 0.6484207510948181,
"learning_rate": 9.520329061857302e-05,
"loss": 0.8787,
"step": 1620
},
{
"epoch": 0.26021132313515216,
"grad_norm": 0.5929827094078064,
"learning_rate": 9.510836892896694e-05,
"loss": 0.844,
"step": 1650
},
{
"epoch": 0.26494243810124585,
"grad_norm": 0.6840829849243164,
"learning_rate": 9.501344723936087e-05,
"loss": 0.8492,
"step": 1680
},
{
"epoch": 0.26967355306733953,
"grad_norm": 0.7365448474884033,
"learning_rate": 9.491852554975479e-05,
"loss": 0.8584,
"step": 1710
},
{
"epoch": 0.2744046680334332,
"grad_norm": 0.6528182029724121,
"learning_rate": 9.482360386014872e-05,
"loss": 0.8346,
"step": 1740
},
{
"epoch": 0.2791357829995269,
"grad_norm": 0.6200223565101624,
"learning_rate": 9.472868217054263e-05,
"loss": 0.8008,
"step": 1770
},
{
"epoch": 0.2838668979656206,
"grad_norm": 0.7503982186317444,
"learning_rate": 9.463376048093657e-05,
"loss": 0.8197,
"step": 1800
},
{
"epoch": 0.2838668979656206,
"eval_loss": 0.9286572933197021,
"eval_runtime": 3.7535,
"eval_samples_per_second": 26.908,
"eval_steps_per_second": 3.463,
"step": 1800
},
{
"epoch": 0.2885980129317142,
"grad_norm": 0.6671140193939209,
"learning_rate": 9.453883879133048e-05,
"loss": 0.8405,
"step": 1830
},
{
"epoch": 0.2933291278978079,
"grad_norm": 0.7057023048400879,
"learning_rate": 9.444391710172441e-05,
"loss": 0.7822,
"step": 1860
},
{
"epoch": 0.2980602428639016,
"grad_norm": 0.8120527267456055,
"learning_rate": 9.434899541211833e-05,
"loss": 0.8416,
"step": 1890
},
{
"epoch": 0.3027913578299953,
"grad_norm": 0.622718334197998,
"learning_rate": 9.425407372251228e-05,
"loss": 0.8174,
"step": 1920
},
{
"epoch": 0.30752247279608896,
"grad_norm": 0.6605896353721619,
"learning_rate": 9.41591520329062e-05,
"loss": 0.8003,
"step": 1950
},
{
"epoch": 0.3122535877621826,
"grad_norm": 0.7473495006561279,
"learning_rate": 9.406423034330012e-05,
"loss": 0.798,
"step": 1980
},
{
"epoch": 0.31540766440624507,
"eval_loss": 0.8976284861564636,
"eval_runtime": 3.7537,
"eval_samples_per_second": 26.907,
"eval_steps_per_second": 3.463,
"step": 2000
},
{
"epoch": 0.3169847027282763,
"grad_norm": 0.7177520394325256,
"learning_rate": 9.396930865369404e-05,
"loss": 0.8168,
"step": 2010
},
{
"epoch": 0.32171581769436997,
"grad_norm": 0.7600869536399841,
"learning_rate": 9.387438696408797e-05,
"loss": 0.7918,
"step": 2040
},
{
"epoch": 0.32644693266046365,
"grad_norm": 0.7001503109931946,
"learning_rate": 9.377946527448189e-05,
"loss": 0.7906,
"step": 2070
},
{
"epoch": 0.33117804762655734,
"grad_norm": 0.6279382705688477,
"learning_rate": 9.368454358487582e-05,
"loss": 0.7624,
"step": 2100
},
{
"epoch": 0.335909162592651,
"grad_norm": 0.7481889128684998,
"learning_rate": 9.358962189526974e-05,
"loss": 0.7849,
"step": 2130
},
{
"epoch": 0.34064027755874465,
"grad_norm": 0.6797828078269958,
"learning_rate": 9.349470020566367e-05,
"loss": 0.7899,
"step": 2160
},
{
"epoch": 0.34537139252483834,
"grad_norm": 0.6929941177368164,
"learning_rate": 9.339977851605759e-05,
"loss": 0.7703,
"step": 2190
},
{
"epoch": 0.3469484308468696,
"eval_loss": 0.8858568072319031,
"eval_runtime": 3.7538,
"eval_samples_per_second": 26.906,
"eval_steps_per_second": 3.463,
"step": 2200
},
{
"epoch": 0.350102507490932,
"grad_norm": 0.698906660079956,
"learning_rate": 9.330485682645152e-05,
"loss": 0.7724,
"step": 2220
},
{
"epoch": 0.3548336224570257,
"grad_norm": 0.779211163520813,
"learning_rate": 9.320993513684544e-05,
"loss": 0.7875,
"step": 2250
},
{
"epoch": 0.3595647374231194,
"grad_norm": 0.7313475608825684,
"learning_rate": 9.311817750355957e-05,
"loss": 0.794,
"step": 2280
},
{
"epoch": 0.3642958523892131,
"grad_norm": 0.6143506169319153,
"learning_rate": 9.30232558139535e-05,
"loss": 0.7742,
"step": 2310
},
{
"epoch": 0.3690269673553067,
"grad_norm": 0.6775010824203491,
"learning_rate": 9.292833412434741e-05,
"loss": 0.7822,
"step": 2340
},
{
"epoch": 0.3737580823214004,
"grad_norm": 0.7151722311973572,
"learning_rate": 9.283341243474134e-05,
"loss": 0.7617,
"step": 2370
},
{
"epoch": 0.3784891972874941,
"grad_norm": 0.6855128407478333,
"learning_rate": 9.273849074513526e-05,
"loss": 0.7668,
"step": 2400
},
{
"epoch": 0.3784891972874941,
"eval_loss": 0.8862702250480652,
"eval_runtime": 3.7541,
"eval_samples_per_second": 26.904,
"eval_steps_per_second": 3.463,
"step": 2400
},
{
"epoch": 0.38322031225358777,
"grad_norm": 0.743325412273407,
"learning_rate": 9.26435690555292e-05,
"loss": 0.7885,
"step": 2430
},
{
"epoch": 0.38795142721968146,
"grad_norm": 0.6186659932136536,
"learning_rate": 9.254864736592311e-05,
"loss": 0.7619,
"step": 2460
},
{
"epoch": 0.39268254218577514,
"grad_norm": 0.6791619062423706,
"learning_rate": 9.245372567631704e-05,
"loss": 0.8084,
"step": 2490
},
{
"epoch": 0.39741365715186877,
"grad_norm": 0.6537867784500122,
"learning_rate": 9.235880398671097e-05,
"loss": 0.7641,
"step": 2520
},
{
"epoch": 0.40214477211796246,
"grad_norm": 0.6688680052757263,
"learning_rate": 9.22638822971049e-05,
"loss": 0.7634,
"step": 2550
},
{
"epoch": 0.40687588708405614,
"grad_norm": 0.6369423866271973,
"learning_rate": 9.216896060749882e-05,
"loss": 0.7407,
"step": 2580
},
{
"epoch": 0.4100299637281186,
"eval_loss": 0.8817442059516907,
"eval_runtime": 3.7541,
"eval_samples_per_second": 26.904,
"eval_steps_per_second": 3.463,
"step": 2600
},
{
"epoch": 0.41160700205014983,
"grad_norm": 0.6841573119163513,
"learning_rate": 9.207403891789275e-05,
"loss": 0.7572,
"step": 2610
},
{
"epoch": 0.4163381170162435,
"grad_norm": 0.625957727432251,
"learning_rate": 9.197911722828667e-05,
"loss": 0.7493,
"step": 2640
},
{
"epoch": 0.42106923198233714,
"grad_norm": 0.7467941641807556,
"learning_rate": 9.18841955386806e-05,
"loss": 0.7468,
"step": 2670
},
{
"epoch": 0.42580034694843083,
"grad_norm": 0.6891815662384033,
"learning_rate": 9.178927384907452e-05,
"loss": 0.7698,
"step": 2700
},
{
"epoch": 0.4305314619145245,
"grad_norm": 0.6197889447212219,
"learning_rate": 9.169435215946845e-05,
"loss": 0.7588,
"step": 2730
},
{
"epoch": 0.4352625768806182,
"grad_norm": 0.7140328884124756,
"learning_rate": 9.159943046986237e-05,
"loss": 0.7569,
"step": 2760
},
{
"epoch": 0.4399936918467119,
"grad_norm": 0.7718496322631836,
"learning_rate": 9.15045087802563e-05,
"loss": 0.7448,
"step": 2790
},
{
"epoch": 0.4415707301687431,
"eval_loss": 0.8855557441711426,
"eval_runtime": 3.7544,
"eval_samples_per_second": 26.902,
"eval_steps_per_second": 3.463,
"step": 2800
},
{
"epoch": 0.4447248068128056,
"grad_norm": 0.6447039246559143,
"learning_rate": 9.140958709065022e-05,
"loss": 0.7623,
"step": 2820
},
{
"epoch": 0.4494559217788992,
"grad_norm": 0.6694769859313965,
"learning_rate": 9.131466540104415e-05,
"loss": 0.7081,
"step": 2850
},
{
"epoch": 0.4541870367449929,
"grad_norm": 0.6863081455230713,
"learning_rate": 9.121974371143806e-05,
"loss": 0.7228,
"step": 2880
},
{
"epoch": 0.4589181517110866,
"grad_norm": 0.7198454737663269,
"learning_rate": 9.1124822021832e-05,
"loss": 0.7356,
"step": 2910
},
{
"epoch": 0.46364926667718026,
"grad_norm": 0.6542885303497314,
"learning_rate": 9.102990033222591e-05,
"loss": 0.7606,
"step": 2940
},
{
"epoch": 0.46838038164327395,
"grad_norm": 0.657539963722229,
"learning_rate": 9.093497864261984e-05,
"loss": 0.7255,
"step": 2970
},
{
"epoch": 0.47311149660936763,
"grad_norm": 0.819503664970398,
"learning_rate": 9.084005695301376e-05,
"loss": 0.7184,
"step": 3000
},
{
"epoch": 0.47311149660936763,
"eval_loss": 0.8140414357185364,
"eval_runtime": 3.7531,
"eval_samples_per_second": 26.911,
"eval_steps_per_second": 3.464,
"step": 3000
},
{
"epoch": 0.47784261157546126,
"grad_norm": 0.7199704647064209,
"learning_rate": 9.074513526340769e-05,
"loss": 0.7227,
"step": 3030
},
{
"epoch": 0.48257372654155495,
"grad_norm": 0.7655025720596313,
"learning_rate": 9.065021357380162e-05,
"loss": 0.7217,
"step": 3060
},
{
"epoch": 0.48730484150764863,
"grad_norm": 0.7312873601913452,
"learning_rate": 9.055845594051574e-05,
"loss": 0.7059,
"step": 3090
},
{
"epoch": 0.4920359564737423,
"grad_norm": 0.5961809158325195,
"learning_rate": 9.046353425090967e-05,
"loss": 0.7033,
"step": 3120
},
{
"epoch": 0.496767071439836,
"grad_norm": 0.6955564022064209,
"learning_rate": 9.03686125613036e-05,
"loss": 0.7289,
"step": 3150
},
{
"epoch": 0.5014981864059297,
"grad_norm": 0.6622660160064697,
"learning_rate": 9.027369087169752e-05,
"loss": 0.6935,
"step": 3180
},
{
"epoch": 0.5046522630499921,
"eval_loss": 0.7775673270225525,
"eval_runtime": 3.754,
"eval_samples_per_second": 26.904,
"eval_steps_per_second": 3.463,
"step": 3200
},
{
"epoch": 0.5062293013720234,
"grad_norm": 0.7262014746665955,
"learning_rate": 9.017876918209145e-05,
"loss": 0.6906,
"step": 3210
},
{
"epoch": 0.5109604163381171,
"grad_norm": 0.7221697568893433,
"learning_rate": 9.008384749248537e-05,
"loss": 0.7079,
"step": 3240
},
{
"epoch": 0.5156915313042106,
"grad_norm": 0.7115603089332581,
"learning_rate": 8.99889258028793e-05,
"loss": 0.7191,
"step": 3270
},
{
"epoch": 0.5204226462703043,
"grad_norm": 0.7292232513427734,
"learning_rate": 8.989400411327322e-05,
"loss": 0.6702,
"step": 3300
},
{
"epoch": 0.525153761236398,
"grad_norm": 0.741580605506897,
"learning_rate": 8.979908242366715e-05,
"loss": 0.6762,
"step": 3330
},
{
"epoch": 0.5298848762024917,
"grad_norm": 0.7870708107948303,
"learning_rate": 8.970416073406108e-05,
"loss": 0.6838,
"step": 3360
},
{
"epoch": 0.5346159911685854,
"grad_norm": 0.71812903881073,
"learning_rate": 8.9609239044455e-05,
"loss": 0.7174,
"step": 3390
},
{
"epoch": 0.5361930294906166,
"eval_loss": 0.7375061511993408,
"eval_runtime": 3.7548,
"eval_samples_per_second": 26.899,
"eval_steps_per_second": 3.462,
"step": 3400
},
{
"epoch": 0.5393471061346791,
"grad_norm": 0.7266995906829834,
"learning_rate": 8.951431735484893e-05,
"loss": 0.6763,
"step": 3420
},
{
"epoch": 0.5440782211007728,
"grad_norm": 0.7786857485771179,
"learning_rate": 8.941939566524284e-05,
"loss": 0.7149,
"step": 3450
},
{
"epoch": 0.5488093360668664,
"grad_norm": 0.7807109355926514,
"learning_rate": 8.932447397563677e-05,
"loss": 0.6534,
"step": 3480
},
{
"epoch": 0.5535404510329601,
"grad_norm": 0.6960239410400391,
"learning_rate": 8.922955228603069e-05,
"loss": 0.7313,
"step": 3510
},
{
"epoch": 0.5582715659990538,
"grad_norm": 0.586615264415741,
"learning_rate": 8.913463059642462e-05,
"loss": 0.6579,
"step": 3540
},
{
"epoch": 0.5630026809651475,
"grad_norm": 0.9740248918533325,
"learning_rate": 8.903970890681854e-05,
"loss": 0.7013,
"step": 3570
},
{
"epoch": 0.5677337959312412,
"grad_norm": 0.6628558039665222,
"learning_rate": 8.894478721721247e-05,
"loss": 0.6546,
"step": 3600
},
{
"epoch": 0.5677337959312412,
"eval_loss": 0.7031014561653137,
"eval_runtime": 3.7542,
"eval_samples_per_second": 26.903,
"eval_steps_per_second": 3.463,
"step": 3600
},
{
"epoch": 0.5724649108973348,
"grad_norm": 0.6030669808387756,
"learning_rate": 8.884986552760639e-05,
"loss": 0.7146,
"step": 3630
},
{
"epoch": 0.5771960258634284,
"grad_norm": 0.6010313034057617,
"learning_rate": 8.875494383800032e-05,
"loss": 0.6816,
"step": 3660
},
{
"epoch": 0.5819271408295221,
"grad_norm": 0.6319311857223511,
"learning_rate": 8.866002214839425e-05,
"loss": 0.6642,
"step": 3690
},
{
"epoch": 0.5866582557956158,
"grad_norm": 0.6059941053390503,
"learning_rate": 8.856510045878817e-05,
"loss": 0.6998,
"step": 3720
},
{
"epoch": 0.5913893707617095,
"grad_norm": 0.5976997017860413,
"learning_rate": 8.84701787691821e-05,
"loss": 0.6694,
"step": 3750
},
{
"epoch": 0.5961204857278032,
"grad_norm": 0.6985177993774414,
"learning_rate": 8.837525707957602e-05,
"loss": 0.6402,
"step": 3780
},
{
"epoch": 0.5992745623718656,
"eval_loss": 0.6977850198745728,
"eval_runtime": 3.7545,
"eval_samples_per_second": 26.901,
"eval_steps_per_second": 3.462,
"step": 3800
},
{
"epoch": 0.6008516006938969,
"grad_norm": 0.7076742053031921,
"learning_rate": 8.828033538996995e-05,
"loss": 0.6749,
"step": 3810
},
{
"epoch": 0.6055827156599906,
"grad_norm": 0.9254401326179504,
"learning_rate": 8.818541370036387e-05,
"loss": 0.6481,
"step": 3840
},
{
"epoch": 0.6103138306260842,
"grad_norm": 0.7403334379196167,
"learning_rate": 8.80904920107578e-05,
"loss": 0.6704,
"step": 3870
},
{
"epoch": 0.6150449455921779,
"grad_norm": 0.6302973628044128,
"learning_rate": 8.799557032115171e-05,
"loss": 0.6717,
"step": 3900
},
{
"epoch": 0.6197760605582716,
"grad_norm": 0.7587308287620544,
"learning_rate": 8.790064863154565e-05,
"loss": 0.6526,
"step": 3930
},
{
"epoch": 0.6245071755243652,
"grad_norm": 0.768151581287384,
"learning_rate": 8.780572694193956e-05,
"loss": 0.6614,
"step": 3960
},
{
"epoch": 0.6292382904904589,
"grad_norm": 0.662624716758728,
"learning_rate": 8.77108052523335e-05,
"loss": 0.6471,
"step": 3990
},
{
"epoch": 0.6308153288124901,
"eval_loss": 0.6685364246368408,
"eval_runtime": 3.7533,
"eval_samples_per_second": 26.909,
"eval_steps_per_second": 3.464,
"step": 4000
},
{
"epoch": 0.6339694054565526,
"grad_norm": 0.614434540271759,
"learning_rate": 8.761588356272743e-05,
"loss": 0.6305,
"step": 4020
},
{
"epoch": 0.6387005204226462,
"grad_norm": 0.7292618751525879,
"learning_rate": 8.752096187312134e-05,
"loss": 0.632,
"step": 4050
},
{
"epoch": 0.6434316353887399,
"grad_norm": 0.5890663862228394,
"learning_rate": 8.742604018351527e-05,
"loss": 0.6594,
"step": 4080
},
{
"epoch": 0.6481627503548336,
"grad_norm": 0.6511669158935547,
"learning_rate": 8.733111849390919e-05,
"loss": 0.6417,
"step": 4110
},
{
"epoch": 0.6528938653209273,
"grad_norm": 0.6794877648353577,
"learning_rate": 8.723619680430312e-05,
"loss": 0.6472,
"step": 4140
},
{
"epoch": 0.657624980287021,
"grad_norm": 0.5826547145843506,
"learning_rate": 8.714127511469704e-05,
"loss": 0.6255,
"step": 4170
},
{
"epoch": 0.6623560952531147,
"grad_norm": 0.8411812782287598,
"learning_rate": 8.704635342509097e-05,
"loss": 0.6368,
"step": 4200
},
{
"epoch": 0.6623560952531147,
"eval_loss": 0.6538847088813782,
"eval_runtime": 3.7543,
"eval_samples_per_second": 26.903,
"eval_steps_per_second": 3.463,
"step": 4200
},
{
"epoch": 0.6670872102192084,
"grad_norm": 0.5682166218757629,
"learning_rate": 8.69514317354849e-05,
"loss": 0.6269,
"step": 4230
},
{
"epoch": 0.671818325185302,
"grad_norm": 0.6340855360031128,
"learning_rate": 8.685651004587882e-05,
"loss": 0.6423,
"step": 4260
},
{
"epoch": 0.6765494401513957,
"grad_norm": 0.6693681478500366,
"learning_rate": 8.676158835627275e-05,
"loss": 0.6471,
"step": 4290
},
{
"epoch": 0.6812805551174893,
"grad_norm": 0.6101056337356567,
"learning_rate": 8.666666666666667e-05,
"loss": 0.6168,
"step": 4320
},
{
"epoch": 0.686011670083583,
"grad_norm": 0.6096228361129761,
"learning_rate": 8.65717449770606e-05,
"loss": 0.6494,
"step": 4350
},
{
"epoch": 0.6907427850496767,
"grad_norm": 0.6632306575775146,
"learning_rate": 8.647682328745452e-05,
"loss": 0.664,
"step": 4380
},
{
"epoch": 0.6938968616937392,
"eval_loss": 0.6377571225166321,
"eval_runtime": 3.756,
"eval_samples_per_second": 26.89,
"eval_steps_per_second": 3.461,
"step": 4400
},
{
"epoch": 0.6954739000157704,
"grad_norm": 0.6547721028327942,
"learning_rate": 8.638190159784845e-05,
"loss": 0.6091,
"step": 4410
},
{
"epoch": 0.700205014981864,
"grad_norm": 0.6063847541809082,
"learning_rate": 8.628697990824237e-05,
"loss": 0.6055,
"step": 4440
},
{
"epoch": 0.7049361299479577,
"grad_norm": 0.6687933802604675,
"learning_rate": 8.61920582186363e-05,
"loss": 0.601,
"step": 4470
},
{
"epoch": 0.7096672449140514,
"grad_norm": 0.701770007610321,
"learning_rate": 8.609713652903021e-05,
"loss": 0.6064,
"step": 4500
},
{
"epoch": 0.7143983598801451,
"grad_norm": 0.6652805209159851,
"learning_rate": 8.600221483942414e-05,
"loss": 0.653,
"step": 4530
},
{
"epoch": 0.7191294748462388,
"grad_norm": 0.6469018459320068,
"learning_rate": 8.590729314981806e-05,
"loss": 0.6019,
"step": 4560
},
{
"epoch": 0.7238605898123325,
"grad_norm": 0.6343564391136169,
"learning_rate": 8.5812371460212e-05,
"loss": 0.6083,
"step": 4590
},
{
"epoch": 0.7254376281343636,
"eval_loss": 0.6411118507385254,
"eval_runtime": 3.754,
"eval_samples_per_second": 26.905,
"eval_steps_per_second": 3.463,
"step": 4600
},
{
"epoch": 0.7285917047784262,
"grad_norm": 0.5817134976387024,
"learning_rate": 8.571744977060592e-05,
"loss": 0.602,
"step": 4620
},
{
"epoch": 0.7333228197445197,
"grad_norm": 0.5552039742469788,
"learning_rate": 8.562252808099984e-05,
"loss": 0.6223,
"step": 4650
},
{
"epoch": 0.7380539347106134,
"grad_norm": 0.6455065011978149,
"learning_rate": 8.552760639139377e-05,
"loss": 0.5865,
"step": 4680
},
{
"epoch": 0.7427850496767071,
"grad_norm": 0.6448588371276855,
"learning_rate": 8.543268470178769e-05,
"loss": 0.6126,
"step": 4710
},
{
"epoch": 0.7475161646428008,
"grad_norm": 0.6447100639343262,
"learning_rate": 8.533776301218162e-05,
"loss": 0.6167,
"step": 4740
},
{
"epoch": 0.7522472796088945,
"grad_norm": 0.6894412636756897,
"learning_rate": 8.524284132257555e-05,
"loss": 0.5851,
"step": 4770
},
{
"epoch": 0.7569783945749882,
"grad_norm": 0.6036236882209778,
"learning_rate": 8.514791963296947e-05,
"loss": 0.6025,
"step": 4800
},
{
"epoch": 0.7569783945749882,
"eval_loss": 0.6117845177650452,
"eval_runtime": 3.7554,
"eval_samples_per_second": 26.894,
"eval_steps_per_second": 3.462,
"step": 4800
},
{
"epoch": 0.7617095095410819,
"grad_norm": 0.6214340925216675,
"learning_rate": 8.50529979433634e-05,
"loss": 0.6145,
"step": 4830
},
{
"epoch": 0.7664406245071755,
"grad_norm": 0.6933445334434509,
"learning_rate": 8.495807625375732e-05,
"loss": 0.6184,
"step": 4860
},
{
"epoch": 0.7711717394732692,
"grad_norm": 0.5649739503860474,
"learning_rate": 8.486315456415125e-05,
"loss": 0.5996,
"step": 4890
},
{
"epoch": 0.7759028544393629,
"grad_norm": 0.6250168085098267,
"learning_rate": 8.476823287454517e-05,
"loss": 0.5762,
"step": 4920
},
{
"epoch": 0.7806339694054566,
"grad_norm": 1.7125053405761719,
"learning_rate": 8.46733111849391e-05,
"loss": 0.5716,
"step": 4950
},
{
"epoch": 0.7853650843715503,
"grad_norm": 0.5721966028213501,
"learning_rate": 8.457838949533302e-05,
"loss": 0.5612,
"step": 4980
},
{
"epoch": 0.7885191610156127,
"eval_loss": 0.5980841517448425,
"eval_runtime": 3.7547,
"eval_samples_per_second": 26.9,
"eval_steps_per_second": 3.462,
"step": 5000
},
{
"epoch": 0.7900961993376439,
"grad_norm": 0.6716078519821167,
"learning_rate": 8.448346780572695e-05,
"loss": 0.5765,
"step": 5010
},
{
"epoch": 0.7948273143037375,
"grad_norm": 0.6005885601043701,
"learning_rate": 8.438854611612086e-05,
"loss": 0.5941,
"step": 5040
},
{
"epoch": 0.7995584292698312,
"grad_norm": 0.6507188081741333,
"learning_rate": 8.42936244265148e-05,
"loss": 0.5827,
"step": 5070
},
{
"epoch": 0.8042895442359249,
"grad_norm": 0.7276827096939087,
"learning_rate": 8.419870273690871e-05,
"loss": 0.5555,
"step": 5100
},
{
"epoch": 0.8090206592020186,
"grad_norm": 0.6792399287223816,
"learning_rate": 8.410378104730264e-05,
"loss": 0.5724,
"step": 5130
},
{
"epoch": 0.8137517741681123,
"grad_norm": 0.7074045538902283,
"learning_rate": 8.400885935769656e-05,
"loss": 0.5724,
"step": 5160
},
{
"epoch": 0.818482889134206,
"grad_norm": 0.6056311130523682,
"learning_rate": 8.391393766809049e-05,
"loss": 0.5546,
"step": 5190
},
{
"epoch": 0.8200599274562372,
"eval_loss": 0.5805890560150146,
"eval_runtime": 3.7561,
"eval_samples_per_second": 26.889,
"eval_steps_per_second": 3.461,
"step": 5200
},
{
"epoch": 0.8232140041002997,
"grad_norm": 0.6667674779891968,
"learning_rate": 8.381901597848441e-05,
"loss": 0.6173,
"step": 5220
},
{
"epoch": 0.8279451190663933,
"grad_norm": 0.607284426689148,
"learning_rate": 8.372409428887834e-05,
"loss": 0.5781,
"step": 5250
},
{
"epoch": 0.832676234032487,
"grad_norm": 0.6476745009422302,
"learning_rate": 8.362917259927227e-05,
"loss": 0.5667,
"step": 5280
},
{
"epoch": 0.8374073489985807,
"grad_norm": 0.6668260097503662,
"learning_rate": 8.35342509096662e-05,
"loss": 0.5456,
"step": 5310
},
{
"epoch": 0.8421384639646743,
"grad_norm": 0.585110068321228,
"learning_rate": 8.343932922006012e-05,
"loss": 0.5648,
"step": 5340
},
{
"epoch": 0.846869578930768,
"grad_norm": 0.6268571019172668,
"learning_rate": 8.334757158677425e-05,
"loss": 0.555,
"step": 5370
},
{
"epoch": 0.8516006938968617,
"grad_norm": 0.6197232604026794,
"learning_rate": 8.325264989716818e-05,
"loss": 0.5333,
"step": 5400
},
{
"epoch": 0.8516006938968617,
"eval_loss": 0.5601951479911804,
"eval_runtime": 3.7534,
"eval_samples_per_second": 26.909,
"eval_steps_per_second": 3.464,
"step": 5400
},
{
"epoch": 0.8563318088629553,
"grad_norm": 0.63880455493927,
"learning_rate": 8.31577282075621e-05,
"loss": 0.5602,
"step": 5430
},
{
"epoch": 0.861062923829049,
"grad_norm": 0.6235695481300354,
"learning_rate": 8.306280651795603e-05,
"loss": 0.5604,
"step": 5460
},
{
"epoch": 0.8657940387951427,
"grad_norm": 0.9000911712646484,
"learning_rate": 8.296788482834995e-05,
"loss": 0.5654,
"step": 5490
},
{
"epoch": 0.8705251537612364,
"grad_norm": 0.6557802557945251,
"learning_rate": 8.287612719506408e-05,
"loss": 0.5962,
"step": 5520
},
{
"epoch": 0.8752562687273301,
"grad_norm": 0.6231096982955933,
"learning_rate": 8.278120550545801e-05,
"loss": 0.5636,
"step": 5550
},
{
"epoch": 0.8799873836934238,
"grad_norm": 0.5984258651733398,
"learning_rate": 8.268628381585192e-05,
"loss": 0.5616,
"step": 5580
},
{
"epoch": 0.8831414603374862,
"eval_loss": 0.5611711740493774,
"eval_runtime": 3.7542,
"eval_samples_per_second": 26.903,
"eval_steps_per_second": 3.463,
"step": 5600
},
{
"epoch": 0.8847184986595175,
"grad_norm": 0.5818042159080505,
"learning_rate": 8.259452618256605e-05,
"loss": 0.5316,
"step": 5610
},
{
"epoch": 0.8894496136256111,
"grad_norm": 0.7120912671089172,
"learning_rate": 8.249960449295998e-05,
"loss": 0.5556,
"step": 5640
},
{
"epoch": 0.8941807285917048,
"grad_norm": 0.6223446130752563,
"learning_rate": 8.24046828033539e-05,
"loss": 0.5452,
"step": 5670
},
{
"epoch": 0.8989118435577984,
"grad_norm": 0.6196858286857605,
"learning_rate": 8.230976111374783e-05,
"loss": 0.5601,
"step": 5700
},
{
"epoch": 0.9036429585238921,
"grad_norm": 0.6353973150253296,
"learning_rate": 8.221483942414175e-05,
"loss": 0.5402,
"step": 5730
},
{
"epoch": 0.9083740734899858,
"grad_norm": 0.6631510257720947,
"learning_rate": 8.211991773453568e-05,
"loss": 0.5382,
"step": 5760
},
{
"epoch": 0.9131051884560795,
"grad_norm": 0.6404465436935425,
"learning_rate": 8.20249960449296e-05,
"loss": 0.5298,
"step": 5790
},
{
"epoch": 0.9146822267781107,
"eval_loss": 0.560188353061676,
"eval_runtime": 3.7541,
"eval_samples_per_second": 26.904,
"eval_steps_per_second": 3.463,
"step": 5800
},
{
"epoch": 0.9178363034221731,
"grad_norm": 0.6810153126716614,
"learning_rate": 8.193007435532353e-05,
"loss": 0.5159,
"step": 5820
},
{
"epoch": 0.9225674183882668,
"grad_norm": 0.5828801989555359,
"learning_rate": 8.183515266571745e-05,
"loss": 0.5155,
"step": 5850
},
{
"epoch": 0.9272985333543605,
"grad_norm": 0.538987934589386,
"learning_rate": 8.174023097611138e-05,
"loss": 0.5273,
"step": 5880
},
{
"epoch": 0.9320296483204542,
"grad_norm": 0.6222363114356995,
"learning_rate": 8.16453092865053e-05,
"loss": 0.526,
"step": 5910
},
{
"epoch": 0.9367607632865479,
"grad_norm": 0.542966902256012,
"learning_rate": 8.155038759689923e-05,
"loss": 0.5653,
"step": 5940
},
{
"epoch": 0.9414918782526416,
"grad_norm": 0.7064533829689026,
"learning_rate": 8.145546590729315e-05,
"loss": 0.5207,
"step": 5970
},
{
"epoch": 0.9462229932187353,
"grad_norm": 0.6652514934539795,
"learning_rate": 8.136054421768708e-05,
"loss": 0.5342,
"step": 6000
},
{
"epoch": 0.9462229932187353,
"eval_loss": 0.5476773977279663,
"eval_runtime": 3.7543,
"eval_samples_per_second": 26.902,
"eval_steps_per_second": 3.463,
"step": 6000
},
{
"epoch": 0.9509541081848288,
"grad_norm": 0.6436010003089905,
"learning_rate": 8.126562252808101e-05,
"loss": 0.536,
"step": 6030
},
{
"epoch": 0.9556852231509225,
"grad_norm": 0.5532657504081726,
"learning_rate": 8.117070083847494e-05,
"loss": 0.5261,
"step": 6060
},
{
"epoch": 0.9604163381170162,
"grad_norm": 0.6539950370788574,
"learning_rate": 8.107577914886886e-05,
"loss": 0.5226,
"step": 6090
},
{
"epoch": 0.9651474530831099,
"grad_norm": 0.5767289996147156,
"learning_rate": 8.098085745926279e-05,
"loss": 0.534,
"step": 6120
},
{
"epoch": 0.9698785680492036,
"grad_norm": 0.6355389356613159,
"learning_rate": 8.08859357696567e-05,
"loss": 0.5282,
"step": 6150
},
{
"epoch": 0.9746096830152973,
"grad_norm": 0.6711322665214539,
"learning_rate": 8.079101408005064e-05,
"loss": 0.5384,
"step": 6180
},
{
"epoch": 0.9777637596593597,
"eval_loss": 0.5372142195701599,
"eval_runtime": 3.7547,
"eval_samples_per_second": 26.899,
"eval_steps_per_second": 3.462,
"step": 6200
},
{
"epoch": 0.979340797981391,
"grad_norm": 0.5990795493125916,
"learning_rate": 8.069609239044455e-05,
"loss": 0.4624,
"step": 6210
},
{
"epoch": 0.9840719129474846,
"grad_norm": 0.6971167325973511,
"learning_rate": 8.060117070083848e-05,
"loss": 0.5015,
"step": 6240
},
{
"epoch": 0.9888030279135783,
"grad_norm": 0.6699081659317017,
"learning_rate": 8.05062490112324e-05,
"loss": 0.5325,
"step": 6270
},
{
"epoch": 0.993534142879672,
"grad_norm": 0.6347541213035583,
"learning_rate": 8.041132732162633e-05,
"loss": 0.5255,
"step": 6300
},
{
"epoch": 0.9982652578457657,
"grad_norm": 0.7587487101554871,
"learning_rate": 8.031640563202025e-05,
"loss": 0.5154,
"step": 6330
}
],
"logging_steps": 30,
"max_steps": 31705,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.9843838888449147e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}