byt5-small-ain-jpn-mt / trainer_state.json
koki's picture
moved files
e2ebefe
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"global_step": 262976,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 0.00019998960218803238,
"loss": 1.7263,
"step": 500
},
{
"epoch": 0.03,
"learning_rate": 0.00019997920437606474,
"loss": 1.3861,
"step": 1000
},
{
"epoch": 0.04,
"learning_rate": 0.0001999688065640971,
"loss": 1.2883,
"step": 1500
},
{
"epoch": 0.05,
"learning_rate": 0.00019995840875212947,
"loss": 1.2357,
"step": 2000
},
{
"epoch": 0.07,
"learning_rate": 0.00019994801094016184,
"loss": 1.2031,
"step": 2500
},
{
"epoch": 0.08,
"learning_rate": 0.0001999376131281942,
"loss": 1.1688,
"step": 3000
},
{
"epoch": 0.09,
"learning_rate": 0.00019992721531622657,
"loss": 1.1213,
"step": 3500
},
{
"epoch": 0.11,
"learning_rate": 0.00019991681750425894,
"loss": 1.1088,
"step": 4000
},
{
"epoch": 0.12,
"learning_rate": 0.0001999064196922913,
"loss": 1.1039,
"step": 4500
},
{
"epoch": 0.13,
"learning_rate": 0.00019989602188032367,
"loss": 1.0876,
"step": 5000
},
{
"epoch": 0.15,
"learning_rate": 0.00019988562406835603,
"loss": 1.0792,
"step": 5500
},
{
"epoch": 0.16,
"learning_rate": 0.00019987522625638843,
"loss": 1.0654,
"step": 6000
},
{
"epoch": 0.17,
"learning_rate": 0.0001998648284444208,
"loss": 1.0561,
"step": 6500
},
{
"epoch": 0.19,
"learning_rate": 0.00019985443063245316,
"loss": 1.0406,
"step": 7000
},
{
"epoch": 0.2,
"learning_rate": 0.00019984403282048553,
"loss": 1.0274,
"step": 7500
},
{
"epoch": 0.21,
"learning_rate": 0.00019983363500851792,
"loss": 1.0271,
"step": 8000
},
{
"epoch": 0.23,
"learning_rate": 0.00019982323719655028,
"loss": 1.0028,
"step": 8500
},
{
"epoch": 0.24,
"learning_rate": 0.00019981283938458265,
"loss": 0.9957,
"step": 9000
},
{
"epoch": 0.25,
"learning_rate": 0.00019980244157261502,
"loss": 0.9892,
"step": 9500
},
{
"epoch": 0.27,
"learning_rate": 0.00019979204376064738,
"loss": 0.9524,
"step": 10000
},
{
"epoch": 0.28,
"learning_rate": 0.00019978164594867975,
"loss": 0.9618,
"step": 10500
},
{
"epoch": 0.29,
"learning_rate": 0.00019977124813671211,
"loss": 0.9499,
"step": 11000
},
{
"epoch": 0.31,
"learning_rate": 0.00019976085032474448,
"loss": 0.929,
"step": 11500
},
{
"epoch": 0.32,
"learning_rate": 0.00019975045251277685,
"loss": 0.9172,
"step": 12000
},
{
"epoch": 0.33,
"learning_rate": 0.0001997400547008092,
"loss": 0.917,
"step": 12500
},
{
"epoch": 0.35,
"learning_rate": 0.00019972965688884158,
"loss": 0.8794,
"step": 13000
},
{
"epoch": 0.36,
"learning_rate": 0.00019971925907687394,
"loss": 0.8818,
"step": 13500
},
{
"epoch": 0.37,
"learning_rate": 0.0001997088612649063,
"loss": 0.8747,
"step": 14000
},
{
"epoch": 0.39,
"learning_rate": 0.00019969846345293868,
"loss": 0.8592,
"step": 14500
},
{
"epoch": 0.4,
"learning_rate": 0.00019968806564097104,
"loss": 0.8561,
"step": 15000
},
{
"epoch": 0.41,
"learning_rate": 0.0001996776678290034,
"loss": 0.8538,
"step": 15500
},
{
"epoch": 0.43,
"learning_rate": 0.00019966727001703577,
"loss": 0.8396,
"step": 16000
},
{
"epoch": 0.44,
"learning_rate": 0.00019965687220506814,
"loss": 0.8225,
"step": 16500
},
{
"epoch": 0.45,
"learning_rate": 0.0001996464743931005,
"loss": 0.8246,
"step": 17000
},
{
"epoch": 0.47,
"learning_rate": 0.00019963607658113287,
"loss": 0.8278,
"step": 17500
},
{
"epoch": 0.48,
"learning_rate": 0.00019962567876916524,
"loss": 0.8071,
"step": 18000
},
{
"epoch": 0.49,
"learning_rate": 0.0001996152809571976,
"loss": 0.7961,
"step": 18500
},
{
"epoch": 0.51,
"learning_rate": 0.00019960488314522997,
"loss": 0.7681,
"step": 19000
},
{
"epoch": 0.52,
"learning_rate": 0.00019959448533326236,
"loss": 0.7806,
"step": 19500
},
{
"epoch": 0.53,
"learning_rate": 0.00019958408752129473,
"loss": 0.754,
"step": 20000
},
{
"epoch": 0.55,
"learning_rate": 0.0001995736897093271,
"loss": 0.7402,
"step": 20500
},
{
"epoch": 0.56,
"learning_rate": 0.00019956329189735946,
"loss": 0.7455,
"step": 21000
},
{
"epoch": 0.57,
"learning_rate": 0.00019955289408539183,
"loss": 0.7404,
"step": 21500
},
{
"epoch": 0.59,
"learning_rate": 0.00019954249627342422,
"loss": 0.7372,
"step": 22000
},
{
"epoch": 0.6,
"learning_rate": 0.00019953209846145658,
"loss": 0.7389,
"step": 22500
},
{
"epoch": 0.61,
"learning_rate": 0.00019952170064948895,
"loss": 0.7227,
"step": 23000
},
{
"epoch": 0.63,
"learning_rate": 0.00019951130283752132,
"loss": 0.7496,
"step": 23500
},
{
"epoch": 0.64,
"learning_rate": 0.00019950090502555368,
"loss": 0.7116,
"step": 24000
},
{
"epoch": 0.65,
"learning_rate": 0.00019949050721358605,
"loss": 0.7025,
"step": 24500
},
{
"epoch": 0.67,
"learning_rate": 0.00019948010940161841,
"loss": 0.6993,
"step": 25000
},
{
"epoch": 0.68,
"learning_rate": 0.00019946971158965078,
"loss": 0.6999,
"step": 25500
},
{
"epoch": 0.69,
"learning_rate": 0.00019945931377768315,
"loss": 0.6792,
"step": 26000
},
{
"epoch": 0.71,
"learning_rate": 0.0001994489159657155,
"loss": 0.6936,
"step": 26500
},
{
"epoch": 0.72,
"learning_rate": 0.00019943851815374788,
"loss": 0.6906,
"step": 27000
},
{
"epoch": 0.73,
"learning_rate": 0.00019942812034178024,
"loss": 0.6717,
"step": 27500
},
{
"epoch": 0.75,
"learning_rate": 0.0001994177225298126,
"loss": 0.6746,
"step": 28000
},
{
"epoch": 0.76,
"learning_rate": 0.00019940732471784498,
"loss": 0.674,
"step": 28500
},
{
"epoch": 0.77,
"learning_rate": 0.00019939692690587734,
"loss": 0.6781,
"step": 29000
},
{
"epoch": 0.79,
"learning_rate": 0.0001993865290939097,
"loss": 0.6897,
"step": 29500
},
{
"epoch": 0.8,
"learning_rate": 0.00019937613128194207,
"loss": 0.6656,
"step": 30000
},
{
"epoch": 0.81,
"learning_rate": 0.00019936573346997444,
"loss": 0.6772,
"step": 30500
},
{
"epoch": 0.83,
"learning_rate": 0.0001993553356580068,
"loss": 0.6434,
"step": 31000
},
{
"epoch": 0.84,
"learning_rate": 0.00019934493784603917,
"loss": 0.6644,
"step": 31500
},
{
"epoch": 0.85,
"learning_rate": 0.00019933454003407156,
"loss": 0.6459,
"step": 32000
},
{
"epoch": 0.87,
"learning_rate": 0.00019932414222210393,
"loss": 0.6428,
"step": 32500
},
{
"epoch": 0.88,
"learning_rate": 0.0001993137444101363,
"loss": 0.6313,
"step": 33000
},
{
"epoch": 0.89,
"learning_rate": 0.00019930334659816866,
"loss": 0.641,
"step": 33500
},
{
"epoch": 0.91,
"learning_rate": 0.00019929294878620103,
"loss": 0.6335,
"step": 34000
},
{
"epoch": 0.92,
"learning_rate": 0.0001992825509742334,
"loss": 0.6468,
"step": 34500
},
{
"epoch": 0.93,
"learning_rate": 0.00019927215316226576,
"loss": 0.6108,
"step": 35000
},
{
"epoch": 0.94,
"learning_rate": 0.00019926175535029815,
"loss": 0.64,
"step": 35500
},
{
"epoch": 0.96,
"learning_rate": 0.00019925135753833052,
"loss": 0.6094,
"step": 36000
},
{
"epoch": 0.97,
"learning_rate": 0.00019924095972636289,
"loss": 0.6081,
"step": 36500
},
{
"epoch": 0.98,
"learning_rate": 0.00019923056191439525,
"loss": 0.609,
"step": 37000
},
{
"epoch": 1.0,
"learning_rate": 0.00019922016410242762,
"loss": 0.6386,
"step": 37500
},
{
"epoch": 1.0,
"eval_bleu": 0.0649,
"eval_gen_len": 18.2942,
"eval_loss": 0.5461284518241882,
"eval_runtime": 1943.184,
"eval_samples_per_second": 4.834,
"eval_steps_per_second": 2.417,
"step": 37568
},
{
"epoch": 1.01,
"learning_rate": 0.00019920976629045998,
"loss": 0.5822,
"step": 38000
},
{
"epoch": 1.02,
"learning_rate": 0.00019919936847849235,
"loss": 0.5875,
"step": 38500
},
{
"epoch": 1.04,
"learning_rate": 0.00019918897066652471,
"loss": 0.5772,
"step": 39000
},
{
"epoch": 1.05,
"learning_rate": 0.00019917857285455708,
"loss": 0.5732,
"step": 39500
},
{
"epoch": 1.06,
"learning_rate": 0.00019916817504258945,
"loss": 0.5781,
"step": 40000
},
{
"epoch": 1.08,
"learning_rate": 0.0001991577772306218,
"loss": 0.5841,
"step": 40500
},
{
"epoch": 1.09,
"learning_rate": 0.00019914737941865418,
"loss": 0.5801,
"step": 41000
},
{
"epoch": 1.1,
"learning_rate": 0.00019913698160668654,
"loss": 0.57,
"step": 41500
},
{
"epoch": 1.12,
"learning_rate": 0.0001991265837947189,
"loss": 0.5653,
"step": 42000
},
{
"epoch": 1.13,
"learning_rate": 0.00019911618598275128,
"loss": 0.5885,
"step": 42500
},
{
"epoch": 1.14,
"learning_rate": 0.00019910578817078364,
"loss": 0.5605,
"step": 43000
},
{
"epoch": 1.16,
"learning_rate": 0.000199095390358816,
"loss": 0.5882,
"step": 43500
},
{
"epoch": 1.17,
"learning_rate": 0.00019908499254684837,
"loss": 0.5702,
"step": 44000
},
{
"epoch": 1.18,
"learning_rate": 0.00019907459473488074,
"loss": 0.5631,
"step": 44500
},
{
"epoch": 1.2,
"learning_rate": 0.00019906419692291313,
"loss": 0.5557,
"step": 45000
},
{
"epoch": 1.21,
"learning_rate": 0.0001990537991109455,
"loss": 0.5603,
"step": 45500
},
{
"epoch": 1.22,
"learning_rate": 0.00019904340129897787,
"loss": 0.5578,
"step": 46000
},
{
"epoch": 1.24,
"learning_rate": 0.00019903300348701023,
"loss": 0.5589,
"step": 46500
},
{
"epoch": 1.25,
"learning_rate": 0.0001990226056750426,
"loss": 0.5489,
"step": 47000
},
{
"epoch": 1.26,
"learning_rate": 0.00019901220786307496,
"loss": 0.5662,
"step": 47500
},
{
"epoch": 1.28,
"learning_rate": 0.00019900181005110733,
"loss": 0.5345,
"step": 48000
},
{
"epoch": 1.29,
"learning_rate": 0.0001989914122391397,
"loss": 0.5629,
"step": 48500
},
{
"epoch": 1.3,
"learning_rate": 0.0001989810144271721,
"loss": 0.5505,
"step": 49000
},
{
"epoch": 1.32,
"learning_rate": 0.00019897061661520445,
"loss": 0.5443,
"step": 49500
},
{
"epoch": 1.33,
"learning_rate": 0.00019896021880323682,
"loss": 0.5426,
"step": 50000
},
{
"epoch": 1.34,
"learning_rate": 0.00019894982099126919,
"loss": 0.5445,
"step": 50500
},
{
"epoch": 1.36,
"learning_rate": 0.00019893942317930155,
"loss": 0.5402,
"step": 51000
},
{
"epoch": 1.37,
"learning_rate": 0.00019892902536733392,
"loss": 0.5744,
"step": 51500
},
{
"epoch": 1.38,
"learning_rate": 0.00019891862755536628,
"loss": 0.5523,
"step": 52000
},
{
"epoch": 1.4,
"learning_rate": 0.00019890822974339865,
"loss": 0.5395,
"step": 52500
},
{
"epoch": 1.41,
"learning_rate": 0.00019889783193143102,
"loss": 0.5469,
"step": 53000
},
{
"epoch": 1.42,
"learning_rate": 0.00019888743411946338,
"loss": 0.5314,
"step": 53500
},
{
"epoch": 1.44,
"learning_rate": 0.00019887703630749575,
"loss": 0.5437,
"step": 54000
},
{
"epoch": 1.45,
"learning_rate": 0.0001988666384955281,
"loss": 0.5466,
"step": 54500
},
{
"epoch": 1.46,
"learning_rate": 0.00019885624068356048,
"loss": 0.5351,
"step": 55000
},
{
"epoch": 1.48,
"learning_rate": 0.00019884584287159284,
"loss": 0.5495,
"step": 55500
},
{
"epoch": 1.49,
"learning_rate": 0.0001988354450596252,
"loss": 0.5189,
"step": 56000
},
{
"epoch": 1.5,
"learning_rate": 0.00019882504724765758,
"loss": 0.5568,
"step": 56500
},
{
"epoch": 1.52,
"learning_rate": 0.00019881464943568994,
"loss": 0.523,
"step": 57000
},
{
"epoch": 1.53,
"learning_rate": 0.00019880425162372234,
"loss": 0.5247,
"step": 57500
},
{
"epoch": 1.54,
"learning_rate": 0.0001987938538117547,
"loss": 0.5274,
"step": 58000
},
{
"epoch": 1.56,
"learning_rate": 0.00019878345599978707,
"loss": 0.538,
"step": 58500
},
{
"epoch": 1.57,
"learning_rate": 0.00019877305818781943,
"loss": 0.5319,
"step": 59000
},
{
"epoch": 1.58,
"learning_rate": 0.0001987626603758518,
"loss": 0.5145,
"step": 59500
},
{
"epoch": 1.6,
"learning_rate": 0.00019875226256388417,
"loss": 0.534,
"step": 60000
},
{
"epoch": 1.61,
"learning_rate": 0.00019874186475191653,
"loss": 0.5331,
"step": 60500
},
{
"epoch": 1.62,
"learning_rate": 0.0001987314669399489,
"loss": 0.5081,
"step": 61000
},
{
"epoch": 1.64,
"learning_rate": 0.00019872106912798126,
"loss": 0.5151,
"step": 61500
},
{
"epoch": 1.65,
"learning_rate": 0.00019871067131601363,
"loss": 0.5192,
"step": 62000
},
{
"epoch": 1.66,
"learning_rate": 0.000198700273504046,
"loss": 0.4988,
"step": 62500
},
{
"epoch": 1.68,
"learning_rate": 0.0001986898756920784,
"loss": 0.5186,
"step": 63000
},
{
"epoch": 1.69,
"learning_rate": 0.00019867947788011075,
"loss": 0.5215,
"step": 63500
},
{
"epoch": 1.7,
"learning_rate": 0.00019866908006814312,
"loss": 0.5082,
"step": 64000
},
{
"epoch": 1.72,
"learning_rate": 0.00019865868225617549,
"loss": 0.5186,
"step": 64500
},
{
"epoch": 1.73,
"learning_rate": 0.00019864828444420785,
"loss": 0.5192,
"step": 65000
},
{
"epoch": 1.74,
"learning_rate": 0.00019863788663224022,
"loss": 0.5033,
"step": 65500
},
{
"epoch": 1.76,
"learning_rate": 0.00019862748882027258,
"loss": 0.5196,
"step": 66000
},
{
"epoch": 1.77,
"learning_rate": 0.00019861709100830495,
"loss": 0.5259,
"step": 66500
},
{
"epoch": 1.78,
"learning_rate": 0.00019860669319633732,
"loss": 0.5013,
"step": 67000
},
{
"epoch": 1.8,
"learning_rate": 0.00019859629538436968,
"loss": 0.497,
"step": 67500
},
{
"epoch": 1.81,
"learning_rate": 0.00019858589757240205,
"loss": 0.5142,
"step": 68000
},
{
"epoch": 1.82,
"learning_rate": 0.0001985754997604344,
"loss": 0.4933,
"step": 68500
},
{
"epoch": 1.84,
"learning_rate": 0.00019856510194846678,
"loss": 0.5157,
"step": 69000
},
{
"epoch": 1.85,
"learning_rate": 0.00019855470413649915,
"loss": 0.5244,
"step": 69500
},
{
"epoch": 1.86,
"learning_rate": 0.0001985443063245315,
"loss": 0.4957,
"step": 70000
},
{
"epoch": 1.88,
"learning_rate": 0.0001985339085125639,
"loss": 0.4948,
"step": 70500
},
{
"epoch": 1.89,
"learning_rate": 0.00019852351070059627,
"loss": 0.497,
"step": 71000
},
{
"epoch": 1.9,
"learning_rate": 0.00019851311288862864,
"loss": 0.4975,
"step": 71500
},
{
"epoch": 1.92,
"learning_rate": 0.000198502715076661,
"loss": 0.4931,
"step": 72000
},
{
"epoch": 1.93,
"learning_rate": 0.00019849231726469337,
"loss": 0.4886,
"step": 72500
},
{
"epoch": 1.94,
"learning_rate": 0.00019848191945272573,
"loss": 0.4972,
"step": 73000
},
{
"epoch": 1.96,
"learning_rate": 0.0001984715216407581,
"loss": 0.4861,
"step": 73500
},
{
"epoch": 1.97,
"learning_rate": 0.00019846112382879047,
"loss": 0.5027,
"step": 74000
},
{
"epoch": 1.98,
"learning_rate": 0.00019845072601682283,
"loss": 0.4816,
"step": 74500
},
{
"epoch": 2.0,
"learning_rate": 0.0001984403282048552,
"loss": 0.489,
"step": 75000
},
{
"epoch": 2.0,
"eval_bleu": 0.0787,
"eval_gen_len": 18.4245,
"eval_loss": 0.4464058578014374,
"eval_runtime": 1938.9705,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 2.422,
"step": 75136
},
{
"epoch": 2.01,
"learning_rate": 0.00019842993039288756,
"loss": 0.4753,
"step": 75500
},
{
"epoch": 2.02,
"learning_rate": 0.00019841953258091993,
"loss": 0.4532,
"step": 76000
},
{
"epoch": 2.04,
"learning_rate": 0.00019840913476895232,
"loss": 0.4509,
"step": 76500
},
{
"epoch": 2.05,
"learning_rate": 0.0001983987369569847,
"loss": 0.4569,
"step": 77000
},
{
"epoch": 2.06,
"learning_rate": 0.00019838833914501705,
"loss": 0.4517,
"step": 77500
},
{
"epoch": 2.08,
"learning_rate": 0.00019837794133304942,
"loss": 0.432,
"step": 78000
},
{
"epoch": 2.09,
"learning_rate": 0.00019836754352108179,
"loss": 0.4639,
"step": 78500
},
{
"epoch": 2.1,
"learning_rate": 0.00019835714570911415,
"loss": 0.4425,
"step": 79000
},
{
"epoch": 2.12,
"learning_rate": 0.00019834674789714652,
"loss": 0.4403,
"step": 79500
},
{
"epoch": 2.13,
"learning_rate": 0.00019833635008517888,
"loss": 0.4653,
"step": 80000
},
{
"epoch": 2.14,
"learning_rate": 0.00019832595227321125,
"loss": 0.4584,
"step": 80500
},
{
"epoch": 2.16,
"learning_rate": 0.00019831555446124362,
"loss": 0.4514,
"step": 81000
},
{
"epoch": 2.17,
"learning_rate": 0.00019830515664927598,
"loss": 0.4542,
"step": 81500
},
{
"epoch": 2.18,
"learning_rate": 0.00019829475883730835,
"loss": 0.4473,
"step": 82000
},
{
"epoch": 2.2,
"learning_rate": 0.00019828436102534071,
"loss": 0.4642,
"step": 82500
},
{
"epoch": 2.21,
"learning_rate": 0.0001982739632133731,
"loss": 0.4382,
"step": 83000
},
{
"epoch": 2.22,
"learning_rate": 0.00019826356540140547,
"loss": 0.4541,
"step": 83500
},
{
"epoch": 2.24,
"learning_rate": 0.00019825316758943784,
"loss": 0.4621,
"step": 84000
},
{
"epoch": 2.25,
"learning_rate": 0.0001982427697774702,
"loss": 0.4585,
"step": 84500
},
{
"epoch": 2.26,
"learning_rate": 0.00019823237196550257,
"loss": 0.4595,
"step": 85000
},
{
"epoch": 2.28,
"learning_rate": 0.00019822197415353494,
"loss": 0.4554,
"step": 85500
},
{
"epoch": 2.29,
"learning_rate": 0.0001982115763415673,
"loss": 0.4545,
"step": 86000
},
{
"epoch": 2.3,
"learning_rate": 0.00019820117852959967,
"loss": 0.448,
"step": 86500
},
{
"epoch": 2.32,
"learning_rate": 0.00019819078071763203,
"loss": 0.4582,
"step": 87000
},
{
"epoch": 2.33,
"learning_rate": 0.0001981803829056644,
"loss": 0.4362,
"step": 87500
},
{
"epoch": 2.34,
"learning_rate": 0.00019816998509369677,
"loss": 0.4386,
"step": 88000
},
{
"epoch": 2.36,
"learning_rate": 0.00019815958728172913,
"loss": 0.4437,
"step": 88500
},
{
"epoch": 2.37,
"learning_rate": 0.0001981491894697615,
"loss": 0.4485,
"step": 89000
},
{
"epoch": 2.38,
"learning_rate": 0.00019813879165779386,
"loss": 0.4489,
"step": 89500
},
{
"epoch": 2.4,
"learning_rate": 0.00019812839384582626,
"loss": 0.4528,
"step": 90000
},
{
"epoch": 2.41,
"learning_rate": 0.00019811799603385862,
"loss": 0.4558,
"step": 90500
},
{
"epoch": 2.42,
"learning_rate": 0.000198107598221891,
"loss": 0.4408,
"step": 91000
},
{
"epoch": 2.44,
"learning_rate": 0.00019809720040992335,
"loss": 0.4403,
"step": 91500
},
{
"epoch": 2.45,
"learning_rate": 0.00019808680259795572,
"loss": 0.4336,
"step": 92000
},
{
"epoch": 2.46,
"learning_rate": 0.00019807640478598809,
"loss": 0.4327,
"step": 92500
},
{
"epoch": 2.48,
"learning_rate": 0.00019806600697402045,
"loss": 0.4551,
"step": 93000
},
{
"epoch": 2.49,
"learning_rate": 0.00019805560916205282,
"loss": 0.45,
"step": 93500
},
{
"epoch": 2.5,
"learning_rate": 0.00019804521135008518,
"loss": 0.4313,
"step": 94000
},
{
"epoch": 2.52,
"learning_rate": 0.00019803481353811755,
"loss": 0.4422,
"step": 94500
},
{
"epoch": 2.53,
"learning_rate": 0.00019802441572614992,
"loss": 0.4528,
"step": 95000
},
{
"epoch": 2.54,
"learning_rate": 0.00019801401791418228,
"loss": 0.4507,
"step": 95500
},
{
"epoch": 2.56,
"learning_rate": 0.00019800362010221468,
"loss": 0.4476,
"step": 96000
},
{
"epoch": 2.57,
"learning_rate": 0.00019799322229024704,
"loss": 0.4405,
"step": 96500
},
{
"epoch": 2.58,
"learning_rate": 0.0001979828244782794,
"loss": 0.425,
"step": 97000
},
{
"epoch": 2.6,
"learning_rate": 0.00019797242666631177,
"loss": 0.4496,
"step": 97500
},
{
"epoch": 2.61,
"learning_rate": 0.00019796202885434414,
"loss": 0.4452,
"step": 98000
},
{
"epoch": 2.62,
"learning_rate": 0.0001979516310423765,
"loss": 0.4269,
"step": 98500
},
{
"epoch": 2.64,
"learning_rate": 0.00019794123323040887,
"loss": 0.4406,
"step": 99000
},
{
"epoch": 2.65,
"learning_rate": 0.00019793083541844124,
"loss": 0.4477,
"step": 99500
},
{
"epoch": 2.66,
"learning_rate": 0.0001979204376064736,
"loss": 0.4266,
"step": 100000
},
{
"epoch": 2.68,
"learning_rate": 0.00019791003979450597,
"loss": 0.4395,
"step": 100500
},
{
"epoch": 2.69,
"learning_rate": 0.00019789964198253833,
"loss": 0.4569,
"step": 101000
},
{
"epoch": 2.7,
"learning_rate": 0.0001978892441705707,
"loss": 0.4366,
"step": 101500
},
{
"epoch": 2.72,
"learning_rate": 0.00019787884635860307,
"loss": 0.4321,
"step": 102000
},
{
"epoch": 2.73,
"learning_rate": 0.00019786844854663543,
"loss": 0.439,
"step": 102500
},
{
"epoch": 2.74,
"learning_rate": 0.0001978580507346678,
"loss": 0.4426,
"step": 103000
},
{
"epoch": 2.76,
"learning_rate": 0.00019784765292270016,
"loss": 0.4348,
"step": 103500
},
{
"epoch": 2.77,
"learning_rate": 0.00019783725511073256,
"loss": 0.4365,
"step": 104000
},
{
"epoch": 2.78,
"learning_rate": 0.00019782685729876492,
"loss": 0.4439,
"step": 104500
},
{
"epoch": 2.79,
"learning_rate": 0.0001978164594867973,
"loss": 0.4375,
"step": 105000
},
{
"epoch": 2.81,
"learning_rate": 0.00019780606167482965,
"loss": 0.4506,
"step": 105500
},
{
"epoch": 2.82,
"learning_rate": 0.00019779566386286202,
"loss": 0.44,
"step": 106000
},
{
"epoch": 2.83,
"learning_rate": 0.0001977852660508944,
"loss": 0.4203,
"step": 106500
},
{
"epoch": 2.85,
"learning_rate": 0.00019777486823892675,
"loss": 0.4346,
"step": 107000
},
{
"epoch": 2.86,
"learning_rate": 0.00019776447042695912,
"loss": 0.4432,
"step": 107500
},
{
"epoch": 2.87,
"learning_rate": 0.00019775407261499148,
"loss": 0.4415,
"step": 108000
},
{
"epoch": 2.89,
"learning_rate": 0.00019774367480302388,
"loss": 0.4418,
"step": 108500
},
{
"epoch": 2.9,
"learning_rate": 0.00019773327699105624,
"loss": 0.441,
"step": 109000
},
{
"epoch": 2.91,
"learning_rate": 0.0001977228791790886,
"loss": 0.4352,
"step": 109500
},
{
"epoch": 2.93,
"learning_rate": 0.00019771248136712098,
"loss": 0.4348,
"step": 110000
},
{
"epoch": 2.94,
"learning_rate": 0.00019770208355515334,
"loss": 0.4143,
"step": 110500
},
{
"epoch": 2.95,
"learning_rate": 0.0001976916857431857,
"loss": 0.4189,
"step": 111000
},
{
"epoch": 2.97,
"learning_rate": 0.00019768128793121807,
"loss": 0.4232,
"step": 111500
},
{
"epoch": 2.98,
"learning_rate": 0.00019767089011925044,
"loss": 0.4272,
"step": 112000
},
{
"epoch": 2.99,
"learning_rate": 0.0001976604923072828,
"loss": 0.4389,
"step": 112500
},
{
"epoch": 3.0,
"eval_bleu": 0.0877,
"eval_gen_len": 18.3533,
"eval_loss": 0.4100983440876007,
"eval_runtime": 1952.3856,
"eval_samples_per_second": 4.811,
"eval_steps_per_second": 2.406,
"step": 112704
},
{
"epoch": 3.01,
"learning_rate": 0.00019765009449531517,
"loss": 0.4031,
"step": 113000
},
{
"epoch": 3.02,
"learning_rate": 0.00019763969668334754,
"loss": 0.3936,
"step": 113500
},
{
"epoch": 3.03,
"learning_rate": 0.0001976292988713799,
"loss": 0.3902,
"step": 114000
},
{
"epoch": 3.05,
"learning_rate": 0.00019761890105941227,
"loss": 0.3927,
"step": 114500
},
{
"epoch": 3.06,
"learning_rate": 0.00019760850324744463,
"loss": 0.3834,
"step": 115000
},
{
"epoch": 3.07,
"learning_rate": 0.000197598105435477,
"loss": 0.3901,
"step": 115500
},
{
"epoch": 3.09,
"learning_rate": 0.00019758770762350937,
"loss": 0.3836,
"step": 116000
},
{
"epoch": 3.1,
"learning_rate": 0.00019757730981154173,
"loss": 0.395,
"step": 116500
},
{
"epoch": 3.11,
"learning_rate": 0.0001975669119995741,
"loss": 0.3999,
"step": 117000
},
{
"epoch": 3.13,
"learning_rate": 0.0001975565141876065,
"loss": 0.3829,
"step": 117500
},
{
"epoch": 3.14,
"learning_rate": 0.00019754611637563886,
"loss": 0.3975,
"step": 118000
},
{
"epoch": 3.15,
"learning_rate": 0.00019753571856367122,
"loss": 0.3784,
"step": 118500
},
{
"epoch": 3.17,
"learning_rate": 0.0001975253207517036,
"loss": 0.4048,
"step": 119000
},
{
"epoch": 3.18,
"learning_rate": 0.00019751492293973596,
"loss": 0.3909,
"step": 119500
},
{
"epoch": 3.19,
"learning_rate": 0.00019750452512776832,
"loss": 0.4005,
"step": 120000
},
{
"epoch": 3.21,
"learning_rate": 0.0001974941273158007,
"loss": 0.3976,
"step": 120500
},
{
"epoch": 3.22,
"learning_rate": 0.00019748372950383305,
"loss": 0.3971,
"step": 121000
},
{
"epoch": 3.23,
"learning_rate": 0.00019747333169186545,
"loss": 0.3968,
"step": 121500
},
{
"epoch": 3.25,
"learning_rate": 0.0001974629338798978,
"loss": 0.3856,
"step": 122000
},
{
"epoch": 3.26,
"learning_rate": 0.00019745253606793018,
"loss": 0.3833,
"step": 122500
},
{
"epoch": 3.27,
"learning_rate": 0.00019744213825596254,
"loss": 0.4037,
"step": 123000
},
{
"epoch": 3.29,
"learning_rate": 0.0001974317404439949,
"loss": 0.3821,
"step": 123500
},
{
"epoch": 3.3,
"learning_rate": 0.00019742134263202728,
"loss": 0.3986,
"step": 124000
},
{
"epoch": 3.31,
"learning_rate": 0.00019741094482005964,
"loss": 0.3838,
"step": 124500
},
{
"epoch": 3.33,
"learning_rate": 0.000197400547008092,
"loss": 0.3827,
"step": 125000
},
{
"epoch": 3.34,
"learning_rate": 0.00019739014919612437,
"loss": 0.3914,
"step": 125500
},
{
"epoch": 3.35,
"learning_rate": 0.00019737975138415674,
"loss": 0.3923,
"step": 126000
},
{
"epoch": 3.37,
"learning_rate": 0.0001973693535721891,
"loss": 0.3982,
"step": 126500
},
{
"epoch": 3.38,
"learning_rate": 0.00019735895576022147,
"loss": 0.3915,
"step": 127000
},
{
"epoch": 3.39,
"learning_rate": 0.00019734855794825384,
"loss": 0.4022,
"step": 127500
},
{
"epoch": 3.41,
"learning_rate": 0.0001973381601362862,
"loss": 0.3888,
"step": 128000
},
{
"epoch": 3.42,
"learning_rate": 0.00019732776232431857,
"loss": 0.3855,
"step": 128500
},
{
"epoch": 3.43,
"learning_rate": 0.00019731736451235093,
"loss": 0.3947,
"step": 129000
},
{
"epoch": 3.45,
"learning_rate": 0.0001973069667003833,
"loss": 0.3877,
"step": 129500
},
{
"epoch": 3.46,
"learning_rate": 0.00019729656888841567,
"loss": 0.3921,
"step": 130000
},
{
"epoch": 3.47,
"learning_rate": 0.00019728617107644803,
"loss": 0.3994,
"step": 130500
},
{
"epoch": 3.49,
"learning_rate": 0.00019727577326448043,
"loss": 0.3949,
"step": 131000
},
{
"epoch": 3.5,
"learning_rate": 0.0001972653754525128,
"loss": 0.3901,
"step": 131500
},
{
"epoch": 3.51,
"learning_rate": 0.00019725497764054516,
"loss": 0.3954,
"step": 132000
},
{
"epoch": 3.53,
"learning_rate": 0.00019724457982857752,
"loss": 0.3962,
"step": 132500
},
{
"epoch": 3.54,
"learning_rate": 0.0001972341820166099,
"loss": 0.405,
"step": 133000
},
{
"epoch": 3.55,
"learning_rate": 0.00019722378420464226,
"loss": 0.3985,
"step": 133500
},
{
"epoch": 3.57,
"learning_rate": 0.00019721338639267465,
"loss": 0.3763,
"step": 134000
},
{
"epoch": 3.58,
"learning_rate": 0.00019720298858070701,
"loss": 0.4068,
"step": 134500
},
{
"epoch": 3.59,
"learning_rate": 0.00019719259076873938,
"loss": 0.3885,
"step": 135000
},
{
"epoch": 3.61,
"learning_rate": 0.00019718219295677175,
"loss": 0.3833,
"step": 135500
},
{
"epoch": 3.62,
"learning_rate": 0.0001971717951448041,
"loss": 0.3864,
"step": 136000
},
{
"epoch": 3.63,
"learning_rate": 0.00019716139733283648,
"loss": 0.3765,
"step": 136500
},
{
"epoch": 3.65,
"learning_rate": 0.00019715099952086884,
"loss": 0.3988,
"step": 137000
},
{
"epoch": 3.66,
"learning_rate": 0.0001971406017089012,
"loss": 0.3867,
"step": 137500
},
{
"epoch": 3.67,
"learning_rate": 0.00019713020389693358,
"loss": 0.3901,
"step": 138000
},
{
"epoch": 3.69,
"learning_rate": 0.00019711980608496594,
"loss": 0.3822,
"step": 138500
},
{
"epoch": 3.7,
"learning_rate": 0.0001971094082729983,
"loss": 0.3864,
"step": 139000
},
{
"epoch": 3.71,
"learning_rate": 0.00019709901046103067,
"loss": 0.3886,
"step": 139500
},
{
"epoch": 3.73,
"learning_rate": 0.00019708861264906304,
"loss": 0.3715,
"step": 140000
},
{
"epoch": 3.74,
"learning_rate": 0.0001970782148370954,
"loss": 0.388,
"step": 140500
},
{
"epoch": 3.75,
"learning_rate": 0.00019706781702512777,
"loss": 0.3941,
"step": 141000
},
{
"epoch": 3.77,
"learning_rate": 0.00019705741921316014,
"loss": 0.3922,
"step": 141500
},
{
"epoch": 3.78,
"learning_rate": 0.0001970470214011925,
"loss": 0.3918,
"step": 142000
},
{
"epoch": 3.79,
"learning_rate": 0.00019703662358922487,
"loss": 0.3898,
"step": 142500
},
{
"epoch": 3.81,
"learning_rate": 0.00019702622577725724,
"loss": 0.384,
"step": 143000
},
{
"epoch": 3.82,
"learning_rate": 0.0001970158279652896,
"loss": 0.3875,
"step": 143500
},
{
"epoch": 3.83,
"learning_rate": 0.00019700543015332197,
"loss": 0.3893,
"step": 144000
},
{
"epoch": 3.85,
"learning_rate": 0.00019699503234135433,
"loss": 0.3883,
"step": 144500
},
{
"epoch": 3.86,
"learning_rate": 0.00019698463452938673,
"loss": 0.3866,
"step": 145000
},
{
"epoch": 3.87,
"learning_rate": 0.0001969742367174191,
"loss": 0.3901,
"step": 145500
},
{
"epoch": 3.89,
"learning_rate": 0.00019696383890545146,
"loss": 0.4022,
"step": 146000
},
{
"epoch": 3.9,
"learning_rate": 0.00019695344109348382,
"loss": 0.4021,
"step": 146500
},
{
"epoch": 3.91,
"learning_rate": 0.00019694304328151622,
"loss": 0.3873,
"step": 147000
},
{
"epoch": 3.93,
"learning_rate": 0.00019693264546954858,
"loss": 0.3844,
"step": 147500
},
{
"epoch": 3.94,
"learning_rate": 0.00019692224765758095,
"loss": 0.3824,
"step": 148000
},
{
"epoch": 3.95,
"learning_rate": 0.00019691184984561331,
"loss": 0.392,
"step": 148500
},
{
"epoch": 3.97,
"learning_rate": 0.00019690145203364568,
"loss": 0.407,
"step": 149000
},
{
"epoch": 3.98,
"learning_rate": 0.00019689105422167805,
"loss": 0.3864,
"step": 149500
},
{
"epoch": 3.99,
"learning_rate": 0.0001968806564097104,
"loss": 0.3812,
"step": 150000
},
{
"epoch": 4.0,
"eval_bleu": 0.09,
"eval_gen_len": 18.3779,
"eval_loss": 0.39251887798309326,
"eval_runtime": 1926.9186,
"eval_samples_per_second": 4.875,
"eval_steps_per_second": 2.438,
"step": 150272
},
{
"epoch": 4.01,
"learning_rate": 0.00019687025859774278,
"loss": 0.366,
"step": 150500
},
{
"epoch": 4.02,
"learning_rate": 0.00019685986078577514,
"loss": 0.3288,
"step": 151000
},
{
"epoch": 4.03,
"learning_rate": 0.0001968494629738075,
"loss": 0.3431,
"step": 151500
},
{
"epoch": 4.05,
"learning_rate": 0.00019683906516183988,
"loss": 0.3462,
"step": 152000
},
{
"epoch": 4.06,
"learning_rate": 0.00019682866734987224,
"loss": 0.3496,
"step": 152500
},
{
"epoch": 4.07,
"learning_rate": 0.0001968182695379046,
"loss": 0.3385,
"step": 153000
},
{
"epoch": 4.09,
"learning_rate": 0.00019680787172593697,
"loss": 0.3384,
"step": 153500
},
{
"epoch": 4.1,
"learning_rate": 0.00019679747391396934,
"loss": 0.3389,
"step": 154000
},
{
"epoch": 4.11,
"learning_rate": 0.0001967870761020017,
"loss": 0.3292,
"step": 154500
},
{
"epoch": 4.13,
"learning_rate": 0.00019677667829003407,
"loss": 0.337,
"step": 155000
},
{
"epoch": 4.14,
"learning_rate": 0.00019676628047806644,
"loss": 0.3476,
"step": 155500
},
{
"epoch": 4.15,
"learning_rate": 0.0001967558826660988,
"loss": 0.3404,
"step": 156000
},
{
"epoch": 4.17,
"learning_rate": 0.00019674548485413117,
"loss": 0.3466,
"step": 156500
},
{
"epoch": 4.18,
"learning_rate": 0.00019673508704216354,
"loss": 0.3427,
"step": 157000
},
{
"epoch": 4.19,
"learning_rate": 0.0001967246892301959,
"loss": 0.3435,
"step": 157500
},
{
"epoch": 4.21,
"learning_rate": 0.00019671429141822827,
"loss": 0.3516,
"step": 158000
},
{
"epoch": 4.22,
"learning_rate": 0.00019670389360626066,
"loss": 0.3553,
"step": 158500
},
{
"epoch": 4.23,
"learning_rate": 0.00019669349579429303,
"loss": 0.3511,
"step": 159000
},
{
"epoch": 4.25,
"learning_rate": 0.0001966830979823254,
"loss": 0.3376,
"step": 159500
},
{
"epoch": 4.26,
"learning_rate": 0.00019667270017035776,
"loss": 0.3475,
"step": 160000
},
{
"epoch": 4.27,
"learning_rate": 0.00019666230235839015,
"loss": 0.3548,
"step": 160500
},
{
"epoch": 4.29,
"learning_rate": 0.00019665190454642252,
"loss": 0.3595,
"step": 161000
},
{
"epoch": 4.3,
"learning_rate": 0.00019664150673445488,
"loss": 0.339,
"step": 161500
},
{
"epoch": 4.31,
"learning_rate": 0.00019663110892248725,
"loss": 0.3568,
"step": 162000
},
{
"epoch": 4.33,
"learning_rate": 0.00019662071111051962,
"loss": 0.3578,
"step": 162500
},
{
"epoch": 4.34,
"learning_rate": 0.00019661031329855198,
"loss": 0.3465,
"step": 163000
},
{
"epoch": 4.35,
"learning_rate": 0.00019659991548658435,
"loss": 0.3601,
"step": 163500
},
{
"epoch": 4.37,
"learning_rate": 0.0001965895176746167,
"loss": 0.3594,
"step": 164000
},
{
"epoch": 4.38,
"learning_rate": 0.00019657911986264908,
"loss": 0.3442,
"step": 164500
},
{
"epoch": 4.39,
"learning_rate": 0.00019656872205068144,
"loss": 0.3395,
"step": 165000
},
{
"epoch": 4.41,
"learning_rate": 0.0001965583242387138,
"loss": 0.3419,
"step": 165500
},
{
"epoch": 4.42,
"learning_rate": 0.00019654792642674618,
"loss": 0.3438,
"step": 166000
},
{
"epoch": 4.43,
"learning_rate": 0.00019653752861477854,
"loss": 0.3533,
"step": 166500
},
{
"epoch": 4.45,
"learning_rate": 0.0001965271308028109,
"loss": 0.3508,
"step": 167000
},
{
"epoch": 4.46,
"learning_rate": 0.00019651673299084327,
"loss": 0.3589,
"step": 167500
},
{
"epoch": 4.47,
"learning_rate": 0.00019650633517887564,
"loss": 0.3492,
"step": 168000
},
{
"epoch": 4.49,
"learning_rate": 0.000196495937366908,
"loss": 0.3466,
"step": 168500
},
{
"epoch": 4.5,
"learning_rate": 0.00019648553955494037,
"loss": 0.3509,
"step": 169000
},
{
"epoch": 4.51,
"learning_rate": 0.00019647514174297274,
"loss": 0.3584,
"step": 169500
},
{
"epoch": 4.53,
"learning_rate": 0.0001964647439310051,
"loss": 0.3563,
"step": 170000
},
{
"epoch": 4.54,
"learning_rate": 0.00019645434611903747,
"loss": 0.3431,
"step": 170500
},
{
"epoch": 4.55,
"learning_rate": 0.00019644394830706984,
"loss": 0.3626,
"step": 171000
},
{
"epoch": 4.57,
"learning_rate": 0.0001964335504951022,
"loss": 0.3505,
"step": 171500
},
{
"epoch": 4.58,
"learning_rate": 0.0001964231526831346,
"loss": 0.3638,
"step": 172000
},
{
"epoch": 4.59,
"learning_rate": 0.00019641275487116696,
"loss": 0.3485,
"step": 172500
},
{
"epoch": 4.6,
"learning_rate": 0.00019640235705919933,
"loss": 0.3613,
"step": 173000
},
{
"epoch": 4.62,
"learning_rate": 0.0001963919592472317,
"loss": 0.3576,
"step": 173500
},
{
"epoch": 4.63,
"learning_rate": 0.00019638156143526406,
"loss": 0.3556,
"step": 174000
},
{
"epoch": 4.64,
"learning_rate": 0.00019637116362329645,
"loss": 0.3552,
"step": 174500
},
{
"epoch": 4.66,
"learning_rate": 0.00019636076581132882,
"loss": 0.3521,
"step": 175000
},
{
"epoch": 4.67,
"learning_rate": 0.00019635036799936118,
"loss": 0.3629,
"step": 175500
},
{
"epoch": 4.68,
"learning_rate": 0.00019633997018739355,
"loss": 0.3416,
"step": 176000
},
{
"epoch": 4.7,
"learning_rate": 0.00019632957237542592,
"loss": 0.3583,
"step": 176500
},
{
"epoch": 4.71,
"learning_rate": 0.00019631917456345828,
"loss": 0.3697,
"step": 177000
},
{
"epoch": 4.72,
"learning_rate": 0.00019630877675149065,
"loss": 0.3403,
"step": 177500
},
{
"epoch": 4.74,
"learning_rate": 0.000196298378939523,
"loss": 0.3639,
"step": 178000
},
{
"epoch": 4.75,
"learning_rate": 0.00019628798112755538,
"loss": 0.3585,
"step": 178500
},
{
"epoch": 4.76,
"learning_rate": 0.00019627758331558774,
"loss": 0.3365,
"step": 179000
},
{
"epoch": 4.78,
"learning_rate": 0.0001962671855036201,
"loss": 0.3497,
"step": 179500
},
{
"epoch": 4.79,
"learning_rate": 0.00019625678769165248,
"loss": 0.3505,
"step": 180000
},
{
"epoch": 4.8,
"learning_rate": 0.00019624638987968484,
"loss": 0.3592,
"step": 180500
},
{
"epoch": 4.82,
"learning_rate": 0.0001962359920677172,
"loss": 0.3585,
"step": 181000
},
{
"epoch": 4.83,
"learning_rate": 0.00019622559425574957,
"loss": 0.355,
"step": 181500
},
{
"epoch": 4.84,
"learning_rate": 0.00019621519644378194,
"loss": 0.3688,
"step": 182000
},
{
"epoch": 4.86,
"learning_rate": 0.0001962047986318143,
"loss": 0.3431,
"step": 182500
},
{
"epoch": 4.87,
"learning_rate": 0.00019619440081984667,
"loss": 0.3733,
"step": 183000
},
{
"epoch": 4.88,
"learning_rate": 0.00019618400300787904,
"loss": 0.3615,
"step": 183500
},
{
"epoch": 4.9,
"learning_rate": 0.0001961736051959114,
"loss": 0.3455,
"step": 184000
},
{
"epoch": 4.91,
"learning_rate": 0.00019616320738394377,
"loss": 0.3517,
"step": 184500
},
{
"epoch": 4.92,
"learning_rate": 0.00019615280957197614,
"loss": 0.3566,
"step": 185000
},
{
"epoch": 4.94,
"learning_rate": 0.00019614241176000853,
"loss": 0.3476,
"step": 185500
},
{
"epoch": 4.95,
"learning_rate": 0.0001961320139480409,
"loss": 0.3465,
"step": 186000
},
{
"epoch": 4.96,
"learning_rate": 0.00019612161613607326,
"loss": 0.3526,
"step": 186500
},
{
"epoch": 4.98,
"learning_rate": 0.00019611121832410563,
"loss": 0.3492,
"step": 187000
},
{
"epoch": 4.99,
"learning_rate": 0.000196100820512138,
"loss": 0.3724,
"step": 187500
},
{
"epoch": 5.0,
"eval_bleu": 0.0937,
"eval_gen_len": 18.3788,
"eval_loss": 0.37562912702560425,
"eval_runtime": 1978.9757,
"eval_samples_per_second": 4.746,
"eval_steps_per_second": 2.373,
"step": 187840
},
{
"epoch": 5.0,
"learning_rate": 0.00019609042270017039,
"loss": 0.3439,
"step": 188000
},
{
"epoch": 5.02,
"learning_rate": 0.00019608002488820275,
"loss": 0.3028,
"step": 188500
},
{
"epoch": 5.03,
"learning_rate": 0.00019606962707623512,
"loss": 0.3119,
"step": 189000
},
{
"epoch": 5.04,
"learning_rate": 0.00019605922926426748,
"loss": 0.3091,
"step": 189500
},
{
"epoch": 5.06,
"learning_rate": 0.00019604883145229985,
"loss": 0.3061,
"step": 190000
},
{
"epoch": 5.07,
"learning_rate": 0.00019603843364033222,
"loss": 0.3135,
"step": 190500
},
{
"epoch": 5.08,
"learning_rate": 0.00019602803582836458,
"loss": 0.3028,
"step": 191000
},
{
"epoch": 5.1,
"learning_rate": 0.00019601763801639695,
"loss": 0.3112,
"step": 191500
},
{
"epoch": 5.11,
"learning_rate": 0.0001960072402044293,
"loss": 0.3101,
"step": 192000
},
{
"epoch": 5.12,
"learning_rate": 0.00019599684239246168,
"loss": 0.3264,
"step": 192500
},
{
"epoch": 5.14,
"learning_rate": 0.00019598644458049405,
"loss": 0.3087,
"step": 193000
},
{
"epoch": 5.15,
"learning_rate": 0.0001959760467685264,
"loss": 0.3197,
"step": 193500
},
{
"epoch": 5.16,
"learning_rate": 0.00019596564895655878,
"loss": 0.3071,
"step": 194000
},
{
"epoch": 5.18,
"learning_rate": 0.00019595525114459114,
"loss": 0.3106,
"step": 194500
},
{
"epoch": 5.19,
"learning_rate": 0.0001959448533326235,
"loss": 0.3187,
"step": 195000
},
{
"epoch": 5.2,
"learning_rate": 0.00019593445552065587,
"loss": 0.3206,
"step": 195500
},
{
"epoch": 5.22,
"learning_rate": 0.00019592405770868824,
"loss": 0.3081,
"step": 196000
},
{
"epoch": 5.23,
"learning_rate": 0.0001959136598967206,
"loss": 0.3191,
"step": 196500
},
{
"epoch": 5.24,
"learning_rate": 0.00019590326208475297,
"loss": 0.3177,
"step": 197000
},
{
"epoch": 5.26,
"learning_rate": 0.00019589286427278534,
"loss": 0.3197,
"step": 197500
},
{
"epoch": 5.27,
"learning_rate": 0.00019588246646081773,
"loss": 0.3189,
"step": 198000
},
{
"epoch": 5.28,
"learning_rate": 0.0001958720686488501,
"loss": 0.3172,
"step": 198500
},
{
"epoch": 5.3,
"learning_rate": 0.00019586167083688246,
"loss": 0.3183,
"step": 199000
},
{
"epoch": 5.31,
"learning_rate": 0.00019585127302491483,
"loss": 0.3166,
"step": 199500
},
{
"epoch": 5.32,
"learning_rate": 0.0001958408752129472,
"loss": 0.3154,
"step": 200000
},
{
"epoch": 5.34,
"learning_rate": 0.00019583047740097956,
"loss": 0.3263,
"step": 200500
},
{
"epoch": 5.35,
"learning_rate": 0.00019582007958901193,
"loss": 0.3001,
"step": 201000
},
{
"epoch": 5.36,
"learning_rate": 0.00019580968177704432,
"loss": 0.3107,
"step": 201500
},
{
"epoch": 5.38,
"learning_rate": 0.00019579928396507669,
"loss": 0.3084,
"step": 202000
},
{
"epoch": 5.39,
"learning_rate": 0.00019578888615310905,
"loss": 0.3189,
"step": 202500
},
{
"epoch": 5.4,
"learning_rate": 0.00019577848834114142,
"loss": 0.3198,
"step": 203000
},
{
"epoch": 5.42,
"learning_rate": 0.00019576809052917378,
"loss": 0.3187,
"step": 203500
},
{
"epoch": 5.43,
"learning_rate": 0.00019575769271720615,
"loss": 0.3137,
"step": 204000
},
{
"epoch": 5.44,
"learning_rate": 0.00019574729490523852,
"loss": 0.3232,
"step": 204500
},
{
"epoch": 5.46,
"learning_rate": 0.00019573689709327088,
"loss": 0.3193,
"step": 205000
},
{
"epoch": 5.47,
"learning_rate": 0.00019572649928130325,
"loss": 0.3097,
"step": 205500
},
{
"epoch": 5.48,
"learning_rate": 0.00019571610146933561,
"loss": 0.3185,
"step": 206000
},
{
"epoch": 5.5,
"learning_rate": 0.00019570570365736798,
"loss": 0.3319,
"step": 206500
},
{
"epoch": 5.51,
"learning_rate": 0.00019569530584540035,
"loss": 0.328,
"step": 207000
},
{
"epoch": 5.52,
"learning_rate": 0.0001956849080334327,
"loss": 0.3228,
"step": 207500
},
{
"epoch": 5.54,
"learning_rate": 0.00019567451022146508,
"loss": 0.3299,
"step": 208000
},
{
"epoch": 5.55,
"learning_rate": 0.00019566411240949744,
"loss": 0.3154,
"step": 208500
},
{
"epoch": 5.56,
"learning_rate": 0.0001956537145975298,
"loss": 0.3183,
"step": 209000
},
{
"epoch": 5.58,
"learning_rate": 0.00019564331678556218,
"loss": 0.3235,
"step": 209500
},
{
"epoch": 5.59,
"learning_rate": 0.00019563291897359454,
"loss": 0.3179,
"step": 210000
},
{
"epoch": 5.6,
"learning_rate": 0.0001956225211616269,
"loss": 0.3211,
"step": 210500
},
{
"epoch": 5.62,
"learning_rate": 0.0001956121233496593,
"loss": 0.315,
"step": 211000
},
{
"epoch": 5.63,
"learning_rate": 0.00019560172553769167,
"loss": 0.3191,
"step": 211500
},
{
"epoch": 5.64,
"learning_rate": 0.00019559132772572403,
"loss": 0.3341,
"step": 212000
},
{
"epoch": 5.66,
"learning_rate": 0.0001955809299137564,
"loss": 0.3229,
"step": 212500
},
{
"epoch": 5.67,
"learning_rate": 0.00019557053210178876,
"loss": 0.3277,
"step": 213000
},
{
"epoch": 5.68,
"learning_rate": 0.00019556013428982113,
"loss": 0.3233,
"step": 213500
},
{
"epoch": 5.7,
"learning_rate": 0.0001955497364778535,
"loss": 0.3302,
"step": 214000
},
{
"epoch": 5.71,
"learning_rate": 0.00019553933866588586,
"loss": 0.324,
"step": 214500
},
{
"epoch": 5.72,
"learning_rate": 0.00019552894085391823,
"loss": 0.3315,
"step": 215000
},
{
"epoch": 5.74,
"learning_rate": 0.00019551854304195062,
"loss": 0.3342,
"step": 215500
},
{
"epoch": 5.75,
"learning_rate": 0.000195508145229983,
"loss": 0.3189,
"step": 216000
},
{
"epoch": 5.76,
"learning_rate": 0.00019549774741801535,
"loss": 0.3323,
"step": 216500
},
{
"epoch": 5.78,
"learning_rate": 0.00019548734960604772,
"loss": 0.3163,
"step": 217000
},
{
"epoch": 5.79,
"learning_rate": 0.00019547695179408008,
"loss": 0.3265,
"step": 217500
},
{
"epoch": 5.8,
"learning_rate": 0.00019546655398211245,
"loss": 0.3341,
"step": 218000
},
{
"epoch": 5.82,
"learning_rate": 0.00019545615617014482,
"loss": 0.3283,
"step": 218500
},
{
"epoch": 5.83,
"learning_rate": 0.00019544575835817718,
"loss": 0.3181,
"step": 219000
},
{
"epoch": 5.84,
"learning_rate": 0.00019543536054620955,
"loss": 0.298,
"step": 219500
},
{
"epoch": 5.86,
"learning_rate": 0.00019542496273424191,
"loss": 0.323,
"step": 220000
},
{
"epoch": 5.87,
"learning_rate": 0.00019541456492227428,
"loss": 0.325,
"step": 220500
},
{
"epoch": 5.88,
"learning_rate": 0.00019540416711030665,
"loss": 0.3159,
"step": 221000
},
{
"epoch": 5.9,
"learning_rate": 0.000195393769298339,
"loss": 0.3243,
"step": 221500
},
{
"epoch": 5.91,
"learning_rate": 0.00019538337148637138,
"loss": 0.3195,
"step": 222000
},
{
"epoch": 5.92,
"learning_rate": 0.00019537297367440374,
"loss": 0.3318,
"step": 222500
},
{
"epoch": 5.94,
"learning_rate": 0.0001953625758624361,
"loss": 0.3332,
"step": 223000
},
{
"epoch": 5.95,
"learning_rate": 0.0001953521780504685,
"loss": 0.338,
"step": 223500
},
{
"epoch": 5.96,
"learning_rate": 0.00019534178023850087,
"loss": 0.3242,
"step": 224000
},
{
"epoch": 5.98,
"learning_rate": 0.00019533138242653323,
"loss": 0.3194,
"step": 224500
},
{
"epoch": 5.99,
"learning_rate": 0.0001953209846145656,
"loss": 0.3195,
"step": 225000
},
{
"epoch": 6.0,
"eval_bleu": 0.0981,
"eval_gen_len": 18.3471,
"eval_loss": 0.37137025594711304,
"eval_runtime": 1997.3462,
"eval_samples_per_second": 4.703,
"eval_steps_per_second": 2.352,
"step": 225408
},
{
"epoch": 6.0,
"learning_rate": 0.00019531058680259797,
"loss": 0.3295,
"step": 225500
},
{
"epoch": 6.02,
"learning_rate": 0.00019530018899063033,
"loss": 0.27,
"step": 226000
},
{
"epoch": 6.03,
"learning_rate": 0.0001952897911786627,
"loss": 0.2778,
"step": 226500
},
{
"epoch": 6.04,
"learning_rate": 0.00019527939336669506,
"loss": 0.2797,
"step": 227000
},
{
"epoch": 6.06,
"learning_rate": 0.00019526899555472743,
"loss": 0.2667,
"step": 227500
},
{
"epoch": 6.07,
"learning_rate": 0.0001952585977427598,
"loss": 0.2811,
"step": 228000
},
{
"epoch": 6.08,
"learning_rate": 0.00019524819993079216,
"loss": 0.2732,
"step": 228500
},
{
"epoch": 6.1,
"learning_rate": 0.00019523780211882456,
"loss": 0.2763,
"step": 229000
},
{
"epoch": 6.11,
"learning_rate": 0.00019522740430685692,
"loss": 0.2905,
"step": 229500
},
{
"epoch": 6.12,
"learning_rate": 0.0001952170064948893,
"loss": 0.2811,
"step": 230000
},
{
"epoch": 6.14,
"learning_rate": 0.00019520660868292165,
"loss": 0.2916,
"step": 230500
},
{
"epoch": 6.15,
"learning_rate": 0.00019519621087095402,
"loss": 0.2845,
"step": 231000
},
{
"epoch": 6.16,
"learning_rate": 0.00019518581305898638,
"loss": 0.288,
"step": 231500
},
{
"epoch": 6.18,
"learning_rate": 0.00019517541524701875,
"loss": 0.2876,
"step": 232000
},
{
"epoch": 6.19,
"learning_rate": 0.00019516501743505112,
"loss": 0.2796,
"step": 232500
},
{
"epoch": 6.2,
"learning_rate": 0.00019515461962308348,
"loss": 0.2823,
"step": 233000
},
{
"epoch": 6.22,
"learning_rate": 0.00019514422181111585,
"loss": 0.303,
"step": 233500
},
{
"epoch": 6.23,
"learning_rate": 0.00019513382399914821,
"loss": 0.2855,
"step": 234000
},
{
"epoch": 6.24,
"learning_rate": 0.00019512342618718058,
"loss": 0.2981,
"step": 234500
},
{
"epoch": 6.26,
"learning_rate": 0.00019511302837521295,
"loss": 0.286,
"step": 235000
},
{
"epoch": 6.27,
"learning_rate": 0.0001951026305632453,
"loss": 0.2718,
"step": 235500
},
{
"epoch": 6.28,
"learning_rate": 0.00019509223275127768,
"loss": 0.2918,
"step": 236000
},
{
"epoch": 6.3,
"learning_rate": 0.00019508183493931007,
"loss": 0.2779,
"step": 236500
},
{
"epoch": 6.31,
"learning_rate": 0.00019507143712734244,
"loss": 0.2986,
"step": 237000
},
{
"epoch": 6.32,
"learning_rate": 0.0001950610393153748,
"loss": 0.2819,
"step": 237500
},
{
"epoch": 6.34,
"learning_rate": 0.00019505064150340717,
"loss": 0.2828,
"step": 238000
},
{
"epoch": 6.35,
"learning_rate": 0.00019504024369143953,
"loss": 0.2901,
"step": 238500
},
{
"epoch": 6.36,
"learning_rate": 0.0001950298458794719,
"loss": 0.2908,
"step": 239000
},
{
"epoch": 6.38,
"learning_rate": 0.00019501944806750427,
"loss": 0.2803,
"step": 239500
},
{
"epoch": 6.39,
"learning_rate": 0.00019500905025553663,
"loss": 0.3002,
"step": 240000
},
{
"epoch": 6.4,
"learning_rate": 0.000194998652443569,
"loss": 0.2918,
"step": 240500
},
{
"epoch": 6.42,
"learning_rate": 0.00019498825463160136,
"loss": 0.2911,
"step": 241000
},
{
"epoch": 6.43,
"learning_rate": 0.00019497785681963373,
"loss": 0.2897,
"step": 241500
},
{
"epoch": 6.44,
"learning_rate": 0.0001949674590076661,
"loss": 0.2784,
"step": 242000
},
{
"epoch": 6.45,
"learning_rate": 0.0001949570611956985,
"loss": 0.2749,
"step": 242500
},
{
"epoch": 6.47,
"learning_rate": 0.00019494666338373086,
"loss": 0.3055,
"step": 243000
},
{
"epoch": 6.48,
"learning_rate": 0.00019493626557176322,
"loss": 0.2917,
"step": 243500
},
{
"epoch": 6.49,
"learning_rate": 0.0001949258677597956,
"loss": 0.2856,
"step": 244000
},
{
"epoch": 6.51,
"learning_rate": 0.00019491546994782795,
"loss": 0.2957,
"step": 244500
},
{
"epoch": 6.52,
"learning_rate": 0.00019490507213586032,
"loss": 0.2897,
"step": 245000
},
{
"epoch": 6.53,
"learning_rate": 0.00019489467432389268,
"loss": 0.2983,
"step": 245500
},
{
"epoch": 6.55,
"learning_rate": 0.00019488427651192505,
"loss": 0.2911,
"step": 246000
},
{
"epoch": 6.56,
"learning_rate": 0.00019487387869995742,
"loss": 0.2985,
"step": 246500
},
{
"epoch": 6.57,
"learning_rate": 0.00019486348088798978,
"loss": 0.2989,
"step": 247000
},
{
"epoch": 6.59,
"learning_rate": 0.00019485308307602215,
"loss": 0.295,
"step": 247500
},
{
"epoch": 6.6,
"learning_rate": 0.00019484268526405451,
"loss": 0.2949,
"step": 248000
},
{
"epoch": 6.61,
"learning_rate": 0.00019483228745208688,
"loss": 0.2943,
"step": 248500
},
{
"epoch": 6.63,
"learning_rate": 0.00019482188964011927,
"loss": 0.303,
"step": 249000
},
{
"epoch": 6.64,
"learning_rate": 0.00019481149182815164,
"loss": 0.297,
"step": 249500
},
{
"epoch": 6.65,
"learning_rate": 0.000194801094016184,
"loss": 0.3019,
"step": 250000
},
{
"epoch": 6.67,
"learning_rate": 0.00019479069620421637,
"loss": 0.3005,
"step": 250500
},
{
"epoch": 6.68,
"learning_rate": 0.00019478029839224874,
"loss": 0.294,
"step": 251000
},
{
"epoch": 6.69,
"learning_rate": 0.0001947699005802811,
"loss": 0.2977,
"step": 251500
},
{
"epoch": 6.71,
"learning_rate": 0.00019475950276831347,
"loss": 0.294,
"step": 252000
},
{
"epoch": 6.72,
"learning_rate": 0.00019474910495634584,
"loss": 0.2967,
"step": 252500
},
{
"epoch": 6.73,
"learning_rate": 0.0001947387071443782,
"loss": 0.2987,
"step": 253000
},
{
"epoch": 6.75,
"learning_rate": 0.00019472830933241057,
"loss": 0.2929,
"step": 253500
},
{
"epoch": 6.76,
"learning_rate": 0.00019471791152044293,
"loss": 0.2994,
"step": 254000
},
{
"epoch": 6.77,
"learning_rate": 0.0001947075137084753,
"loss": 0.3023,
"step": 254500
},
{
"epoch": 6.79,
"learning_rate": 0.00019469711589650766,
"loss": 0.2961,
"step": 255000
},
{
"epoch": 6.8,
"learning_rate": 0.00019468671808454003,
"loss": 0.2927,
"step": 255500
},
{
"epoch": 6.81,
"learning_rate": 0.0001946763202725724,
"loss": 0.295,
"step": 256000
},
{
"epoch": 6.83,
"learning_rate": 0.0001946659224606048,
"loss": 0.2967,
"step": 256500
},
{
"epoch": 6.84,
"learning_rate": 0.00019465552464863716,
"loss": 0.3147,
"step": 257000
},
{
"epoch": 6.85,
"learning_rate": 0.00019464512683666952,
"loss": 0.3064,
"step": 257500
},
{
"epoch": 6.87,
"learning_rate": 0.0001946347290247019,
"loss": 0.2907,
"step": 258000
},
{
"epoch": 6.88,
"learning_rate": 0.00019462433121273425,
"loss": 0.3076,
"step": 258500
},
{
"epoch": 6.89,
"learning_rate": 0.00019461393340076662,
"loss": 0.288,
"step": 259000
},
{
"epoch": 6.91,
"learning_rate": 0.00019460353558879899,
"loss": 0.3036,
"step": 259500
},
{
"epoch": 6.92,
"learning_rate": 0.00019459313777683135,
"loss": 0.3001,
"step": 260000
},
{
"epoch": 6.93,
"learning_rate": 0.00019458273996486372,
"loss": 0.2986,
"step": 260500
},
{
"epoch": 6.95,
"learning_rate": 0.00019457234215289608,
"loss": 0.2964,
"step": 261000
},
{
"epoch": 6.96,
"learning_rate": 0.00019456194434092845,
"loss": 0.2935,
"step": 261500
},
{
"epoch": 6.97,
"learning_rate": 0.00019455154652896084,
"loss": 0.3099,
"step": 262000
},
{
"epoch": 6.99,
"learning_rate": 0.0001945411487169932,
"loss": 0.2951,
"step": 262500
},
{
"epoch": 7.0,
"eval_bleu": 0.1004,
"eval_gen_len": 18.389,
"eval_loss": 0.37594369053840637,
"eval_runtime": 1995.597,
"eval_samples_per_second": 4.707,
"eval_steps_per_second": 2.354,
"step": 262976
}
],
"max_steps": 9617408,
"num_train_epochs": 256,
"total_flos": 4.369756739154739e+16,
"trial_name": null,
"trial_params": null
}