{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7601094557616297, "global_step": 210000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.996983692635866e-05, "loss": 7.2434, "step": 500 }, { "epoch": 0.0, "learning_rate": 4.9939673852717336e-05, "loss": 6.3951, "step": 1000 }, { "epoch": 0.0, "eval_loss": 6.350970268249512, "eval_runtime": 3.8385, "eval_samples_per_second": 260.78, "eval_steps_per_second": 16.413, "step": 1000 }, { "epoch": 0.01, "learning_rate": 4.9909510779075997e-05, "loss": 6.2791, "step": 1500 }, { "epoch": 0.01, "learning_rate": 4.9879347705434663e-05, "loss": 6.2011, "step": 2000 }, { "epoch": 0.01, "eval_loss": 6.223337173461914, "eval_runtime": 3.7358, "eval_samples_per_second": 267.946, "eval_steps_per_second": 16.864, "step": 2000 }, { "epoch": 0.01, "learning_rate": 4.984918463179333e-05, "loss": 6.15, "step": 2500 }, { "epoch": 0.01, "learning_rate": 4.9819021558152e-05, "loss": 6.121, "step": 3000 }, { "epoch": 0.01, "eval_loss": 6.134737014770508, "eval_runtime": 3.7442, "eval_samples_per_second": 267.346, "eval_steps_per_second": 16.826, "step": 3000 }, { "epoch": 0.01, "learning_rate": 4.978885848451066e-05, "loss": 6.0839, "step": 3500 }, { "epoch": 0.01, "learning_rate": 4.9758695410869325e-05, "loss": 6.0626, "step": 4000 }, { "epoch": 0.01, "eval_loss": 6.063777923583984, "eval_runtime": 4.0904, "eval_samples_per_second": 244.72, "eval_steps_per_second": 15.402, "step": 4000 }, { "epoch": 0.02, "learning_rate": 4.972853233722799e-05, "loss": 6.0263, "step": 4500 }, { "epoch": 0.02, "learning_rate": 4.969836926358666e-05, "loss": 6.0177, "step": 5000 }, { "epoch": 0.02, "eval_loss": 6.02394962310791, "eval_runtime": 3.9573, "eval_samples_per_second": 252.948, "eval_steps_per_second": 15.92, "step": 5000 }, { "epoch": 0.02, "learning_rate": 4.966820618994532e-05, "loss": 5.9988, "step": 5500 }, { "epoch": 0.02, "learning_rate": 4.963804311630399e-05, "loss": 5.9921, "step": 6000 }, { "epoch": 0.02, "eval_loss": 6.0168046951293945, "eval_runtime": 4.0579, "eval_samples_per_second": 246.677, "eval_steps_per_second": 15.525, "step": 6000 }, { "epoch": 0.02, "learning_rate": 4.9607940368809935e-05, "loss": 5.9751, "step": 6500 }, { "epoch": 0.03, "learning_rate": 4.9577837621315885e-05, "loss": 5.9632, "step": 7000 }, { "epoch": 0.03, "eval_loss": 5.997592926025391, "eval_runtime": 4.2019, "eval_samples_per_second": 238.224, "eval_steps_per_second": 14.993, "step": 7000 }, { "epoch": 0.03, "learning_rate": 4.954767454767455e-05, "loss": 5.9386, "step": 7500 }, { "epoch": 0.03, "learning_rate": 4.951751147403322e-05, "loss": 5.9524, "step": 8000 }, { "epoch": 0.03, "eval_loss": 5.969239711761475, "eval_runtime": 4.0231, "eval_samples_per_second": 248.812, "eval_steps_per_second": 15.659, "step": 8000 }, { "epoch": 0.03, "learning_rate": 4.948734840039188e-05, "loss": 5.9376, "step": 8500 }, { "epoch": 0.03, "learning_rate": 4.9457185326750546e-05, "loss": 5.9328, "step": 9000 }, { "epoch": 0.03, "eval_loss": 5.940969944000244, "eval_runtime": 3.8844, "eval_samples_per_second": 257.697, "eval_steps_per_second": 16.219, "step": 9000 }, { "epoch": 0.03, "learning_rate": 4.942702225310921e-05, "loss": 5.9177, "step": 9500 }, { "epoch": 0.04, "learning_rate": 4.939685917946787e-05, "loss": 5.9162, "step": 10000 }, { "epoch": 0.04, "eval_loss": 5.932608604431152, "eval_runtime": 4.0642, "eval_samples_per_second": 246.298, "eval_steps_per_second": 15.501, "step": 10000 }, { "epoch": 0.04, "learning_rate": 4.936669610582654e-05, "loss": 5.9054, "step": 10500 }, { "epoch": 0.04, "learning_rate": 4.933659335833249e-05, "loss": 5.8906, "step": 11000 }, { "epoch": 0.04, "eval_loss": 5.921965599060059, "eval_runtime": 4.2691, "eval_samples_per_second": 234.477, "eval_steps_per_second": 14.757, "step": 11000 }, { "epoch": 0.04, "learning_rate": 4.9306430284691156e-05, "loss": 5.8952, "step": 11500 }, { "epoch": 0.04, "learning_rate": 4.9276267211049823e-05, "loss": 5.8868, "step": 12000 }, { "epoch": 0.04, "eval_loss": 5.910747528076172, "eval_runtime": 4.3284, "eval_samples_per_second": 231.264, "eval_steps_per_second": 14.555, "step": 12000 }, { "epoch": 0.05, "learning_rate": 4.924610413740849e-05, "loss": 5.8829, "step": 12500 }, { "epoch": 0.05, "learning_rate": 4.921600138991443e-05, "loss": 5.8738, "step": 13000 }, { "epoch": 0.05, "eval_loss": 5.895788669586182, "eval_runtime": 4.2991, "eval_samples_per_second": 232.842, "eval_steps_per_second": 14.654, "step": 13000 }, { "epoch": 0.05, "learning_rate": 4.91858383162731e-05, "loss": 5.8717, "step": 13500 }, { "epoch": 0.05, "learning_rate": 4.915567524263177e-05, "loss": 5.8642, "step": 14000 }, { "epoch": 0.05, "eval_loss": 5.891711235046387, "eval_runtime": 4.3228, "eval_samples_per_second": 231.564, "eval_steps_per_second": 14.574, "step": 14000 }, { "epoch": 0.05, "learning_rate": 4.9125512168990434e-05, "loss": 5.8667, "step": 14500 }, { "epoch": 0.05, "learning_rate": 4.909540942149638e-05, "loss": 5.8494, "step": 15000 }, { "epoch": 0.05, "eval_loss": 5.886107921600342, "eval_runtime": 4.4064, "eval_samples_per_second": 227.167, "eval_steps_per_second": 14.297, "step": 15000 }, { "epoch": 0.06, "learning_rate": 4.906524634785505e-05, "loss": 5.8369, "step": 15500 }, { "epoch": 0.06, "learning_rate": 4.903508327421371e-05, "loss": 5.8414, "step": 16000 }, { "epoch": 0.06, "eval_loss": 5.89373254776001, "eval_runtime": 4.3188, "eval_samples_per_second": 231.776, "eval_steps_per_second": 14.587, "step": 16000 }, { "epoch": 0.06, "learning_rate": 4.900492020057238e-05, "loss": 5.851, "step": 16500 }, { "epoch": 0.06, "learning_rate": 4.8974757126931045e-05, "loss": 5.8512, "step": 17000 }, { "epoch": 0.06, "eval_loss": 5.9001264572143555, "eval_runtime": 4.3173, "eval_samples_per_second": 231.857, "eval_steps_per_second": 14.592, "step": 17000 }, { "epoch": 0.06, "learning_rate": 4.8944594053289705e-05, "loss": 5.8471, "step": 17500 }, { "epoch": 0.07, "learning_rate": 4.8914491305795654e-05, "loss": 5.8319, "step": 18000 }, { "epoch": 0.07, "eval_loss": 5.870309829711914, "eval_runtime": 4.2282, "eval_samples_per_second": 236.743, "eval_steps_per_second": 14.9, "step": 18000 }, { "epoch": 0.07, "learning_rate": 4.888432823215432e-05, "loss": 5.8285, "step": 18500 }, { "epoch": 0.07, "learning_rate": 4.885416515851299e-05, "loss": 5.8282, "step": 19000 }, { "epoch": 0.07, "eval_loss": 5.834588050842285, "eval_runtime": 4.2534, "eval_samples_per_second": 235.344, "eval_steps_per_second": 14.812, "step": 19000 }, { "epoch": 0.07, "learning_rate": 4.882400208487165e-05, "loss": 5.8305, "step": 19500 }, { "epoch": 0.07, "learning_rate": 4.879383901123032e-05, "loss": 5.8159, "step": 20000 }, { "epoch": 0.07, "eval_loss": 5.825201034545898, "eval_runtime": 4.2051, "eval_samples_per_second": 238.046, "eval_steps_per_second": 14.982, "step": 20000 }, { "epoch": 0.07, "learning_rate": 4.876367593758898e-05, "loss": 5.8109, "step": 20500 }, { "epoch": 0.08, "learning_rate": 4.873357319009493e-05, "loss": 5.8206, "step": 21000 }, { "epoch": 0.08, "eval_loss": 5.839576244354248, "eval_runtime": 4.2748, "eval_samples_per_second": 234.162, "eval_steps_per_second": 14.737, "step": 21000 }, { "epoch": 0.08, "learning_rate": 4.87034101164536e-05, "loss": 5.8187, "step": 21500 }, { "epoch": 0.08, "learning_rate": 4.8673247042812266e-05, "loss": 5.826, "step": 22000 }, { "epoch": 0.08, "eval_loss": 5.854990482330322, "eval_runtime": 6.3825, "eval_samples_per_second": 156.835, "eval_steps_per_second": 9.871, "step": 22000 }, { "epoch": 0.08, "learning_rate": 4.8643083969170926e-05, "loss": 5.8221, "step": 22500 }, { "epoch": 0.08, "learning_rate": 4.861292089552959e-05, "loss": 5.8109, "step": 23000 }, { "epoch": 0.08, "eval_loss": 5.85267448425293, "eval_runtime": 4.2815, "eval_samples_per_second": 233.795, "eval_steps_per_second": 14.714, "step": 23000 }, { "epoch": 0.09, "learning_rate": 4.858275782188826e-05, "loss": 5.8124, "step": 23500 }, { "epoch": 0.09, "learning_rate": 4.855259474824693e-05, "loss": 5.8002, "step": 24000 }, { "epoch": 0.09, "eval_loss": 5.857957363128662, "eval_runtime": 4.2472, "eval_samples_per_second": 235.683, "eval_steps_per_second": 14.833, "step": 24000 }, { "epoch": 0.09, "learning_rate": 4.852243167460559e-05, "loss": 5.7933, "step": 24500 }, { "epoch": 0.09, "learning_rate": 4.849232892711154e-05, "loss": 5.7957, "step": 25000 }, { "epoch": 0.09, "eval_loss": 5.843862533569336, "eval_runtime": 4.2251, "eval_samples_per_second": 236.92, "eval_steps_per_second": 14.911, "step": 25000 }, { "epoch": 0.09, "learning_rate": 4.8462165853470204e-05, "loss": 5.7919, "step": 25500 }, { "epoch": 0.09, "learning_rate": 4.8432002779828864e-05, "loss": 5.8001, "step": 26000 }, { "epoch": 0.09, "eval_loss": 5.8175740242004395, "eval_runtime": 4.5898, "eval_samples_per_second": 218.094, "eval_steps_per_second": 13.726, "step": 26000 }, { "epoch": 0.1, "learning_rate": 4.840183970618754e-05, "loss": 5.7874, "step": 26500 }, { "epoch": 0.1, "learning_rate": 4.837173695869348e-05, "loss": 5.7836, "step": 27000 }, { "epoch": 0.1, "eval_loss": 5.7963480949401855, "eval_runtime": 4.2794, "eval_samples_per_second": 233.909, "eval_steps_per_second": 14.722, "step": 27000 }, { "epoch": 0.1, "learning_rate": 4.834157388505215e-05, "loss": 5.7915, "step": 27500 }, { "epoch": 0.1, "learning_rate": 4.8311410811410814e-05, "loss": 5.7797, "step": 28000 }, { "epoch": 0.1, "eval_loss": 5.796699047088623, "eval_runtime": 4.2933, "eval_samples_per_second": 233.155, "eval_steps_per_second": 14.674, "step": 28000 }, { "epoch": 0.1, "learning_rate": 4.828124773776948e-05, "loss": 5.7768, "step": 28500 }, { "epoch": 0.1, "learning_rate": 4.8251144990275424e-05, "loss": 5.7876, "step": 29000 }, { "epoch": 0.1, "eval_loss": 5.799530506134033, "eval_runtime": 4.2207, "eval_samples_per_second": 237.163, "eval_steps_per_second": 14.926, "step": 29000 }, { "epoch": 0.11, "learning_rate": 4.82209819166341e-05, "loss": 5.7904, "step": 29500 }, { "epoch": 0.11, "learning_rate": 4.819081884299276e-05, "loss": 5.7837, "step": 30000 }, { "epoch": 0.11, "eval_loss": 5.798836708068848, "eval_runtime": 4.2831, "eval_samples_per_second": 233.71, "eval_steps_per_second": 14.709, "step": 30000 }, { "epoch": 0.11, "learning_rate": 4.8160655769351425e-05, "loss": 5.778, "step": 30500 }, { "epoch": 0.11, "learning_rate": 4.813049269571009e-05, "loss": 5.7681, "step": 31000 }, { "epoch": 0.11, "eval_loss": 5.811764240264893, "eval_runtime": 4.3287, "eval_samples_per_second": 231.246, "eval_steps_per_second": 14.554, "step": 31000 }, { "epoch": 0.11, "learning_rate": 4.810032962206876e-05, "loss": 5.7695, "step": 31500 }, { "epoch": 0.12, "learning_rate": 4.80702268745747e-05, "loss": 5.7698, "step": 32000 }, { "epoch": 0.12, "eval_loss": 5.799587726593018, "eval_runtime": 6.2437, "eval_samples_per_second": 160.322, "eval_steps_per_second": 10.09, "step": 32000 }, { "epoch": 0.12, "learning_rate": 4.804006380093337e-05, "loss": 5.7613, "step": 32500 }, { "epoch": 0.12, "learning_rate": 4.8009900727292036e-05, "loss": 5.7875, "step": 33000 }, { "epoch": 0.12, "eval_loss": 5.817694664001465, "eval_runtime": 4.2624, "eval_samples_per_second": 234.843, "eval_steps_per_second": 14.78, "step": 33000 }, { "epoch": 0.12, "learning_rate": 4.7979737653650696e-05, "loss": 5.7868, "step": 33500 }, { "epoch": 0.12, "learning_rate": 4.794957458000936e-05, "loss": 5.7672, "step": 34000 }, { "epoch": 0.12, "eval_loss": 5.817095756530762, "eval_runtime": 4.3001, "eval_samples_per_second": 232.788, "eval_steps_per_second": 14.651, "step": 34000 }, { "epoch": 0.12, "learning_rate": 4.791947183251531e-05, "loss": 5.7585, "step": 34500 }, { "epoch": 0.13, "learning_rate": 4.788930875887398e-05, "loss": 5.77, "step": 35000 }, { "epoch": 0.13, "eval_loss": 5.829194068908691, "eval_runtime": 4.4257, "eval_samples_per_second": 226.177, "eval_steps_per_second": 14.235, "step": 35000 }, { "epoch": 0.13, "learning_rate": 4.785914568523264e-05, "loss": 5.7525, "step": 35500 }, { "epoch": 0.13, "learning_rate": 4.782898261159131e-05, "loss": 5.7534, "step": 36000 }, { "epoch": 0.13, "eval_loss": 5.81154203414917, "eval_runtime": 4.3328, "eval_samples_per_second": 231.028, "eval_steps_per_second": 14.54, "step": 36000 }, { "epoch": 0.13, "learning_rate": 4.7798819537949973e-05, "loss": 5.7679, "step": 36500 }, { "epoch": 0.13, "learning_rate": 4.776871679045592e-05, "loss": 5.7461, "step": 37000 }, { "epoch": 0.13, "eval_loss": 5.76007604598999, "eval_runtime": 4.2778, "eval_samples_per_second": 233.997, "eval_steps_per_second": 14.727, "step": 37000 }, { "epoch": 0.14, "learning_rate": 4.773855371681459e-05, "loss": 5.7546, "step": 37500 }, { "epoch": 0.14, "learning_rate": 4.770839064317326e-05, "loss": 5.7542, "step": 38000 }, { "epoch": 0.14, "eval_loss": 5.81342077255249, "eval_runtime": 4.3112, "eval_samples_per_second": 232.186, "eval_steps_per_second": 14.613, "step": 38000 }, { "epoch": 0.14, "learning_rate": 4.767822756953192e-05, "loss": 5.7467, "step": 38500 }, { "epoch": 0.14, "learning_rate": 4.764806449589059e-05, "loss": 5.7487, "step": 39000 }, { "epoch": 0.14, "eval_loss": 5.810147762298584, "eval_runtime": 4.2762, "eval_samples_per_second": 234.084, "eval_steps_per_second": 14.733, "step": 39000 }, { "epoch": 0.14, "learning_rate": 4.7617961748396534e-05, "loss": 5.7516, "step": 39500 }, { "epoch": 0.14, "learning_rate": 4.7587798674755194e-05, "loss": 5.7464, "step": 40000 }, { "epoch": 0.14, "eval_loss": 5.790091514587402, "eval_runtime": 6.3564, "eval_samples_per_second": 157.479, "eval_steps_per_second": 9.911, "step": 40000 }, { "epoch": 0.15, "learning_rate": 4.755763560111387e-05, "loss": 5.7518, "step": 40500 }, { "epoch": 0.15, "learning_rate": 4.752747252747253e-05, "loss": 5.7573, "step": 41000 }, { "epoch": 0.15, "eval_loss": 5.804274082183838, "eval_runtime": 4.355, "eval_samples_per_second": 229.852, "eval_steps_per_second": 14.466, "step": 41000 }, { "epoch": 0.15, "learning_rate": 4.7497309453831195e-05, "loss": 5.7512, "step": 41500 }, { "epoch": 0.15, "learning_rate": 4.746714638018986e-05, "loss": 5.7431, "step": 42000 }, { "epoch": 0.15, "eval_loss": 5.761765480041504, "eval_runtime": 4.3203, "eval_samples_per_second": 231.698, "eval_steps_per_second": 14.582, "step": 42000 }, { "epoch": 0.15, "learning_rate": 4.743704363269581e-05, "loss": 5.7573, "step": 42500 }, { "epoch": 0.16, "learning_rate": 4.740688055905447e-05, "loss": 5.7202, "step": 43000 }, { "epoch": 0.16, "eval_loss": 5.7786478996276855, "eval_runtime": 4.3352, "eval_samples_per_second": 230.9, "eval_steps_per_second": 14.532, "step": 43000 }, { "epoch": 0.16, "learning_rate": 4.737671748541314e-05, "loss": 5.7353, "step": 43500 }, { "epoch": 0.16, "learning_rate": 4.7346554411771805e-05, "loss": 5.7476, "step": 44000 }, { "epoch": 0.16, "eval_loss": 5.780131816864014, "eval_runtime": 4.4034, "eval_samples_per_second": 227.326, "eval_steps_per_second": 14.307, "step": 44000 }, { "epoch": 0.16, "learning_rate": 4.731639133813047e-05, "loss": 5.7345, "step": 44500 }, { "epoch": 0.16, "learning_rate": 4.7286288590636415e-05, "loss": 5.7416, "step": 45000 }, { "epoch": 0.16, "eval_loss": 5.7368245124816895, "eval_runtime": 4.2102, "eval_samples_per_second": 237.757, "eval_steps_per_second": 14.964, "step": 45000 }, { "epoch": 0.16, "learning_rate": 4.725612551699509e-05, "loss": 5.7451, "step": 45500 }, { "epoch": 0.17, "learning_rate": 4.722596244335375e-05, "loss": 5.7401, "step": 46000 }, { "epoch": 0.17, "eval_loss": 5.775325298309326, "eval_runtime": 4.2649, "eval_samples_per_second": 234.709, "eval_steps_per_second": 14.772, "step": 46000 }, { "epoch": 0.17, "learning_rate": 4.7195799369712416e-05, "loss": 5.7308, "step": 46500 }, { "epoch": 0.17, "learning_rate": 4.716563629607108e-05, "loss": 5.7401, "step": 47000 }, { "epoch": 0.17, "eval_loss": 5.7712297439575195, "eval_runtime": 4.3345, "eval_samples_per_second": 230.935, "eval_steps_per_second": 14.534, "step": 47000 }, { "epoch": 0.17, "learning_rate": 4.7135533548577026e-05, "loss": 5.7317, "step": 47500 }, { "epoch": 0.17, "learning_rate": 4.710537047493569e-05, "loss": 5.7336, "step": 48000 }, { "epoch": 0.17, "eval_loss": 5.742641925811768, "eval_runtime": 4.2969, "eval_samples_per_second": 232.958, "eval_steps_per_second": 14.662, "step": 48000 }, { "epoch": 0.18, "learning_rate": 4.707520740129436e-05, "loss": 5.7449, "step": 48500 }, { "epoch": 0.18, "learning_rate": 4.704510465380031e-05, "loss": 5.7297, "step": 49000 }, { "epoch": 0.18, "eval_loss": 5.769224643707275, "eval_runtime": 4.3211, "eval_samples_per_second": 231.651, "eval_steps_per_second": 14.579, "step": 49000 }, { "epoch": 0.18, "learning_rate": 4.701494158015897e-05, "loss": 5.7413, "step": 49500 }, { "epoch": 0.18, "learning_rate": 4.698477850651764e-05, "loss": 5.7347, "step": 50000 }, { "epoch": 0.18, "eval_loss": 5.765679359436035, "eval_runtime": 4.2691, "eval_samples_per_second": 234.477, "eval_steps_per_second": 14.757, "step": 50000 }, { "epoch": 0.18, "learning_rate": 4.69546154328763e-05, "loss": 5.7256, "step": 50500 }, { "epoch": 0.18, "learning_rate": 4.692445235923497e-05, "loss": 5.7183, "step": 51000 }, { "epoch": 0.18, "eval_loss": 5.7880048751831055, "eval_runtime": 8.1705, "eval_samples_per_second": 122.514, "eval_steps_per_second": 7.711, "step": 51000 }, { "epoch": 0.19, "learning_rate": 4.689428928559364e-05, "loss": 5.7377, "step": 51500 }, { "epoch": 0.19, "learning_rate": 4.6864126211952304e-05, "loss": 5.7239, "step": 52000 }, { "epoch": 0.19, "eval_loss": 5.784894943237305, "eval_runtime": 4.3979, "eval_samples_per_second": 227.609, "eval_steps_per_second": 14.325, "step": 52000 }, { "epoch": 0.19, "learning_rate": 4.6833963138310964e-05, "loss": 5.7332, "step": 52500 }, { "epoch": 0.19, "learning_rate": 4.680380006466963e-05, "loss": 5.7316, "step": 53000 }, { "epoch": 0.19, "eval_loss": 5.74093770980835, "eval_runtime": 4.3725, "eval_samples_per_second": 228.931, "eval_steps_per_second": 14.408, "step": 53000 }, { "epoch": 0.19, "learning_rate": 4.67736369910283e-05, "loss": 5.7227, "step": 53500 }, { "epoch": 0.2, "learning_rate": 4.6743473917386965e-05, "loss": 5.7202, "step": 54000 }, { "epoch": 0.2, "eval_loss": 5.766174793243408, "eval_runtime": 4.2835, "eval_samples_per_second": 233.689, "eval_steps_per_second": 14.708, "step": 54000 }, { "epoch": 0.2, "learning_rate": 4.671337116989291e-05, "loss": 5.7194, "step": 54500 }, { "epoch": 0.2, "learning_rate": 4.668326842239886e-05, "loss": 5.7073, "step": 55000 }, { "epoch": 0.2, "eval_loss": 5.740816593170166, "eval_runtime": 4.2663, "eval_samples_per_second": 234.632, "eval_steps_per_second": 14.767, "step": 55000 }, { "epoch": 0.2, "learning_rate": 4.6653105348757525e-05, "loss": 5.7176, "step": 55500 }, { "epoch": 0.2, "learning_rate": 4.662294227511619e-05, "loss": 5.7113, "step": 56000 }, { "epoch": 0.2, "eval_loss": 5.764217376708984, "eval_runtime": 4.4652, "eval_samples_per_second": 224.178, "eval_steps_per_second": 14.109, "step": 56000 }, { "epoch": 0.2, "learning_rate": 4.659277920147486e-05, "loss": 5.7094, "step": 56500 }, { "epoch": 0.21, "learning_rate": 4.656261612783352e-05, "loss": 5.722, "step": 57000 }, { "epoch": 0.21, "eval_loss": 5.7554473876953125, "eval_runtime": 4.3539, "eval_samples_per_second": 229.907, "eval_steps_per_second": 14.47, "step": 57000 }, { "epoch": 0.21, "learning_rate": 4.6532453054192186e-05, "loss": 5.7145, "step": 57500 }, { "epoch": 0.21, "learning_rate": 4.650228998055085e-05, "loss": 5.7077, "step": 58000 }, { "epoch": 0.21, "eval_loss": 5.74335241317749, "eval_runtime": 4.3356, "eval_samples_per_second": 230.882, "eval_steps_per_second": 14.531, "step": 58000 }, { "epoch": 0.21, "learning_rate": 4.64721872330568e-05, "loss": 5.7123, "step": 58500 }, { "epoch": 0.21, "learning_rate": 4.644202415941546e-05, "loss": 5.7163, "step": 59000 }, { "epoch": 0.21, "eval_loss": 5.77748441696167, "eval_runtime": 4.4579, "eval_samples_per_second": 224.543, "eval_steps_per_second": 14.132, "step": 59000 }, { "epoch": 0.22, "learning_rate": 4.6411861085774136e-05, "loss": 5.6978, "step": 59500 }, { "epoch": 0.22, "learning_rate": 4.6381698012132796e-05, "loss": 5.7185, "step": 60000 }, { "epoch": 0.22, "eval_loss": 5.755308151245117, "eval_runtime": 4.4657, "eval_samples_per_second": 224.155, "eval_steps_per_second": 14.108, "step": 60000 }, { "epoch": 0.22, "learning_rate": 4.635153493849146e-05, "loss": 5.7133, "step": 60500 }, { "epoch": 0.22, "learning_rate": 4.632137186485013e-05, "loss": 5.7126, "step": 61000 }, { "epoch": 0.22, "eval_loss": 5.742791175842285, "eval_runtime": 4.4874, "eval_samples_per_second": 223.069, "eval_steps_per_second": 14.039, "step": 61000 }, { "epoch": 0.22, "learning_rate": 4.62912087912088e-05, "loss": 5.7105, "step": 61500 }, { "epoch": 0.22, "learning_rate": 4.626104571756746e-05, "loss": 5.6955, "step": 62000 }, { "epoch": 0.22, "eval_loss": 5.750290870666504, "eval_runtime": 4.4288, "eval_samples_per_second": 226.023, "eval_steps_per_second": 14.225, "step": 62000 }, { "epoch": 0.23, "learning_rate": 4.6230882643926124e-05, "loss": 5.7056, "step": 62500 }, { "epoch": 0.23, "learning_rate": 4.6200779896432074e-05, "loss": 5.7177, "step": 63000 }, { "epoch": 0.23, "eval_loss": 5.7632975578308105, "eval_runtime": 4.4407, "eval_samples_per_second": 225.416, "eval_steps_per_second": 14.187, "step": 63000 }, { "epoch": 0.23, "learning_rate": 4.6170616822790734e-05, "loss": 5.6985, "step": 63500 }, { "epoch": 0.23, "learning_rate": 4.61404537491494e-05, "loss": 5.7103, "step": 64000 }, { "epoch": 0.23, "eval_loss": 5.735149383544922, "eval_runtime": 4.3259, "eval_samples_per_second": 231.399, "eval_steps_per_second": 14.564, "step": 64000 }, { "epoch": 0.23, "learning_rate": 4.611029067550807e-05, "loss": 5.7056, "step": 64500 }, { "epoch": 0.24, "learning_rate": 4.6080127601866735e-05, "loss": 5.7036, "step": 65000 }, { "epoch": 0.24, "eval_loss": 5.733882427215576, "eval_runtime": 4.2423, "eval_samples_per_second": 235.959, "eval_steps_per_second": 14.851, "step": 65000 }, { "epoch": 0.24, "learning_rate": 4.605002485437268e-05, "loss": 5.7037, "step": 65500 }, { "epoch": 0.24, "learning_rate": 4.601986178073135e-05, "loss": 5.7035, "step": 66000 }, { "epoch": 0.24, "eval_loss": 5.746898651123047, "eval_runtime": 4.2969, "eval_samples_per_second": 232.956, "eval_steps_per_second": 14.662, "step": 66000 }, { "epoch": 0.24, "learning_rate": 4.598969870709001e-05, "loss": 5.6979, "step": 66500 }, { "epoch": 0.24, "learning_rate": 4.595953563344868e-05, "loss": 5.6953, "step": 67000 }, { "epoch": 0.24, "eval_loss": 5.743001461029053, "eval_runtime": 6.5732, "eval_samples_per_second": 152.285, "eval_steps_per_second": 9.584, "step": 67000 }, { "epoch": 0.24, "learning_rate": 4.592943288595463e-05, "loss": 5.7201, "step": 67500 }, { "epoch": 0.25, "learning_rate": 4.5899269812313295e-05, "loss": 5.704, "step": 68000 }, { "epoch": 0.25, "eval_loss": 5.756634712219238, "eval_runtime": 4.4285, "eval_samples_per_second": 226.037, "eval_steps_per_second": 14.226, "step": 68000 }, { "epoch": 0.25, "learning_rate": 4.5869106738671955e-05, "loss": 5.7095, "step": 68500 }, { "epoch": 0.25, "learning_rate": 4.583894366503063e-05, "loss": 5.712, "step": 69000 }, { "epoch": 0.25, "eval_loss": 5.764980316162109, "eval_runtime": 4.3235, "eval_samples_per_second": 231.523, "eval_steps_per_second": 14.571, "step": 69000 }, { "epoch": 0.25, "learning_rate": 4.580884091753657e-05, "loss": 5.6976, "step": 69500 }, { "epoch": 0.25, "learning_rate": 4.577867784389523e-05, "loss": 5.7046, "step": 70000 }, { "epoch": 0.25, "eval_loss": 5.744942665100098, "eval_runtime": 4.3426, "eval_samples_per_second": 230.505, "eval_steps_per_second": 14.507, "step": 70000 }, { "epoch": 0.26, "learning_rate": 4.5748514770253906e-05, "loss": 5.7025, "step": 70500 }, { "epoch": 0.26, "learning_rate": 4.5718351696612566e-05, "loss": 5.7031, "step": 71000 }, { "epoch": 0.26, "eval_loss": 5.732196807861328, "eval_runtime": 4.4018, "eval_samples_per_second": 227.407, "eval_steps_per_second": 14.312, "step": 71000 }, { "epoch": 0.26, "learning_rate": 4.568818862297123e-05, "loss": 5.6871, "step": 71500 }, { "epoch": 0.26, "learning_rate": 4.56580255493299e-05, "loss": 5.6842, "step": 72000 }, { "epoch": 0.26, "eval_loss": 5.773156642913818, "eval_runtime": 4.3306, "eval_samples_per_second": 231.146, "eval_steps_per_second": 14.548, "step": 72000 }, { "epoch": 0.26, "learning_rate": 4.562786247568857e-05, "loss": 5.6853, "step": 72500 }, { "epoch": 0.26, "learning_rate": 4.559769940204723e-05, "loss": 5.7022, "step": 73000 }, { "epoch": 0.26, "eval_loss": 5.724014759063721, "eval_runtime": 4.268, "eval_samples_per_second": 234.536, "eval_steps_per_second": 14.761, "step": 73000 }, { "epoch": 0.27, "learning_rate": 4.5567536328405894e-05, "loss": 5.7029, "step": 73500 }, { "epoch": 0.27, "learning_rate": 4.553737325476456e-05, "loss": 5.707, "step": 74000 }, { "epoch": 0.27, "eval_loss": 5.71505880355835, "eval_runtime": 4.2511, "eval_samples_per_second": 235.471, "eval_steps_per_second": 14.82, "step": 74000 }, { "epoch": 0.27, "learning_rate": 4.550727050727051e-05, "loss": 5.7014, "step": 74500 }, { "epoch": 0.27, "learning_rate": 4.547710743362917e-05, "loss": 5.7068, "step": 75000 }, { "epoch": 0.27, "eval_loss": 5.703161239624023, "eval_runtime": 4.3019, "eval_samples_per_second": 232.687, "eval_steps_per_second": 14.645, "step": 75000 }, { "epoch": 0.27, "learning_rate": 4.5446944359987844e-05, "loss": 5.6946, "step": 75500 }, { "epoch": 0.28, "learning_rate": 4.5416781286346505e-05, "loss": 5.6892, "step": 76000 }, { "epoch": 0.28, "eval_loss": 5.7333149909973145, "eval_runtime": 4.2526, "eval_samples_per_second": 235.387, "eval_steps_per_second": 14.815, "step": 76000 }, { "epoch": 0.28, "learning_rate": 4.5386678538852454e-05, "loss": 5.6814, "step": 76500 }, { "epoch": 0.28, "learning_rate": 4.535651546521112e-05, "loss": 5.6979, "step": 77000 }, { "epoch": 0.28, "eval_loss": 5.712403297424316, "eval_runtime": 4.3234, "eval_samples_per_second": 231.532, "eval_steps_per_second": 14.572, "step": 77000 }, { "epoch": 0.28, "learning_rate": 4.532635239156979e-05, "loss": 5.6959, "step": 77500 }, { "epoch": 0.28, "learning_rate": 4.529618931792845e-05, "loss": 5.6791, "step": 78000 }, { "epoch": 0.28, "eval_loss": 5.754906177520752, "eval_runtime": 4.3238, "eval_samples_per_second": 231.508, "eval_steps_per_second": 14.57, "step": 78000 }, { "epoch": 0.28, "learning_rate": 4.5266026244287115e-05, "loss": 5.6984, "step": 78500 }, { "epoch": 0.29, "learning_rate": 4.5235923496793065e-05, "loss": 5.6953, "step": 79000 }, { "epoch": 0.29, "eval_loss": 5.720048427581787, "eval_runtime": 4.3352, "eval_samples_per_second": 230.899, "eval_steps_per_second": 14.532, "step": 79000 }, { "epoch": 0.29, "learning_rate": 4.5205760423151725e-05, "loss": 5.6924, "step": 79500 }, { "epoch": 0.29, "learning_rate": 4.51755973495104e-05, "loss": 5.6845, "step": 80000 }, { "epoch": 0.29, "eval_loss": 5.744686126708984, "eval_runtime": 4.3839, "eval_samples_per_second": 228.335, "eval_steps_per_second": 14.371, "step": 80000 }, { "epoch": 0.29, "learning_rate": 4.514543427586906e-05, "loss": 5.695, "step": 80500 }, { "epoch": 0.29, "learning_rate": 4.511533152837501e-05, "loss": 5.6872, "step": 81000 }, { "epoch": 0.29, "eval_loss": 5.723191738128662, "eval_runtime": 4.2942, "eval_samples_per_second": 233.107, "eval_steps_per_second": 14.671, "step": 81000 }, { "epoch": 0.29, "learning_rate": 4.5085168454733675e-05, "loss": 5.6841, "step": 81500 }, { "epoch": 0.3, "learning_rate": 4.505500538109234e-05, "loss": 5.687, "step": 82000 }, { "epoch": 0.3, "eval_loss": 5.738052845001221, "eval_runtime": 4.282, "eval_samples_per_second": 233.769, "eval_steps_per_second": 14.713, "step": 82000 }, { "epoch": 0.3, "learning_rate": 4.5024842307451e-05, "loss": 5.6913, "step": 82500 }, { "epoch": 0.3, "learning_rate": 4.499467923380967e-05, "loss": 5.69, "step": 83000 }, { "epoch": 0.3, "eval_loss": 5.709909915924072, "eval_runtime": 4.3445, "eval_samples_per_second": 230.406, "eval_steps_per_second": 14.501, "step": 83000 }, { "epoch": 0.3, "learning_rate": 4.496457648631562e-05, "loss": 5.6785, "step": 83500 }, { "epoch": 0.3, "learning_rate": 4.4934413412674286e-05, "loss": 5.6831, "step": 84000 }, { "epoch": 0.3, "eval_loss": 5.698389053344727, "eval_runtime": 6.3376, "eval_samples_per_second": 157.947, "eval_steps_per_second": 9.941, "step": 84000 }, { "epoch": 0.31, "learning_rate": 4.4904250339032946e-05, "loss": 5.6893, "step": 84500 }, { "epoch": 0.31, "learning_rate": 4.4874147591538896e-05, "loss": 5.682, "step": 85000 }, { "epoch": 0.31, "eval_loss": 5.747517108917236, "eval_runtime": 4.4117, "eval_samples_per_second": 226.897, "eval_steps_per_second": 14.28, "step": 85000 }, { "epoch": 0.31, "learning_rate": 4.484398451789756e-05, "loss": 5.6907, "step": 85500 }, { "epoch": 0.31, "learning_rate": 4.481382144425623e-05, "loss": 5.6771, "step": 86000 }, { "epoch": 0.31, "eval_loss": 5.721287727355957, "eval_runtime": 4.3352, "eval_samples_per_second": 230.902, "eval_steps_per_second": 14.532, "step": 86000 }, { "epoch": 0.31, "learning_rate": 4.47836583706149e-05, "loss": 5.6743, "step": 86500 }, { "epoch": 0.31, "learning_rate": 4.475349529697356e-05, "loss": 5.6818, "step": 87000 }, { "epoch": 0.31, "eval_loss": 5.736617088317871, "eval_runtime": 4.3397, "eval_samples_per_second": 230.66, "eval_steps_per_second": 14.517, "step": 87000 }, { "epoch": 0.32, "learning_rate": 4.4723332223332224e-05, "loss": 5.6836, "step": 87500 }, { "epoch": 0.32, "learning_rate": 4.469316914969089e-05, "loss": 5.6862, "step": 88000 }, { "epoch": 0.32, "eval_loss": 5.706295490264893, "eval_runtime": 4.3399, "eval_samples_per_second": 230.65, "eval_steps_per_second": 14.516, "step": 88000 }, { "epoch": 0.32, "learning_rate": 4.466300607604956e-05, "loss": 5.6792, "step": 88500 }, { "epoch": 0.32, "learning_rate": 4.46329033285555e-05, "loss": 5.6706, "step": 89000 }, { "epoch": 0.32, "eval_loss": 5.7102370262146, "eval_runtime": 4.2947, "eval_samples_per_second": 233.076, "eval_steps_per_second": 14.669, "step": 89000 }, { "epoch": 0.32, "learning_rate": 4.4602740254914174e-05, "loss": 5.6775, "step": 89500 }, { "epoch": 0.33, "learning_rate": 4.4572577181272834e-05, "loss": 5.6662, "step": 90000 }, { "epoch": 0.33, "eval_loss": 5.69236421585083, "eval_runtime": 6.3959, "eval_samples_per_second": 156.506, "eval_steps_per_second": 9.85, "step": 90000 }, { "epoch": 0.33, "learning_rate": 4.45424141076315e-05, "loss": 5.6691, "step": 90500 }, { "epoch": 0.33, "learning_rate": 4.4512371686284734e-05, "loss": 5.6839, "step": 91000 }, { "epoch": 0.33, "eval_loss": 5.711584091186523, "eval_runtime": 4.2918, "eval_samples_per_second": 233.234, "eval_steps_per_second": 14.679, "step": 91000 }, { "epoch": 0.33, "learning_rate": 4.44822086126434e-05, "loss": 5.6725, "step": 91500 }, { "epoch": 0.33, "learning_rate": 4.445204553900206e-05, "loss": 5.6789, "step": 92000 }, { "epoch": 0.33, "eval_loss": 5.663390159606934, "eval_runtime": 4.3322, "eval_samples_per_second": 231.063, "eval_steps_per_second": 14.542, "step": 92000 }, { "epoch": 0.33, "learning_rate": 4.442188246536073e-05, "loss": 5.6765, "step": 92500 }, { "epoch": 0.34, "learning_rate": 4.4391719391719395e-05, "loss": 5.6618, "step": 93000 }, { "epoch": 0.34, "eval_loss": 5.73824405670166, "eval_runtime": 4.2539, "eval_samples_per_second": 235.311, "eval_steps_per_second": 14.81, "step": 93000 }, { "epoch": 0.34, "learning_rate": 4.4361556318078055e-05, "loss": 5.6636, "step": 93500 }, { "epoch": 0.34, "learning_rate": 4.433139324443673e-05, "loss": 5.6733, "step": 94000 }, { "epoch": 0.34, "eval_loss": 5.712129592895508, "eval_runtime": 4.3264, "eval_samples_per_second": 231.369, "eval_steps_per_second": 14.562, "step": 94000 }, { "epoch": 0.34, "learning_rate": 4.430123017079539e-05, "loss": 5.6787, "step": 94500 }, { "epoch": 0.34, "learning_rate": 4.427112742330134e-05, "loss": 5.6709, "step": 95000 }, { "epoch": 0.34, "eval_loss": 5.736870288848877, "eval_runtime": 4.381, "eval_samples_per_second": 228.487, "eval_steps_per_second": 14.38, "step": 95000 }, { "epoch": 0.35, "learning_rate": 4.4240964349660005e-05, "loss": 5.6618, "step": 95500 }, { "epoch": 0.35, "learning_rate": 4.421080127601867e-05, "loss": 5.6627, "step": 96000 }, { "epoch": 0.35, "eval_loss": 5.6889142990112305, "eval_runtime": 4.2821, "eval_samples_per_second": 233.762, "eval_steps_per_second": 14.712, "step": 96000 }, { "epoch": 0.35, "learning_rate": 4.418063820237733e-05, "loss": 5.6751, "step": 96500 }, { "epoch": 0.35, "learning_rate": 4.415053545488328e-05, "loss": 5.655, "step": 97000 }, { "epoch": 0.35, "eval_loss": 5.685154438018799, "eval_runtime": 6.9191, "eval_samples_per_second": 144.671, "eval_steps_per_second": 9.105, "step": 97000 }, { "epoch": 0.35, "learning_rate": 4.412037238124195e-05, "loss": 5.6734, "step": 97500 }, { "epoch": 0.35, "learning_rate": 4.4090209307600616e-05, "loss": 5.6649, "step": 98000 }, { "epoch": 0.35, "eval_loss": 5.713215351104736, "eval_runtime": 4.2937, "eval_samples_per_second": 233.132, "eval_steps_per_second": 14.673, "step": 98000 }, { "epoch": 0.36, "learning_rate": 4.4060046233959276e-05, "loss": 5.6724, "step": 98500 }, { "epoch": 0.36, "learning_rate": 4.4029943486465226e-05, "loss": 5.6634, "step": 99000 }, { "epoch": 0.36, "eval_loss": 5.674860000610352, "eval_runtime": 4.3043, "eval_samples_per_second": 232.557, "eval_steps_per_second": 14.636, "step": 99000 }, { "epoch": 0.36, "learning_rate": 4.399978041282389e-05, "loss": 5.6674, "step": 99500 }, { "epoch": 0.36, "learning_rate": 4.396961733918255e-05, "loss": 5.6625, "step": 100000 }, { "epoch": 0.36, "eval_loss": 5.694820880889893, "eval_runtime": 4.396, "eval_samples_per_second": 227.709, "eval_steps_per_second": 14.331, "step": 100000 }, { "epoch": 0.36, "learning_rate": 4.393951459168851e-05, "loss": 5.6711, "step": 100500 }, { "epoch": 0.37, "learning_rate": 4.390935151804717e-05, "loss": 5.6721, "step": 101000 }, { "epoch": 0.37, "eval_loss": 5.711126804351807, "eval_runtime": 6.4814, "eval_samples_per_second": 154.443, "eval_steps_per_second": 9.72, "step": 101000 }, { "epoch": 0.37, "learning_rate": 4.3879188444405836e-05, "loss": 5.6574, "step": 101500 }, { "epoch": 0.37, "learning_rate": 4.38490253707645e-05, "loss": 5.6608, "step": 102000 }, { "epoch": 0.37, "eval_loss": 5.73061990737915, "eval_runtime": 4.2517, "eval_samples_per_second": 235.437, "eval_steps_per_second": 14.818, "step": 102000 }, { "epoch": 0.37, "learning_rate": 4.381886229712317e-05, "loss": 5.6663, "step": 102500 }, { "epoch": 0.37, "learning_rate": 4.378869922348183e-05, "loss": 5.6685, "step": 103000 }, { "epoch": 0.37, "eval_loss": 5.769214630126953, "eval_runtime": 4.3149, "eval_samples_per_second": 231.988, "eval_steps_per_second": 14.601, "step": 103000 }, { "epoch": 0.37, "learning_rate": 4.3758536149840504e-05, "loss": 5.6631, "step": 103500 }, { "epoch": 0.38, "learning_rate": 4.3728373076199164e-05, "loss": 5.6557, "step": 104000 }, { "epoch": 0.38, "eval_loss": 5.712480545043945, "eval_runtime": 4.3273, "eval_samples_per_second": 231.325, "eval_steps_per_second": 14.559, "step": 104000 }, { "epoch": 0.38, "learning_rate": 4.3698270328705114e-05, "loss": 5.6767, "step": 104500 }, { "epoch": 0.38, "learning_rate": 4.3668167581211063e-05, "loss": 5.6665, "step": 105000 }, { "epoch": 0.38, "eval_loss": 5.739169120788574, "eval_runtime": 4.2992, "eval_samples_per_second": 232.836, "eval_steps_per_second": 14.654, "step": 105000 }, { "epoch": 0.38, "learning_rate": 4.363800450756973e-05, "loss": 5.6699, "step": 105500 }, { "epoch": 0.38, "learning_rate": 4.360784143392839e-05, "loss": 5.6603, "step": 106000 }, { "epoch": 0.38, "eval_loss": 5.6816325187683105, "eval_runtime": 4.353, "eval_samples_per_second": 229.954, "eval_steps_per_second": 14.473, "step": 106000 }, { "epoch": 0.39, "learning_rate": 4.357767836028706e-05, "loss": 5.6717, "step": 106500 }, { "epoch": 0.39, "learning_rate": 4.3547515286645725e-05, "loss": 5.6712, "step": 107000 }, { "epoch": 0.39, "eval_loss": 5.7074737548828125, "eval_runtime": 4.3743, "eval_samples_per_second": 228.834, "eval_steps_per_second": 14.402, "step": 107000 }, { "epoch": 0.39, "learning_rate": 4.3517352213004385e-05, "loss": 5.6676, "step": 107500 }, { "epoch": 0.39, "learning_rate": 4.348718913936305e-05, "loss": 5.6547, "step": 108000 }, { "epoch": 0.39, "eval_loss": 5.6991777420043945, "eval_runtime": 4.4989, "eval_samples_per_second": 222.5, "eval_steps_per_second": 14.004, "step": 108000 }, { "epoch": 0.39, "learning_rate": 4.3457086391869e-05, "loss": 5.6638, "step": 108500 }, { "epoch": 0.39, "learning_rate": 4.342692331822767e-05, "loss": 5.6511, "step": 109000 }, { "epoch": 0.39, "eval_loss": 5.679354190826416, "eval_runtime": 4.2833, "eval_samples_per_second": 233.696, "eval_steps_per_second": 14.708, "step": 109000 }, { "epoch": 0.4, "learning_rate": 4.3396760244586335e-05, "loss": 5.6673, "step": 109500 }, { "epoch": 0.4, "learning_rate": 4.3366597170945e-05, "loss": 5.6663, "step": 110000 }, { "epoch": 0.4, "eval_loss": 5.680385589599609, "eval_runtime": 4.3563, "eval_samples_per_second": 229.783, "eval_steps_per_second": 14.462, "step": 110000 }, { "epoch": 0.4, "learning_rate": 4.333643409730366e-05, "loss": 5.6682, "step": 110500 }, { "epoch": 0.4, "learning_rate": 4.330627102366233e-05, "loss": 5.6562, "step": 111000 }, { "epoch": 0.4, "eval_loss": 5.730945587158203, "eval_runtime": 6.374, "eval_samples_per_second": 157.044, "eval_steps_per_second": 9.884, "step": 111000 }, { "epoch": 0.4, "learning_rate": 4.3276107950020996e-05, "loss": 5.6656, "step": 111500 }, { "epoch": 0.41, "learning_rate": 4.3246005202526946e-05, "loss": 5.6546, "step": 112000 }, { "epoch": 0.41, "eval_loss": 5.708312034606934, "eval_runtime": 4.2353, "eval_samples_per_second": 236.345, "eval_steps_per_second": 14.875, "step": 112000 }, { "epoch": 0.41, "learning_rate": 4.3215842128885606e-05, "loss": 5.6529, "step": 112500 }, { "epoch": 0.41, "learning_rate": 4.318567905524428e-05, "loss": 5.656, "step": 113000 }, { "epoch": 0.41, "eval_loss": 5.711429119110107, "eval_runtime": 4.3481, "eval_samples_per_second": 230.216, "eval_steps_per_second": 14.489, "step": 113000 }, { "epoch": 0.41, "learning_rate": 4.315551598160294e-05, "loss": 5.6731, "step": 113500 }, { "epoch": 0.41, "learning_rate": 4.312535290796161e-05, "loss": 5.6712, "step": 114000 }, { "epoch": 0.41, "eval_loss": 5.6710896492004395, "eval_runtime": 4.2998, "eval_samples_per_second": 232.804, "eval_steps_per_second": 14.652, "step": 114000 }, { "epoch": 0.41, "learning_rate": 4.3095189834320274e-05, "loss": 5.6619, "step": 114500 }, { "epoch": 0.42, "learning_rate": 4.306502676067894e-05, "loss": 5.6473, "step": 115000 }, { "epoch": 0.42, "eval_loss": 5.6910600662231445, "eval_runtime": 4.3442, "eval_samples_per_second": 230.421, "eval_steps_per_second": 14.502, "step": 115000 }, { "epoch": 0.42, "learning_rate": 4.3034924013184884e-05, "loss": 5.6577, "step": 115500 }, { "epoch": 0.42, "learning_rate": 4.300476093954355e-05, "loss": 5.6352, "step": 116000 }, { "epoch": 0.42, "eval_loss": 5.7348408699035645, "eval_runtime": 4.3322, "eval_samples_per_second": 231.062, "eval_steps_per_second": 14.542, "step": 116000 }, { "epoch": 0.42, "learning_rate": 4.297459786590222e-05, "loss": 5.647, "step": 116500 }, { "epoch": 0.42, "learning_rate": 4.294443479226088e-05, "loss": 5.6602, "step": 117000 }, { "epoch": 0.42, "eval_loss": 5.7037835121154785, "eval_runtime": 6.3915, "eval_samples_per_second": 156.614, "eval_steps_per_second": 9.857, "step": 117000 }, { "epoch": 0.43, "learning_rate": 4.2914271718619545e-05, "loss": 5.6598, "step": 117500 }, { "epoch": 0.43, "learning_rate": 4.288410864497821e-05, "loss": 5.645, "step": 118000 }, { "epoch": 0.43, "eval_loss": 5.670388698577881, "eval_runtime": 4.2976, "eval_samples_per_second": 232.922, "eval_steps_per_second": 14.659, "step": 118000 }, { "epoch": 0.43, "learning_rate": 4.285394557133688e-05, "loss": 5.637, "step": 118500 }, { "epoch": 0.43, "learning_rate": 4.282378249769554e-05, "loss": 5.6611, "step": 119000 }, { "epoch": 0.43, "eval_loss": 5.681214332580566, "eval_runtime": 4.3365, "eval_samples_per_second": 230.833, "eval_steps_per_second": 14.528, "step": 119000 }, { "epoch": 0.43, "learning_rate": 4.2793679750201495e-05, "loss": 5.6413, "step": 119500 }, { "epoch": 0.43, "learning_rate": 4.276357700270744e-05, "loss": 5.6442, "step": 120000 }, { "epoch": 0.43, "eval_loss": 5.678561687469482, "eval_runtime": 4.2838, "eval_samples_per_second": 233.671, "eval_steps_per_second": 14.707, "step": 120000 }, { "epoch": 0.44, "learning_rate": 4.2733413929066105e-05, "loss": 5.6429, "step": 120500 }, { "epoch": 0.44, "learning_rate": 4.270325085542477e-05, "loss": 5.6572, "step": 121000 }, { "epoch": 0.44, "eval_loss": 5.70313024520874, "eval_runtime": 4.2374, "eval_samples_per_second": 236.231, "eval_steps_per_second": 14.868, "step": 121000 }, { "epoch": 0.44, "learning_rate": 4.267308778178344e-05, "loss": 5.6563, "step": 121500 }, { "epoch": 0.44, "learning_rate": 4.26429247081421e-05, "loss": 5.6631, "step": 122000 }, { "epoch": 0.44, "eval_loss": 5.692766189575195, "eval_runtime": 4.2623, "eval_samples_per_second": 234.85, "eval_steps_per_second": 14.781, "step": 122000 }, { "epoch": 0.44, "learning_rate": 4.261276163450077e-05, "loss": 5.6386, "step": 122500 }, { "epoch": 0.45, "learning_rate": 4.258259856085943e-05, "loss": 5.6553, "step": 123000 }, { "epoch": 0.45, "eval_loss": 5.708261013031006, "eval_runtime": 4.2737, "eval_samples_per_second": 234.224, "eval_steps_per_second": 14.741, "step": 123000 }, { "epoch": 0.45, "learning_rate": 4.2552495813365376e-05, "loss": 5.6524, "step": 123500 }, { "epoch": 0.45, "learning_rate": 4.252233273972405e-05, "loss": 5.6521, "step": 124000 }, { "epoch": 0.45, "eval_loss": 5.687132358551025, "eval_runtime": 4.4286, "eval_samples_per_second": 226.029, "eval_steps_per_second": 14.226, "step": 124000 }, { "epoch": 0.45, "learning_rate": 4.249216966608271e-05, "loss": 5.6559, "step": 124500 }, { "epoch": 0.45, "learning_rate": 4.2462006592441377e-05, "loss": 5.6644, "step": 125000 }, { "epoch": 0.45, "eval_loss": 5.712847709655762, "eval_runtime": 4.3174, "eval_samples_per_second": 231.852, "eval_steps_per_second": 14.592, "step": 125000 }, { "epoch": 0.45, "learning_rate": 4.2431843518800044e-05, "loss": 5.6407, "step": 125500 }, { "epoch": 0.46, "learning_rate": 4.240174077130599e-05, "loss": 5.6421, "step": 126000 }, { "epoch": 0.46, "eval_loss": 5.716492652893066, "eval_runtime": 4.2976, "eval_samples_per_second": 232.92, "eval_steps_per_second": 14.659, "step": 126000 }, { "epoch": 0.46, "learning_rate": 4.237157769766465e-05, "loss": 5.6653, "step": 126500 }, { "epoch": 0.46, "learning_rate": 4.234141462402332e-05, "loss": 5.6465, "step": 127000 }, { "epoch": 0.46, "eval_loss": 5.711600303649902, "eval_runtime": 4.3593, "eval_samples_per_second": 229.622, "eval_steps_per_second": 14.452, "step": 127000 }, { "epoch": 0.46, "learning_rate": 4.231125155038199e-05, "loss": 5.6545, "step": 127500 }, { "epoch": 0.46, "learning_rate": 4.2281088476740654e-05, "loss": 5.653, "step": 128000 }, { "epoch": 0.46, "eval_loss": 5.694251537322998, "eval_runtime": 6.4453, "eval_samples_per_second": 155.307, "eval_steps_per_second": 9.775, "step": 128000 }, { "epoch": 0.47, "learning_rate": 4.2250925403099314e-05, "loss": 5.6472, "step": 128500 }, { "epoch": 0.47, "learning_rate": 4.222076232945799e-05, "loss": 5.6546, "step": 129000 }, { "epoch": 0.47, "eval_loss": 5.6814446449279785, "eval_runtime": 4.3371, "eval_samples_per_second": 230.802, "eval_steps_per_second": 14.526, "step": 129000 }, { "epoch": 0.47, "learning_rate": 4.219059925581665e-05, "loss": 5.6483, "step": 129500 }, { "epoch": 0.47, "learning_rate": 4.216055683446988e-05, "loss": 5.654, "step": 130000 }, { "epoch": 0.47, "eval_loss": 5.7155609130859375, "eval_runtime": 4.3569, "eval_samples_per_second": 229.748, "eval_steps_per_second": 14.46, "step": 130000 }, { "epoch": 0.47, "learning_rate": 4.213039376082855e-05, "loss": 5.6427, "step": 130500 }, { "epoch": 0.47, "learning_rate": 4.210023068718721e-05, "loss": 5.6526, "step": 131000 }, { "epoch": 0.47, "eval_loss": 5.657435417175293, "eval_runtime": 4.288, "eval_samples_per_second": 233.441, "eval_steps_per_second": 14.692, "step": 131000 }, { "epoch": 0.48, "learning_rate": 4.2070067613545875e-05, "loss": 5.6495, "step": 131500 }, { "epoch": 0.48, "learning_rate": 4.203990453990454e-05, "loss": 5.649, "step": 132000 }, { "epoch": 0.48, "eval_loss": 5.6852569580078125, "eval_runtime": 4.3096, "eval_samples_per_second": 232.275, "eval_steps_per_second": 14.619, "step": 132000 }, { "epoch": 0.48, "learning_rate": 4.200980179241049e-05, "loss": 5.6591, "step": 132500 }, { "epoch": 0.48, "learning_rate": 4.197963871876915e-05, "loss": 5.6427, "step": 133000 }, { "epoch": 0.48, "eval_loss": 5.6808366775512695, "eval_runtime": 4.2828, "eval_samples_per_second": 233.726, "eval_steps_per_second": 14.71, "step": 133000 }, { "epoch": 0.48, "learning_rate": 4.1949475645127825e-05, "loss": 5.6409, "step": 133500 }, { "epoch": 0.49, "learning_rate": 4.1919312571486485e-05, "loss": 5.6436, "step": 134000 }, { "epoch": 0.49, "eval_loss": 5.664061546325684, "eval_runtime": 4.2839, "eval_samples_per_second": 233.663, "eval_steps_per_second": 14.706, "step": 134000 }, { "epoch": 0.49, "learning_rate": 4.188914949784515e-05, "loss": 5.6421, "step": 134500 }, { "epoch": 0.49, "learning_rate": 4.185898642420382e-05, "loss": 5.6319, "step": 135000 }, { "epoch": 0.49, "eval_loss": 5.68691349029541, "eval_runtime": 4.3423, "eval_samples_per_second": 230.522, "eval_steps_per_second": 14.508, "step": 135000 }, { "epoch": 0.49, "learning_rate": 4.1828823350562486e-05, "loss": 5.6542, "step": 135500 }, { "epoch": 0.49, "learning_rate": 4.1798660276921146e-05, "loss": 5.6393, "step": 136000 }, { "epoch": 0.49, "eval_loss": 5.684676647186279, "eval_runtime": 4.3763, "eval_samples_per_second": 228.73, "eval_steps_per_second": 14.396, "step": 136000 }, { "epoch": 0.49, "learning_rate": 4.176849720327981e-05, "loss": 5.6442, "step": 136500 }, { "epoch": 0.5, "learning_rate": 4.173839445578576e-05, "loss": 5.6363, "step": 137000 }, { "epoch": 0.5, "eval_loss": 5.680596828460693, "eval_runtime": 4.349, "eval_samples_per_second": 230.166, "eval_steps_per_second": 14.486, "step": 137000 }, { "epoch": 0.5, "learning_rate": 4.170823138214443e-05, "loss": 5.6416, "step": 137500 }, { "epoch": 0.5, "learning_rate": 4.167812863465038e-05, "loss": 5.648, "step": 138000 }, { "epoch": 0.5, "eval_loss": 5.648463249206543, "eval_runtime": 4.3014, "eval_samples_per_second": 232.713, "eval_steps_per_second": 14.646, "step": 138000 }, { "epoch": 0.5, "learning_rate": 4.164796556100904e-05, "loss": 5.6414, "step": 138500 }, { "epoch": 0.5, "learning_rate": 4.1617802487367706e-05, "loss": 5.6297, "step": 139000 }, { "epoch": 0.5, "eval_loss": 5.679803371429443, "eval_runtime": 4.2827, "eval_samples_per_second": 233.729, "eval_steps_per_second": 14.71, "step": 139000 }, { "epoch": 0.5, "learning_rate": 4.1587639413726373e-05, "loss": 5.6411, "step": 139500 }, { "epoch": 0.51, "learning_rate": 4.155747634008504e-05, "loss": 5.6442, "step": 140000 }, { "epoch": 0.51, "eval_loss": 5.673394203186035, "eval_runtime": 4.3412, "eval_samples_per_second": 230.582, "eval_steps_per_second": 14.512, "step": 140000 }, { "epoch": 0.51, "learning_rate": 4.15273132664437e-05, "loss": 5.6313, "step": 140500 }, { "epoch": 0.51, "learning_rate": 4.149715019280237e-05, "loss": 5.6546, "step": 141000 }, { "epoch": 0.51, "eval_loss": 5.6783528327941895, "eval_runtime": 4.4144, "eval_samples_per_second": 226.756, "eval_steps_per_second": 14.271, "step": 141000 }, { "epoch": 0.51, "learning_rate": 4.146704744530832e-05, "loss": 5.6374, "step": 141500 }, { "epoch": 0.51, "learning_rate": 4.1436884371666984e-05, "loss": 5.6325, "step": 142000 }, { "epoch": 0.51, "eval_loss": 5.645352840423584, "eval_runtime": 4.3033, "eval_samples_per_second": 232.612, "eval_steps_per_second": 14.64, "step": 142000 }, { "epoch": 0.52, "learning_rate": 4.1406721298025644e-05, "loss": 5.643, "step": 142500 }, { "epoch": 0.52, "learning_rate": 4.137655822438432e-05, "loss": 5.6386, "step": 143000 }, { "epoch": 0.52, "eval_loss": 5.695677757263184, "eval_runtime": 4.2949, "eval_samples_per_second": 233.065, "eval_steps_per_second": 14.668, "step": 143000 }, { "epoch": 0.52, "learning_rate": 4.134639515074298e-05, "loss": 5.6337, "step": 143500 }, { "epoch": 0.52, "learning_rate": 4.131629240324893e-05, "loss": 5.6236, "step": 144000 }, { "epoch": 0.52, "eval_loss": 5.678653240203857, "eval_runtime": 4.4357, "eval_samples_per_second": 225.669, "eval_steps_per_second": 14.203, "step": 144000 }, { "epoch": 0.52, "learning_rate": 4.1286129329607595e-05, "loss": 5.639, "step": 144500 }, { "epoch": 0.52, "learning_rate": 4.1255966255966255e-05, "loss": 5.6269, "step": 145000 }, { "epoch": 0.52, "eval_loss": 5.681842803955078, "eval_runtime": 4.3725, "eval_samples_per_second": 228.929, "eval_steps_per_second": 14.408, "step": 145000 }, { "epoch": 0.53, "learning_rate": 4.122580318232492e-05, "loss": 5.635, "step": 145500 }, { "epoch": 0.53, "learning_rate": 4.119564010868359e-05, "loss": 5.6285, "step": 146000 }, { "epoch": 0.53, "eval_loss": 5.673871040344238, "eval_runtime": 4.3771, "eval_samples_per_second": 228.689, "eval_steps_per_second": 14.393, "step": 146000 }, { "epoch": 0.53, "learning_rate": 4.1165477035042256e-05, "loss": 5.6383, "step": 146500 }, { "epoch": 0.53, "learning_rate": 4.11353742875482e-05, "loss": 5.65, "step": 147000 }, { "epoch": 0.53, "eval_loss": 5.630629539489746, "eval_runtime": 4.41, "eval_samples_per_second": 226.983, "eval_steps_per_second": 14.286, "step": 147000 }, { "epoch": 0.53, "learning_rate": 4.110521121390687e-05, "loss": 5.6381, "step": 147500 }, { "epoch": 0.54, "learning_rate": 4.107504814026553e-05, "loss": 5.6313, "step": 148000 }, { "epoch": 0.54, "eval_loss": 5.6462812423706055, "eval_runtime": 4.3445, "eval_samples_per_second": 230.407, "eval_steps_per_second": 14.501, "step": 148000 }, { "epoch": 0.54, "learning_rate": 4.10448850666242e-05, "loss": 5.6347, "step": 148500 }, { "epoch": 0.54, "learning_rate": 4.1014721992982866e-05, "loss": 5.6412, "step": 149000 }, { "epoch": 0.54, "eval_loss": 5.668566703796387, "eval_runtime": 4.3418, "eval_samples_per_second": 230.551, "eval_steps_per_second": 14.51, "step": 149000 }, { "epoch": 0.54, "learning_rate": 4.098455891934153e-05, "loss": 5.639, "step": 149500 }, { "epoch": 0.54, "learning_rate": 4.0954456171847476e-05, "loss": 5.6278, "step": 150000 }, { "epoch": 0.54, "eval_loss": 5.6881489753723145, "eval_runtime": 4.455, "eval_samples_per_second": 224.692, "eval_steps_per_second": 14.141, "step": 150000 }, { "epoch": 0.54, "learning_rate": 4.092429309820614e-05, "loss": 5.6408, "step": 150500 }, { "epoch": 0.55, "learning_rate": 4.089413002456481e-05, "loss": 5.637, "step": 151000 }, { "epoch": 0.55, "eval_loss": 5.688764572143555, "eval_runtime": 6.2867, "eval_samples_per_second": 159.224, "eval_steps_per_second": 10.021, "step": 151000 }, { "epoch": 0.55, "learning_rate": 4.086396695092348e-05, "loss": 5.646, "step": 151500 }, { "epoch": 0.55, "learning_rate": 4.083380387728214e-05, "loss": 5.626, "step": 152000 }, { "epoch": 0.55, "eval_loss": 5.671611785888672, "eval_runtime": 4.3616, "eval_samples_per_second": 229.503, "eval_steps_per_second": 14.444, "step": 152000 }, { "epoch": 0.55, "learning_rate": 4.080364080364081e-05, "loss": 5.6378, "step": 152500 }, { "epoch": 0.55, "learning_rate": 4.0773538056146754e-05, "loss": 5.6338, "step": 153000 }, { "epoch": 0.55, "eval_loss": 5.616012096405029, "eval_runtime": 4.4085, "eval_samples_per_second": 227.06, "eval_steps_per_second": 14.29, "step": 153000 }, { "epoch": 0.56, "learning_rate": 4.0743374982505414e-05, "loss": 5.6318, "step": 153500 }, { "epoch": 0.56, "learning_rate": 4.071321190886409e-05, "loss": 5.6361, "step": 154000 }, { "epoch": 0.56, "eval_loss": 5.667599201202393, "eval_runtime": 6.2583, "eval_samples_per_second": 159.947, "eval_steps_per_second": 10.067, "step": 154000 }, { "epoch": 0.56, "learning_rate": 4.068304883522275e-05, "loss": 5.625, "step": 154500 }, { "epoch": 0.56, "learning_rate": 4.0652885761581415e-05, "loss": 5.6336, "step": 155000 }, { "epoch": 0.56, "eval_loss": 5.684675216674805, "eval_runtime": 4.3154, "eval_samples_per_second": 231.962, "eval_steps_per_second": 14.599, "step": 155000 }, { "epoch": 0.56, "learning_rate": 4.062272268794008e-05, "loss": 5.6388, "step": 155500 }, { "epoch": 0.56, "learning_rate": 4.059255961429875e-05, "loss": 5.6351, "step": 156000 }, { "epoch": 0.56, "eval_loss": 5.675138473510742, "eval_runtime": 4.3055, "eval_samples_per_second": 232.494, "eval_steps_per_second": 14.632, "step": 156000 }, { "epoch": 0.57, "learning_rate": 4.056239654065741e-05, "loss": 5.6266, "step": 156500 }, { "epoch": 0.57, "learning_rate": 4.053229379316336e-05, "loss": 5.6408, "step": 157000 }, { "epoch": 0.57, "eval_loss": 5.615994453430176, "eval_runtime": 4.3074, "eval_samples_per_second": 232.39, "eval_steps_per_second": 14.626, "step": 157000 }, { "epoch": 0.57, "learning_rate": 4.0502130719522025e-05, "loss": 5.6399, "step": 157500 }, { "epoch": 0.57, "learning_rate": 4.047196764588069e-05, "loss": 5.6232, "step": 158000 }, { "epoch": 0.57, "eval_loss": 5.652904033660889, "eval_runtime": 4.3343, "eval_samples_per_second": 230.949, "eval_steps_per_second": 14.535, "step": 158000 }, { "epoch": 0.57, "learning_rate": 4.044186489838664e-05, "loss": 5.6249, "step": 158500 }, { "epoch": 0.58, "learning_rate": 4.041170182474531e-05, "loss": 5.6319, "step": 159000 }, { "epoch": 0.58, "eval_loss": 5.64531946182251, "eval_runtime": 5.6252, "eval_samples_per_second": 177.948, "eval_steps_per_second": 11.2, "step": 159000 }, { "epoch": 0.58, "learning_rate": 4.038153875110397e-05, "loss": 5.6244, "step": 159500 }, { "epoch": 0.58, "learning_rate": 4.0351375677462636e-05, "loss": 5.6204, "step": 160000 }, { "epoch": 0.58, "eval_loss": 5.661260604858398, "eval_runtime": 4.3573, "eval_samples_per_second": 229.732, "eval_steps_per_second": 14.459, "step": 160000 }, { "epoch": 0.58, "learning_rate": 4.03212126038213e-05, "loss": 5.6353, "step": 160500 }, { "epoch": 0.58, "learning_rate": 4.0291109856327246e-05, "loss": 5.6231, "step": 161000 }, { "epoch": 0.58, "eval_loss": 5.666103363037109, "eval_runtime": 4.5226, "eval_samples_per_second": 221.333, "eval_steps_per_second": 13.93, "step": 161000 }, { "epoch": 0.58, "learning_rate": 4.026094678268591e-05, "loss": 5.6318, "step": 161500 }, { "epoch": 0.59, "learning_rate": 4.023078370904458e-05, "loss": 5.6322, "step": 162000 }, { "epoch": 0.59, "eval_loss": 5.640702247619629, "eval_runtime": 4.3357, "eval_samples_per_second": 230.872, "eval_steps_per_second": 14.53, "step": 162000 }, { "epoch": 0.59, "learning_rate": 4.020062063540325e-05, "loss": 5.6135, "step": 162500 }, { "epoch": 0.59, "learning_rate": 4.017045756176191e-05, "loss": 5.6369, "step": 163000 }, { "epoch": 0.59, "eval_loss": 5.658245086669922, "eval_runtime": 4.353, "eval_samples_per_second": 229.957, "eval_steps_per_second": 14.473, "step": 163000 }, { "epoch": 0.59, "learning_rate": 4.014029448812058e-05, "loss": 5.6445, "step": 163500 }, { "epoch": 0.59, "learning_rate": 4.0110191740626523e-05, "loss": 5.6337, "step": 164000 }, { "epoch": 0.59, "eval_loss": 5.672937393188477, "eval_runtime": 4.4799, "eval_samples_per_second": 223.441, "eval_steps_per_second": 14.063, "step": 164000 }, { "epoch": 0.6, "learning_rate": 4.008002866698519e-05, "loss": 5.6349, "step": 164500 }, { "epoch": 0.6, "learning_rate": 4.004986559334386e-05, "loss": 5.6278, "step": 165000 }, { "epoch": 0.6, "eval_loss": 5.6635026931762695, "eval_runtime": 6.1168, "eval_samples_per_second": 163.647, "eval_steps_per_second": 10.299, "step": 165000 }, { "epoch": 0.6, "learning_rate": 4.0019702519702524e-05, "loss": 5.6318, "step": 165500 }, { "epoch": 0.6, "learning_rate": 3.9989539446061184e-05, "loss": 5.6371, "step": 166000 }, { "epoch": 0.6, "eval_loss": 5.643805503845215, "eval_runtime": 4.2716, "eval_samples_per_second": 234.336, "eval_steps_per_second": 14.748, "step": 166000 }, { "epoch": 0.6, "learning_rate": 3.995943669856714e-05, "loss": 5.6145, "step": 166500 }, { "epoch": 0.6, "learning_rate": 3.99292736249258e-05, "loss": 5.624, "step": 167000 }, { "epoch": 0.6, "eval_loss": 5.633095741271973, "eval_runtime": 4.3138, "eval_samples_per_second": 232.048, "eval_steps_per_second": 14.604, "step": 167000 }, { "epoch": 0.61, "learning_rate": 3.989911055128447e-05, "loss": 5.619, "step": 167500 }, { "epoch": 0.61, "learning_rate": 3.9868947477643135e-05, "loss": 5.6198, "step": 168000 }, { "epoch": 0.61, "eval_loss": 5.6540656089782715, "eval_runtime": 4.3595, "eval_samples_per_second": 229.615, "eval_steps_per_second": 14.451, "step": 168000 }, { "epoch": 0.61, "learning_rate": 3.98387844040018e-05, "loss": 5.624, "step": 168500 }, { "epoch": 0.61, "learning_rate": 3.9808681656507745e-05, "loss": 5.6243, "step": 169000 }, { "epoch": 0.61, "eval_loss": 5.651320457458496, "eval_runtime": 4.3133, "eval_samples_per_second": 232.072, "eval_steps_per_second": 14.606, "step": 169000 }, { "epoch": 0.61, "learning_rate": 3.977851858286641e-05, "loss": 5.6133, "step": 169500 }, { "epoch": 0.62, "learning_rate": 3.974835550922508e-05, "loss": 5.6118, "step": 170000 }, { "epoch": 0.62, "eval_loss": 5.682806491851807, "eval_runtime": 4.4555, "eval_samples_per_second": 224.668, "eval_steps_per_second": 14.14, "step": 170000 }, { "epoch": 0.62, "learning_rate": 3.971819243558374e-05, "loss": 5.6085, "step": 170500 }, { "epoch": 0.62, "learning_rate": 3.9688029361942406e-05, "loss": 5.6315, "step": 171000 }, { "epoch": 0.62, "eval_loss": 5.685708045959473, "eval_runtime": 4.3306, "eval_samples_per_second": 231.143, "eval_steps_per_second": 14.547, "step": 171000 }, { "epoch": 0.62, "learning_rate": 3.965786628830107e-05, "loss": 5.6271, "step": 171500 }, { "epoch": 0.62, "learning_rate": 3.962770321465974e-05, "loss": 5.6208, "step": 172000 }, { "epoch": 0.62, "eval_loss": 5.64992618560791, "eval_runtime": 4.409, "eval_samples_per_second": 227.033, "eval_steps_per_second": 14.289, "step": 172000 }, { "epoch": 0.62, "learning_rate": 3.95975401410184e-05, "loss": 5.6218, "step": 172500 }, { "epoch": 0.63, "learning_rate": 3.9567437393524356e-05, "loss": 5.6261, "step": 173000 }, { "epoch": 0.63, "eval_loss": 5.64516544342041, "eval_runtime": 4.4391, "eval_samples_per_second": 225.497, "eval_steps_per_second": 14.192, "step": 173000 }, { "epoch": 0.63, "learning_rate": 3.9537274319883016e-05, "loss": 5.6255, "step": 173500 }, { "epoch": 0.63, "learning_rate": 3.950711124624168e-05, "loss": 5.6247, "step": 174000 }, { "epoch": 0.63, "eval_loss": 5.677004814147949, "eval_runtime": 4.3652, "eval_samples_per_second": 229.316, "eval_steps_per_second": 14.432, "step": 174000 }, { "epoch": 0.63, "learning_rate": 3.947700849874763e-05, "loss": 5.6099, "step": 174500 }, { "epoch": 0.63, "learning_rate": 3.94468454251063e-05, "loss": 5.6204, "step": 175000 }, { "epoch": 0.63, "eval_loss": 5.616661548614502, "eval_runtime": 4.3551, "eval_samples_per_second": 229.845, "eval_steps_per_second": 14.466, "step": 175000 }, { "epoch": 0.64, "learning_rate": 3.941668235146496e-05, "loss": 5.6196, "step": 175500 }, { "epoch": 0.64, "learning_rate": 3.938651927782363e-05, "loss": 5.6166, "step": 176000 }, { "epoch": 0.64, "eval_loss": 5.683280944824219, "eval_runtime": 4.3154, "eval_samples_per_second": 231.959, "eval_steps_per_second": 14.599, "step": 176000 }, { "epoch": 0.64, "learning_rate": 3.9356356204182294e-05, "loss": 5.6303, "step": 176500 }, { "epoch": 0.64, "learning_rate": 3.932625345668824e-05, "loss": 5.6145, "step": 177000 }, { "epoch": 0.64, "eval_loss": 5.708868980407715, "eval_runtime": 4.3953, "eval_samples_per_second": 227.744, "eval_steps_per_second": 14.334, "step": 177000 }, { "epoch": 0.64, "learning_rate": 3.929609038304691e-05, "loss": 5.6208, "step": 177500 }, { "epoch": 0.64, "learning_rate": 3.926592730940557e-05, "loss": 5.6155, "step": 178000 }, { "epoch": 0.64, "eval_loss": 5.643332481384277, "eval_runtime": 4.3751, "eval_samples_per_second": 228.794, "eval_steps_per_second": 14.4, "step": 178000 }, { "epoch": 0.65, "learning_rate": 3.923576423576424e-05, "loss": 5.6361, "step": 178500 }, { "epoch": 0.65, "learning_rate": 3.9205601162122905e-05, "loss": 5.6162, "step": 179000 }, { "epoch": 0.65, "eval_loss": 5.648618221282959, "eval_runtime": 4.3904, "eval_samples_per_second": 227.999, "eval_steps_per_second": 14.35, "step": 179000 }, { "epoch": 0.65, "learning_rate": 3.917543808848157e-05, "loss": 5.6344, "step": 179500 }, { "epoch": 0.65, "learning_rate": 3.914527501484023e-05, "loss": 5.6144, "step": 180000 }, { "epoch": 0.65, "eval_loss": 5.633224964141846, "eval_runtime": 4.3929, "eval_samples_per_second": 227.865, "eval_steps_per_second": 14.341, "step": 180000 }, { "epoch": 0.65, "learning_rate": 3.911517226734618e-05, "loss": 5.6285, "step": 180500 }, { "epoch": 0.66, "learning_rate": 3.908500919370485e-05, "loss": 5.6198, "step": 181000 }, { "epoch": 0.66, "eval_loss": 5.645481109619141, "eval_runtime": 4.3967, "eval_samples_per_second": 227.672, "eval_steps_per_second": 14.329, "step": 181000 }, { "epoch": 0.66, "learning_rate": 3.9054846120063515e-05, "loss": 5.6095, "step": 181500 }, { "epoch": 0.66, "learning_rate": 3.9024683046422175e-05, "loss": 5.6231, "step": 182000 }, { "epoch": 0.66, "eval_loss": 5.6638264656066895, "eval_runtime": 4.4238, "eval_samples_per_second": 226.274, "eval_steps_per_second": 14.241, "step": 182000 }, { "epoch": 0.66, "learning_rate": 3.899451997278085e-05, "loss": 5.6158, "step": 182500 }, { "epoch": 0.66, "learning_rate": 3.896441722528679e-05, "loss": 5.61, "step": 183000 }, { "epoch": 0.66, "eval_loss": 5.614964962005615, "eval_runtime": 4.3378, "eval_samples_per_second": 230.76, "eval_steps_per_second": 14.523, "step": 183000 }, { "epoch": 0.66, "learning_rate": 3.893425415164545e-05, "loss": 5.5975, "step": 183500 }, { "epoch": 0.67, "learning_rate": 3.8904091078004126e-05, "loss": 5.614, "step": 184000 }, { "epoch": 0.67, "eval_loss": 5.670050621032715, "eval_runtime": 4.3122, "eval_samples_per_second": 232.134, "eval_steps_per_second": 14.61, "step": 184000 }, { "epoch": 0.67, "learning_rate": 3.8873928004362786e-05, "loss": 5.6029, "step": 184500 }, { "epoch": 0.67, "learning_rate": 3.8843825256868736e-05, "loss": 5.6158, "step": 185000 }, { "epoch": 0.67, "eval_loss": 5.641917705535889, "eval_runtime": 4.3145, "eval_samples_per_second": 232.008, "eval_steps_per_second": 14.602, "step": 185000 }, { "epoch": 0.67, "learning_rate": 3.88136621832274e-05, "loss": 5.6351, "step": 185500 }, { "epoch": 0.67, "learning_rate": 3.878349910958607e-05, "loss": 5.6163, "step": 186000 }, { "epoch": 0.67, "eval_loss": 5.651737213134766, "eval_runtime": 4.3373, "eval_samples_per_second": 230.79, "eval_steps_per_second": 14.525, "step": 186000 }, { "epoch": 0.68, "learning_rate": 3.875333603594473e-05, "loss": 5.5979, "step": 186500 }, { "epoch": 0.68, "learning_rate": 3.8723172962303403e-05, "loss": 5.6151, "step": 187000 }, { "epoch": 0.68, "eval_loss": 5.646933078765869, "eval_runtime": 4.2964, "eval_samples_per_second": 232.987, "eval_steps_per_second": 14.664, "step": 187000 }, { "epoch": 0.68, "learning_rate": 3.8693070214809346e-05, "loss": 5.6236, "step": 187500 }, { "epoch": 0.68, "learning_rate": 3.866290714116801e-05, "loss": 5.6251, "step": 188000 }, { "epoch": 0.68, "eval_loss": 5.671980381011963, "eval_runtime": 4.4739, "eval_samples_per_second": 223.743, "eval_steps_per_second": 14.082, "step": 188000 }, { "epoch": 0.68, "learning_rate": 3.863274406752668e-05, "loss": 5.6018, "step": 188500 }, { "epoch": 0.68, "learning_rate": 3.860258099388535e-05, "loss": 5.6272, "step": 189000 }, { "epoch": 0.68, "eval_loss": 5.603824138641357, "eval_runtime": 4.3496, "eval_samples_per_second": 230.135, "eval_steps_per_second": 14.484, "step": 189000 }, { "epoch": 0.69, "learning_rate": 3.857247824639129e-05, "loss": 5.6282, "step": 189500 }, { "epoch": 0.69, "learning_rate": 3.854231517274996e-05, "loss": 5.6291, "step": 190000 }, { "epoch": 0.69, "eval_loss": 5.6158552169799805, "eval_runtime": 6.4339, "eval_samples_per_second": 155.582, "eval_steps_per_second": 9.792, "step": 190000 }, { "epoch": 0.69, "learning_rate": 3.8512152099108624e-05, "loss": 5.6017, "step": 190500 }, { "epoch": 0.69, "learning_rate": 3.8481989025467284e-05, "loss": 5.6114, "step": 191000 }, { "epoch": 0.69, "eval_loss": 5.6429595947265625, "eval_runtime": 4.3858, "eval_samples_per_second": 228.235, "eval_steps_per_second": 14.364, "step": 191000 }, { "epoch": 0.69, "learning_rate": 3.845182595182595e-05, "loss": 5.6194, "step": 191500 }, { "epoch": 0.69, "learning_rate": 3.842166287818462e-05, "loss": 5.6128, "step": 192000 }, { "epoch": 0.69, "eval_loss": 5.634527683258057, "eval_runtime": 4.3652, "eval_samples_per_second": 229.313, "eval_steps_per_second": 14.432, "step": 192000 }, { "epoch": 0.7, "learning_rate": 3.8391499804543285e-05, "loss": 5.62, "step": 192500 }, { "epoch": 0.7, "learning_rate": 3.836139705704923e-05, "loss": 5.6213, "step": 193000 }, { "epoch": 0.7, "eval_loss": 5.641001224517822, "eval_runtime": 4.355, "eval_samples_per_second": 229.851, "eval_steps_per_second": 14.466, "step": 193000 }, { "epoch": 0.7, "learning_rate": 3.83312339834079e-05, "loss": 5.6182, "step": 193500 }, { "epoch": 0.7, "learning_rate": 3.830107090976656e-05, "loss": 5.6104, "step": 194000 }, { "epoch": 0.7, "eval_loss": 5.6409525871276855, "eval_runtime": 4.3644, "eval_samples_per_second": 229.353, "eval_steps_per_second": 14.435, "step": 194000 }, { "epoch": 0.7, "learning_rate": 3.827090783612523e-05, "loss": 5.6061, "step": 194500 }, { "epoch": 0.71, "learning_rate": 3.8240744762483896e-05, "loss": 5.6081, "step": 195000 }, { "epoch": 0.71, "eval_loss": 5.624050617218018, "eval_runtime": 4.463, "eval_samples_per_second": 224.291, "eval_steps_per_second": 14.116, "step": 195000 }, { "epoch": 0.71, "learning_rate": 3.821058168884256e-05, "loss": 5.6234, "step": 195500 }, { "epoch": 0.71, "learning_rate": 3.8180478941348505e-05, "loss": 5.6288, "step": 196000 }, { "epoch": 0.71, "eval_loss": 5.689996242523193, "eval_runtime": 6.3147, "eval_samples_per_second": 158.518, "eval_steps_per_second": 9.977, "step": 196000 }, { "epoch": 0.71, "learning_rate": 3.815031586770718e-05, "loss": 5.6253, "step": 196500 }, { "epoch": 0.71, "learning_rate": 3.812015279406584e-05, "loss": 5.607, "step": 197000 }, { "epoch": 0.71, "eval_loss": 5.656425952911377, "eval_runtime": 4.3871, "eval_samples_per_second": 228.167, "eval_steps_per_second": 14.36, "step": 197000 }, { "epoch": 0.71, "learning_rate": 3.8089989720424506e-05, "loss": 5.6259, "step": 197500 }, { "epoch": 0.72, "learning_rate": 3.805982664678317e-05, "loss": 5.605, "step": 198000 }, { "epoch": 0.72, "eval_loss": 5.651924133300781, "eval_runtime": 4.362, "eval_samples_per_second": 229.484, "eval_steps_per_second": 14.443, "step": 198000 }, { "epoch": 0.72, "learning_rate": 3.8029723899289116e-05, "loss": 5.6153, "step": 198500 }, { "epoch": 0.72, "learning_rate": 3.799956082564778e-05, "loss": 5.6109, "step": 199000 }, { "epoch": 0.72, "eval_loss": 5.639188766479492, "eval_runtime": 4.301, "eval_samples_per_second": 232.739, "eval_steps_per_second": 14.648, "step": 199000 }, { "epoch": 0.72, "learning_rate": 3.796939775200645e-05, "loss": 5.6064, "step": 199500 }, { "epoch": 0.72, "learning_rate": 3.793923467836512e-05, "loss": 5.619, "step": 200000 }, { "epoch": 0.72, "eval_loss": 5.662364959716797, "eval_runtime": 4.3023, "eval_samples_per_second": 232.664, "eval_steps_per_second": 14.643, "step": 200000 }, { "epoch": 0.73, "learning_rate": 3.790913193087106e-05, "loss": 5.6079, "step": 200500 }, { "epoch": 0.73, "learning_rate": 3.7878968857229727e-05, "loss": 5.6019, "step": 201000 }, { "epoch": 0.73, "eval_loss": 5.653491973876953, "eval_runtime": 4.2771, "eval_samples_per_second": 234.036, "eval_steps_per_second": 14.73, "step": 201000 }, { "epoch": 0.73, "learning_rate": 3.7848805783588394e-05, "loss": 5.6022, "step": 201500 }, { "epoch": 0.73, "learning_rate": 3.781864270994706e-05, "loss": 5.6133, "step": 202000 }, { "epoch": 0.73, "eval_loss": 5.676525115966797, "eval_runtime": 4.3425, "eval_samples_per_second": 230.512, "eval_steps_per_second": 14.508, "step": 202000 }, { "epoch": 0.73, "learning_rate": 3.778847963630572e-05, "loss": 5.6117, "step": 202500 }, { "epoch": 0.73, "learning_rate": 3.775837688881168e-05, "loss": 5.5927, "step": 203000 }, { "epoch": 0.73, "eval_loss": 5.636429786682129, "eval_runtime": 4.3367, "eval_samples_per_second": 230.822, "eval_steps_per_second": 14.527, "step": 203000 }, { "epoch": 0.74, "learning_rate": 3.772821381517034e-05, "loss": 5.6198, "step": 203500 }, { "epoch": 0.74, "learning_rate": 3.7698050741529004e-05, "loss": 5.6119, "step": 204000 }, { "epoch": 0.74, "eval_loss": 5.666501045227051, "eval_runtime": 4.3287, "eval_samples_per_second": 231.246, "eval_steps_per_second": 14.554, "step": 204000 }, { "epoch": 0.74, "learning_rate": 3.766788766788767e-05, "loss": 5.6199, "step": 204500 }, { "epoch": 0.74, "learning_rate": 3.763778492039362e-05, "loss": 5.602, "step": 205000 }, { "epoch": 0.74, "eval_loss": 5.631441116333008, "eval_runtime": 4.3578, "eval_samples_per_second": 229.703, "eval_steps_per_second": 14.457, "step": 205000 }, { "epoch": 0.74, "learning_rate": 3.760762184675228e-05, "loss": 5.5997, "step": 205500 }, { "epoch": 0.75, "learning_rate": 3.757745877311095e-05, "loss": 5.605, "step": 206000 }, { "epoch": 0.75, "eval_loss": 5.630640983581543, "eval_runtime": 4.2859, "eval_samples_per_second": 233.558, "eval_steps_per_second": 14.699, "step": 206000 }, { "epoch": 0.75, "learning_rate": 3.7547295699469615e-05, "loss": 5.6155, "step": 206500 }, { "epoch": 0.75, "learning_rate": 3.7517132625828275e-05, "loss": 5.612, "step": 207000 }, { "epoch": 0.75, "eval_loss": 5.597905158996582, "eval_runtime": 4.3535, "eval_samples_per_second": 229.93, "eval_steps_per_second": 14.471, "step": 207000 }, { "epoch": 0.75, "learning_rate": 3.748696955218695e-05, "loss": 5.6189, "step": 207500 }, { "epoch": 0.75, "learning_rate": 3.745680647854561e-05, "loss": 5.6184, "step": 208000 }, { "epoch": 0.75, "eval_loss": 5.636325359344482, "eval_runtime": 4.3413, "eval_samples_per_second": 230.575, "eval_steps_per_second": 14.512, "step": 208000 }, { "epoch": 0.75, "learning_rate": 3.742670373105156e-05, "loss": 5.6078, "step": 208500 }, { "epoch": 0.76, "learning_rate": 3.7396540657410225e-05, "loss": 5.6131, "step": 209000 }, { "epoch": 0.76, "eval_loss": 5.635697364807129, "eval_runtime": 4.3624, "eval_samples_per_second": 229.461, "eval_steps_per_second": 14.442, "step": 209000 }, { "epoch": 0.76, "learning_rate": 3.736637758376889e-05, "loss": 5.6102, "step": 209500 }, { "epoch": 0.76, "learning_rate": 3.733621451012755e-05, "loss": 5.6063, "step": 210000 }, { "epoch": 0.76, "eval_loss": 5.6277570724487305, "eval_runtime": 4.4064, "eval_samples_per_second": 227.169, "eval_steps_per_second": 14.297, "step": 210000 } ], "max_steps": 828828, "num_train_epochs": 3, "total_flos": 2.574739225286738e+17, "trial_name": null, "trial_params": null }