{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05815644082582146, "eval_steps": 25, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007754192110109528, "grad_norm": 13.743983268737793, "learning_rate": 3.3333333333333335e-05, "loss": 10.4552, "step": 1 }, { "epoch": 0.0007754192110109528, "eval_loss": 10.45703125, "eval_runtime": 17.3532, "eval_samples_per_second": 62.582, "eval_steps_per_second": 31.291, "step": 1 }, { "epoch": 0.0015508384220219056, "grad_norm": 15.994314193725586, "learning_rate": 6.666666666666667e-05, "loss": 10.4156, "step": 2 }, { "epoch": 0.0023262576330328583, "grad_norm": 17.251068115234375, "learning_rate": 0.0001, "loss": 10.3381, "step": 3 }, { "epoch": 0.0031016768440438112, "grad_norm": 13.243555068969727, "learning_rate": 9.99524110790929e-05, "loss": 10.5021, "step": 4 }, { "epoch": 0.003877096055054764, "grad_norm": 12.53420352935791, "learning_rate": 9.980973490458728e-05, "loss": 10.0685, "step": 5 }, { "epoch": 0.004652515266065717, "grad_norm": 13.710821151733398, "learning_rate": 9.957224306869053e-05, "loss": 9.9248, "step": 6 }, { "epoch": 0.0054279344770766695, "grad_norm": 14.05363941192627, "learning_rate": 9.924038765061042e-05, "loss": 9.9161, "step": 7 }, { "epoch": 0.0062033536880876225, "grad_norm": 13.404027938842773, "learning_rate": 9.881480035599667e-05, "loss": 9.5758, "step": 8 }, { "epoch": 0.006978772899098575, "grad_norm": 15.494867324829102, "learning_rate": 9.829629131445342e-05, "loss": 9.7933, "step": 9 }, { "epoch": 0.007754192110109528, "grad_norm": 21.381547927856445, "learning_rate": 9.768584753741134e-05, "loss": 9.1108, "step": 10 }, { "epoch": 0.00852961132112048, "grad_norm": 17.526710510253906, "learning_rate": 9.698463103929542e-05, "loss": 9.1686, "step": 11 }, { "epoch": 0.009305030532131433, "grad_norm": 16.56471824645996, "learning_rate": 9.619397662556435e-05, "loss": 9.0037, "step": 12 }, { "epoch": 0.010080449743142386, "grad_norm": 17.765729904174805, "learning_rate": 9.53153893518325e-05, "loss": 8.9391, "step": 13 }, { "epoch": 0.010855868954153339, "grad_norm": 18.965198516845703, "learning_rate": 9.435054165891109e-05, "loss": 8.794, "step": 14 }, { "epoch": 0.011631288165164292, "grad_norm": 20.40072250366211, "learning_rate": 9.330127018922194e-05, "loss": 8.0026, "step": 15 }, { "epoch": 0.012406707376175245, "grad_norm": 15.78911304473877, "learning_rate": 9.21695722906443e-05, "loss": 8.396, "step": 16 }, { "epoch": 0.013182126587186198, "grad_norm": 21.308853149414062, "learning_rate": 9.09576022144496e-05, "loss": 8.1367, "step": 17 }, { "epoch": 0.01395754579819715, "grad_norm": 17.67489242553711, "learning_rate": 8.966766701456177e-05, "loss": 7.9123, "step": 18 }, { "epoch": 0.014732965009208104, "grad_norm": 17.39400291442871, "learning_rate": 8.83022221559489e-05, "loss": 7.8025, "step": 19 }, { "epoch": 0.015508384220219057, "grad_norm": 18.812978744506836, "learning_rate": 8.68638668405062e-05, "loss": 7.7002, "step": 20 }, { "epoch": 0.01628380343123001, "grad_norm": 20.998043060302734, "learning_rate": 8.535533905932738e-05, "loss": 7.5169, "step": 21 }, { "epoch": 0.01705922264224096, "grad_norm": 17.887672424316406, "learning_rate": 8.377951038078302e-05, "loss": 7.2246, "step": 22 }, { "epoch": 0.017834641853251915, "grad_norm": 18.00612449645996, "learning_rate": 8.213938048432697e-05, "loss": 6.679, "step": 23 }, { "epoch": 0.018610061064262867, "grad_norm": 16.111370086669922, "learning_rate": 8.043807145043604e-05, "loss": 6.2643, "step": 24 }, { "epoch": 0.01938548027527382, "grad_norm": 22.258943557739258, "learning_rate": 7.86788218175523e-05, "loss": 6.6102, "step": 25 }, { "epoch": 0.01938548027527382, "eval_loss": 6.311874866485596, "eval_runtime": 17.3852, "eval_samples_per_second": 62.467, "eval_steps_per_second": 31.233, "step": 25 }, { "epoch": 0.020160899486284772, "grad_norm": 15.573179244995117, "learning_rate": 7.68649804173412e-05, "loss": 6.3441, "step": 26 }, { "epoch": 0.020936318697295727, "grad_norm": 15.074617385864258, "learning_rate": 7.500000000000001e-05, "loss": 6.2842, "step": 27 }, { "epoch": 0.021711737908306678, "grad_norm": 15.864977836608887, "learning_rate": 7.308743066175172e-05, "loss": 5.7357, "step": 28 }, { "epoch": 0.02248715711931763, "grad_norm": 19.382728576660156, "learning_rate": 7.113091308703498e-05, "loss": 6.0517, "step": 29 }, { "epoch": 0.023262576330328584, "grad_norm": 18.489917755126953, "learning_rate": 6.91341716182545e-05, "loss": 5.7561, "step": 30 }, { "epoch": 0.024037995541339535, "grad_norm": 11.977217674255371, "learning_rate": 6.710100716628344e-05, "loss": 5.3761, "step": 31 }, { "epoch": 0.02481341475235049, "grad_norm": 12.359403610229492, "learning_rate": 6.503528997521366e-05, "loss": 5.6858, "step": 32 }, { "epoch": 0.02558883396336144, "grad_norm": 17.48925018310547, "learning_rate": 6.294095225512603e-05, "loss": 5.4112, "step": 33 }, { "epoch": 0.026364253174372396, "grad_norm": 12.810553550720215, "learning_rate": 6.0821980696905146e-05, "loss": 5.1038, "step": 34 }, { "epoch": 0.027139672385383347, "grad_norm": 11.41131591796875, "learning_rate": 5.868240888334653e-05, "loss": 5.1019, "step": 35 }, { "epoch": 0.0279150915963943, "grad_norm": 11.404109001159668, "learning_rate": 5.6526309611002594e-05, "loss": 5.0096, "step": 36 }, { "epoch": 0.028690510807405253, "grad_norm": 10.934562683105469, "learning_rate": 5.435778713738292e-05, "loss": 4.8367, "step": 37 }, { "epoch": 0.029465930018416207, "grad_norm": 11.160900115966797, "learning_rate": 5.218096936826681e-05, "loss": 5.009, "step": 38 }, { "epoch": 0.03024134922942716, "grad_norm": 9.400223731994629, "learning_rate": 5e-05, "loss": 5.121, "step": 39 }, { "epoch": 0.031016768440438113, "grad_norm": 9.578547477722168, "learning_rate": 4.781903063173321e-05, "loss": 4.9154, "step": 40 }, { "epoch": 0.03179218765144907, "grad_norm": 8.08479118347168, "learning_rate": 4.564221286261709e-05, "loss": 4.4877, "step": 41 }, { "epoch": 0.03256760686246002, "grad_norm": 9.323592185974121, "learning_rate": 4.347369038899744e-05, "loss": 4.7919, "step": 42 }, { "epoch": 0.03334302607347097, "grad_norm": 12.592964172363281, "learning_rate": 4.131759111665349e-05, "loss": 4.9646, "step": 43 }, { "epoch": 0.03411844528448192, "grad_norm": 8.313202857971191, "learning_rate": 3.917801930309486e-05, "loss": 4.5304, "step": 44 }, { "epoch": 0.03489386449549287, "grad_norm": 9.623037338256836, "learning_rate": 3.705904774487396e-05, "loss": 4.5277, "step": 45 }, { "epoch": 0.03566928370650383, "grad_norm": 6.551573753356934, "learning_rate": 3.4964710024786354e-05, "loss": 4.7194, "step": 46 }, { "epoch": 0.03644470291751478, "grad_norm": 9.742829322814941, "learning_rate": 3.289899283371657e-05, "loss": 4.4445, "step": 47 }, { "epoch": 0.03722012212852573, "grad_norm": 6.696950435638428, "learning_rate": 3.086582838174551e-05, "loss": 4.3009, "step": 48 }, { "epoch": 0.037995541339536684, "grad_norm": 6.955257415771484, "learning_rate": 2.886908691296504e-05, "loss": 4.2683, "step": 49 }, { "epoch": 0.03877096055054764, "grad_norm": 11.268085479736328, "learning_rate": 2.6912569338248315e-05, "loss": 4.3482, "step": 50 }, { "epoch": 0.03877096055054764, "eval_loss": 4.515497207641602, "eval_runtime": 17.341, "eval_samples_per_second": 62.626, "eval_steps_per_second": 31.313, "step": 50 }, { "epoch": 0.039546379761558594, "grad_norm": 7.341830253601074, "learning_rate": 2.500000000000001e-05, "loss": 4.3773, "step": 51 }, { "epoch": 0.040321798972569545, "grad_norm": 8.497244834899902, "learning_rate": 2.3135019582658802e-05, "loss": 4.6826, "step": 52 }, { "epoch": 0.041097218183580496, "grad_norm": 7.568642616271973, "learning_rate": 2.132117818244771e-05, "loss": 4.4897, "step": 53 }, { "epoch": 0.041872637394591454, "grad_norm": 7.663918495178223, "learning_rate": 1.9561928549563968e-05, "loss": 5.1256, "step": 54 }, { "epoch": 0.042648056605602405, "grad_norm": 6.138592720031738, "learning_rate": 1.7860619515673033e-05, "loss": 4.3068, "step": 55 }, { "epoch": 0.043423475816613356, "grad_norm": 6.359229564666748, "learning_rate": 1.622048961921699e-05, "loss": 4.6431, "step": 56 }, { "epoch": 0.04419889502762431, "grad_norm": 6.404804706573486, "learning_rate": 1.4644660940672627e-05, "loss": 4.4604, "step": 57 }, { "epoch": 0.04497431423863526, "grad_norm": 5.194124221801758, "learning_rate": 1.3136133159493802e-05, "loss": 4.4148, "step": 58 }, { "epoch": 0.04574973344964622, "grad_norm": 6.9789557456970215, "learning_rate": 1.1697777844051105e-05, "loss": 4.5563, "step": 59 }, { "epoch": 0.04652515266065717, "grad_norm": 6.215468406677246, "learning_rate": 1.0332332985438248e-05, "loss": 4.6267, "step": 60 }, { "epoch": 0.04730057187166812, "grad_norm": 5.669643878936768, "learning_rate": 9.042397785550405e-06, "loss": 4.3564, "step": 61 }, { "epoch": 0.04807599108267907, "grad_norm": 6.695188045501709, "learning_rate": 7.830427709355725e-06, "loss": 4.6728, "step": 62 }, { "epoch": 0.04885141029369003, "grad_norm": 6.3594069480896, "learning_rate": 6.698729810778065e-06, "loss": 4.4725, "step": 63 }, { "epoch": 0.04962682950470098, "grad_norm": 6.124025344848633, "learning_rate": 5.649458341088915e-06, "loss": 4.7726, "step": 64 }, { "epoch": 0.05040224871571193, "grad_norm": 6.306771755218506, "learning_rate": 4.684610648167503e-06, "loss": 4.7565, "step": 65 }, { "epoch": 0.05117766792672288, "grad_norm": 6.299499034881592, "learning_rate": 3.8060233744356633e-06, "loss": 4.1523, "step": 66 }, { "epoch": 0.05195308713773384, "grad_norm": 5.291078567504883, "learning_rate": 3.0153689607045845e-06, "loss": 4.4084, "step": 67 }, { "epoch": 0.05272850634874479, "grad_norm": 7.21616268157959, "learning_rate": 2.314152462588659e-06, "loss": 4.2842, "step": 68 }, { "epoch": 0.05350392555975574, "grad_norm": 5.060047626495361, "learning_rate": 1.70370868554659e-06, "loss": 4.4137, "step": 69 }, { "epoch": 0.054279344770766694, "grad_norm": 5.063629627227783, "learning_rate": 1.1851996440033319e-06, "loss": 4.4392, "step": 70 }, { "epoch": 0.05505476398177765, "grad_norm": 4.903884410858154, "learning_rate": 7.596123493895991e-07, "loss": 4.3334, "step": 71 }, { "epoch": 0.0558301831927886, "grad_norm": 6.626821517944336, "learning_rate": 4.277569313094809e-07, "loss": 4.8866, "step": 72 }, { "epoch": 0.056605602403799554, "grad_norm": 5.395899772644043, "learning_rate": 1.9026509541272275e-07, "loss": 4.1787, "step": 73 }, { "epoch": 0.057381021614810505, "grad_norm": 5.96885347366333, "learning_rate": 4.7588920907110094e-08, "loss": 4.9024, "step": 74 }, { "epoch": 0.05815644082582146, "grad_norm": 6.069552898406982, "learning_rate": 0.0, "loss": 4.4114, "step": 75 }, { "epoch": 0.05815644082582146, "eval_loss": 4.3782525062561035, "eval_runtime": 17.3208, "eval_samples_per_second": 62.699, "eval_steps_per_second": 31.35, "step": 75 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 43502587084800.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }