{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.999931153184166, "global_step": 43572, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14, "learning_rate": 0.0009885247406591388, "loss": 0.4139, "step": 500 }, { "epoch": 0.28, "learning_rate": 0.0009770494813182778, "loss": 0.315, "step": 1000 }, { "epoch": 0.41, "learning_rate": 0.0009655742219774168, "loss": 0.2807, "step": 1500 }, { "epoch": 0.55, "learning_rate": 0.0009540989626365555, "loss": 0.2624, "step": 2000 }, { "epoch": 0.69, "learning_rate": 0.0009426237032956944, "loss": 0.257, "step": 2500 }, { "epoch": 0.83, "learning_rate": 0.0009311484439548334, "loss": 0.2378, "step": 3000 }, { "epoch": 0.96, "learning_rate": 0.0009196731846139723, "loss": 0.2245, "step": 3500 }, { "epoch": 1.1, "learning_rate": 0.0009081979252731112, "loss": 0.1983, "step": 4000 }, { "epoch": 1.24, "learning_rate": 0.0008967226659322501, "loss": 0.1987, "step": 4500 }, { "epoch": 1.38, "learning_rate": 0.0008852474065913891, "loss": 0.1904, "step": 5000 }, { "epoch": 1.51, "learning_rate": 0.0008737721472505279, "loss": 0.1829, "step": 5500 }, { "epoch": 1.65, "learning_rate": 0.0008622968879096667, "loss": 0.1773, "step": 6000 }, { "epoch": 1.79, "learning_rate": 0.0008508216285688056, "loss": 0.179, "step": 6500 }, { "epoch": 1.93, "learning_rate": 0.0008393463692279446, "loss": 0.177, "step": 7000 }, { "epoch": 2.07, "learning_rate": 0.0008278711098870835, "loss": 0.1577, "step": 7500 }, { "epoch": 2.2, "learning_rate": 0.0008163958505462224, "loss": 0.1529, "step": 8000 }, { "epoch": 2.34, "learning_rate": 0.0008049205912053612, "loss": 0.1468, "step": 8500 }, { "epoch": 2.48, "learning_rate": 0.0007934453318645002, "loss": 0.1492, "step": 9000 }, { "epoch": 2.62, "learning_rate": 0.000781970072523639, "loss": 0.1426, "step": 9500 }, { "epoch": 2.75, "learning_rate": 0.0007704948131827779, "loss": 0.1368, "step": 10000 }, { "epoch": 2.89, "learning_rate": 0.0007590195538419168, "loss": 0.1443, "step": 10500 }, { "epoch": 3.03, "learning_rate": 0.0007475442945010558, "loss": 0.1364, "step": 11000 }, { "epoch": 3.17, "learning_rate": 0.0007360690351601946, "loss": 0.1221, "step": 11500 }, { "epoch": 3.3, "learning_rate": 0.0007245937758193335, "loss": 0.1188, "step": 12000 }, { "epoch": 3.44, "learning_rate": 0.0007131185164784725, "loss": 0.1218, "step": 12500 }, { "epoch": 3.58, "learning_rate": 0.0007016432571376114, "loss": 0.1175, "step": 13000 }, { "epoch": 3.72, "learning_rate": 0.0006901679977967502, "loss": 0.117, "step": 13500 }, { "epoch": 3.86, "learning_rate": 0.000678692738455889, "loss": 0.1154, "step": 14000 }, { "epoch": 3.99, "learning_rate": 0.000667217479115028, "loss": 0.1124, "step": 14500 }, { "epoch": 4.13, "learning_rate": 0.0006557422197741669, "loss": 0.0995, "step": 15000 }, { "epoch": 4.27, "learning_rate": 0.0006442669604333058, "loss": 0.0948, "step": 15500 }, { "epoch": 4.41, "learning_rate": 0.0006327917010924447, "loss": 0.0972, "step": 16000 }, { "epoch": 4.54, "learning_rate": 0.0006213164417515837, "loss": 0.0967, "step": 16500 }, { "epoch": 4.68, "learning_rate": 0.0006098411824107226, "loss": 0.1003, "step": 17000 }, { "epoch": 4.82, "learning_rate": 0.0005983659230698613, "loss": 0.0961, "step": 17500 }, { "epoch": 4.96, "learning_rate": 0.0005868906637290002, "loss": 0.0967, "step": 18000 }, { "epoch": 5.1, "learning_rate": 0.0005754154043881392, "loss": 0.0825, "step": 18500 }, { "epoch": 5.23, "learning_rate": 0.0005639401450472781, "loss": 0.081, "step": 19000 }, { "epoch": 5.37, "learning_rate": 0.000552464885706417, "loss": 0.0827, "step": 19500 }, { "epoch": 5.51, "learning_rate": 0.0005409896263655559, "loss": 0.08, "step": 20000 }, { "epoch": 5.65, "learning_rate": 0.0005295143670246948, "loss": 0.0842, "step": 20500 }, { "epoch": 5.78, "learning_rate": 0.0005180391076838336, "loss": 0.0798, "step": 21000 }, { "epoch": 5.92, "learning_rate": 0.0005065638483429725, "loss": 0.083, "step": 21500 }, { "epoch": 6.06, "learning_rate": 0.0004950885890021115, "loss": 0.0749, "step": 22000 }, { "epoch": 6.2, "learning_rate": 0.00048361332966125037, "loss": 0.0667, "step": 22500 }, { "epoch": 6.33, "learning_rate": 0.00047213807032038925, "loss": 0.0677, "step": 23000 }, { "epoch": 6.47, "learning_rate": 0.0004606628109795282, "loss": 0.0699, "step": 23500 }, { "epoch": 6.61, "learning_rate": 0.000449187551638667, "loss": 0.0694, "step": 24000 }, { "epoch": 6.75, "learning_rate": 0.00043771229229780595, "loss": 0.0674, "step": 24500 }, { "epoch": 6.89, "learning_rate": 0.00042623703295694484, "loss": 0.0652, "step": 25000 }, { "epoch": 7.02, "learning_rate": 0.00041476177361608377, "loss": 0.0668, "step": 25500 }, { "epoch": 7.16, "learning_rate": 0.0004032865142752226, "loss": 0.0573, "step": 26000 }, { "epoch": 7.3, "learning_rate": 0.00039181125493436154, "loss": 0.0564, "step": 26500 }, { "epoch": 7.44, "learning_rate": 0.0003803359955935004, "loss": 0.0573, "step": 27000 }, { "epoch": 7.57, "learning_rate": 0.0003688607362526393, "loss": 0.0572, "step": 27500 }, { "epoch": 7.71, "learning_rate": 0.0003573854769117782, "loss": 0.058, "step": 28000 }, { "epoch": 7.85, "learning_rate": 0.0003459102175709171, "loss": 0.0583, "step": 28500 }, { "epoch": 7.99, "learning_rate": 0.000334434958230056, "loss": 0.0581, "step": 29000 }, { "epoch": 8.12, "learning_rate": 0.0003229596988891949, "loss": 0.0483, "step": 29500 }, { "epoch": 8.26, "learning_rate": 0.00031148443954833377, "loss": 0.0474, "step": 30000 }, { "epoch": 8.4, "learning_rate": 0.0003000091802074727, "loss": 0.0497, "step": 30500 }, { "epoch": 8.54, "learning_rate": 0.0002885339208666116, "loss": 0.0487, "step": 31000 }, { "epoch": 8.68, "learning_rate": 0.00027705866152575047, "loss": 0.0469, "step": 31500 }, { "epoch": 8.81, "learning_rate": 0.0002655834021848894, "loss": 0.0492, "step": 32000 }, { "epoch": 8.95, "learning_rate": 0.0002541081428440283, "loss": 0.0494, "step": 32500 }, { "epoch": 9.09, "learning_rate": 0.00024263288350316717, "loss": 0.0426, "step": 33000 }, { "epoch": 9.23, "learning_rate": 0.00023115762416230608, "loss": 0.0406, "step": 33500 }, { "epoch": 9.36, "learning_rate": 0.00021968236482144496, "loss": 0.0405, "step": 34000 }, { "epoch": 9.5, "learning_rate": 0.00020820710548058387, "loss": 0.043, "step": 34500 }, { "epoch": 9.64, "learning_rate": 0.00019673184613972275, "loss": 0.0411, "step": 35000 }, { "epoch": 9.78, "learning_rate": 0.00018525658679886166, "loss": 0.0419, "step": 35500 }, { "epoch": 9.91, "learning_rate": 0.00017378132745800054, "loss": 0.0409, "step": 36000 }, { "epoch": 10.05, "learning_rate": 0.00016230606811713943, "loss": 0.0388, "step": 36500 }, { "epoch": 10.19, "learning_rate": 0.00015083080877627833, "loss": 0.0367, "step": 37000 }, { "epoch": 10.33, "learning_rate": 0.00013935554943541724, "loss": 0.0355, "step": 37500 }, { "epoch": 10.47, "learning_rate": 0.00012788029009455615, "loss": 0.0358, "step": 38000 }, { "epoch": 10.6, "learning_rate": 0.00011640503075369504, "loss": 0.0369, "step": 38500 }, { "epoch": 10.74, "learning_rate": 0.00010492977141283393, "loss": 0.0339, "step": 39000 }, { "epoch": 10.88, "learning_rate": 9.345451207197283e-05, "loss": 0.0359, "step": 39500 }, { "epoch": 11.02, "learning_rate": 8.197925273111174e-05, "loss": 0.0364, "step": 40000 }, { "epoch": 11.15, "learning_rate": 7.050399339025063e-05, "loss": 0.0325, "step": 40500 }, { "epoch": 11.29, "learning_rate": 5.902873404938952e-05, "loss": 0.0321, "step": 41000 }, { "epoch": 11.43, "learning_rate": 4.755347470852842e-05, "loss": 0.0328, "step": 41500 }, { "epoch": 11.57, "learning_rate": 3.607821536766731e-05, "loss": 0.032, "step": 42000 }, { "epoch": 11.7, "learning_rate": 2.4602956026806205e-05, "loss": 0.0315, "step": 42500 }, { "epoch": 11.84, "learning_rate": 1.3127696685945103e-05, "loss": 0.0318, "step": 43000 }, { "epoch": 11.98, "learning_rate": 1.6524373450839989e-06, "loss": 0.0314, "step": 43500 }, { "epoch": 12.0, "step": 43572, "total_flos": 2.93279124897792e+17, "train_loss": 0.09914802481164445, "train_runtime": 67574.8841, "train_samples_per_second": 20.634, "train_steps_per_second": 0.645 } ], "max_steps": 43572, "num_train_epochs": 12, "total_flos": 2.93279124897792e+17, "trial_name": null, "trial_params": null }