Jiazheng Li
init push
7d35748
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.993680884676145,
"eval_steps": 100,
"global_step": 632,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13,
"grad_norm": 0.37133172154426575,
"learning_rate": 4.9876553763060684e-05,
"loss": 1.3016,
"step": 20
},
{
"epoch": 0.25,
"grad_norm": 0.3215538263320923,
"learning_rate": 4.950743417011591e-05,
"loss": 0.9953,
"step": 40
},
{
"epoch": 0.38,
"grad_norm": 0.33872345089912415,
"learning_rate": 4.889628653514402e-05,
"loss": 0.925,
"step": 60
},
{
"epoch": 0.51,
"grad_norm": 0.3175918459892273,
"learning_rate": 4.804914636820517e-05,
"loss": 0.8788,
"step": 80
},
{
"epoch": 0.63,
"grad_norm": 0.3202904164791107,
"learning_rate": 4.6974379770560846e-05,
"loss": 0.8668,
"step": 100
},
{
"epoch": 0.63,
"eval_loss": 0.8571113348007202,
"eval_runtime": 868.2347,
"eval_samples_per_second": 3.192,
"eval_steps_per_second": 0.4,
"step": 100
},
{
"epoch": 0.76,
"grad_norm": 0.33093100786209106,
"learning_rate": 4.5682600813576435e-05,
"loss": 0.8488,
"step": 120
},
{
"epoch": 0.88,
"grad_norm": 0.3297623097896576,
"learning_rate": 4.41865667173477e-05,
"loss": 0.8462,
"step": 140
},
{
"epoch": 1.01,
"grad_norm": 0.3524036705493927,
"learning_rate": 4.2501051864235636e-05,
"loss": 0.827,
"step": 160
},
{
"epoch": 1.14,
"grad_norm": 0.3592537045478821,
"learning_rate": 4.0642701891514e-05,
"loss": 0.7885,
"step": 180
},
{
"epoch": 1.26,
"grad_norm": 0.3888987600803375,
"learning_rate": 3.862986930406669e-05,
"loss": 0.7837,
"step": 200
},
{
"epoch": 1.26,
"eval_loss": 0.8230095505714417,
"eval_runtime": 869.2538,
"eval_samples_per_second": 3.188,
"eval_steps_per_second": 0.399,
"step": 200
},
{
"epoch": 1.39,
"grad_norm": 0.380818247795105,
"learning_rate": 3.6482432230574446e-05,
"loss": 0.7938,
"step": 220
},
{
"epoch": 1.52,
"grad_norm": 0.3564074635505676,
"learning_rate": 3.4221598113100195e-05,
"loss": 0.7819,
"step": 240
},
{
"epoch": 1.64,
"grad_norm": 0.3780010938644409,
"learning_rate": 3.186969426877563e-05,
"loss": 0.7877,
"step": 260
},
{
"epoch": 1.77,
"grad_norm": 0.36975908279418945,
"learning_rate": 2.9449947391938766e-05,
"loss": 0.7918,
"step": 280
},
{
"epoch": 1.9,
"grad_norm": 0.39148128032684326,
"learning_rate": 2.6986254174292862e-05,
"loss": 0.7824,
"step": 300
},
{
"epoch": 1.9,
"eval_loss": 0.8058096766471863,
"eval_runtime": 867.7611,
"eval_samples_per_second": 3.193,
"eval_steps_per_second": 0.4,
"step": 300
},
{
"epoch": 2.02,
"grad_norm": 0.36495909094810486,
"learning_rate": 2.4502945308373246e-05,
"loss": 0.7741,
"step": 320
},
{
"epoch": 2.15,
"grad_norm": 0.40916556119918823,
"learning_rate": 2.2024545204952383e-05,
"loss": 0.7369,
"step": 340
},
{
"epoch": 2.27,
"grad_norm": 0.42436033487319946,
"learning_rate": 1.957552979734205e-05,
"loss": 0.7365,
"step": 360
},
{
"epoch": 2.4,
"grad_norm": 0.43983975052833557,
"learning_rate": 1.7180084824444325e-05,
"loss": 0.7463,
"step": 380
},
{
"epoch": 2.53,
"grad_norm": 0.44617146253585815,
"learning_rate": 1.4861866979675154e-05,
"loss": 0.7401,
"step": 400
},
{
"epoch": 2.53,
"eval_loss": 0.8059037923812866,
"eval_runtime": 873.5556,
"eval_samples_per_second": 3.172,
"eval_steps_per_second": 0.397,
"step": 400
},
{
"epoch": 2.65,
"grad_norm": 0.4369719624519348,
"learning_rate": 1.2643770284581929e-05,
"loss": 0.7332,
"step": 420
},
{
"epoch": 2.78,
"grad_norm": 0.4235495328903198,
"learning_rate": 1.0547699994378787e-05,
"loss": 0.7364,
"step": 440
},
{
"epoch": 2.91,
"grad_norm": 0.4584214389324188,
"learning_rate": 8.594356268240616e-06,
"loss": 0.7318,
"step": 460
},
{
"epoch": 3.03,
"grad_norm": 0.4329874813556671,
"learning_rate": 6.803029740762648e-06,
"loss": 0.7222,
"step": 480
},
{
"epoch": 3.16,
"grad_norm": 0.5470691919326782,
"learning_rate": 5.191411013460645e-06,
"loss": 0.7101,
"step": 500
},
{
"epoch": 3.16,
"eval_loss": 0.807178795337677,
"eval_runtime": 867.2257,
"eval_samples_per_second": 3.195,
"eval_steps_per_second": 0.4,
"step": 500
},
{
"epoch": 3.29,
"grad_norm": 0.4557144343852997,
"learning_rate": 3.775415947715899e-06,
"loss": 0.7135,
"step": 520
},
{
"epoch": 3.41,
"grad_norm": 0.46527382731437683,
"learning_rate": 2.5690284845196923e-06,
"loss": 0.7045,
"step": 540
},
{
"epoch": 3.54,
"grad_norm": 0.4702458679676056,
"learning_rate": 1.5841625432818057e-06,
"loss": 0.7057,
"step": 560
},
{
"epoch": 3.67,
"grad_norm": 0.4915095865726471,
"learning_rate": 8.305443635490711e-07,
"loss": 0.7044,
"step": 580
},
{
"epoch": 3.79,
"grad_norm": 0.5043957233428955,
"learning_rate": 3.1561645159166597e-07,
"loss": 0.7037,
"step": 600
},
{
"epoch": 3.79,
"eval_loss": 0.8061766028404236,
"eval_runtime": 867.9324,
"eval_samples_per_second": 3.193,
"eval_steps_per_second": 0.4,
"step": 600
},
{
"epoch": 3.92,
"grad_norm": 0.4672119915485382,
"learning_rate": 4.4464080451675494e-08,
"loss": 0.7027,
"step": 620
},
{
"epoch": 3.99,
"step": 632,
"total_flos": 1.137224723506643e+19,
"train_loss": 0.7920433780815028,
"train_runtime": 47185.3474,
"train_samples_per_second": 0.858,
"train_steps_per_second": 0.013
}
],
"logging_steps": 20,
"max_steps": 632,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"total_flos": 1.137224723506643e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}