juyongjiang's picture
upload model checkpoint
f6c2fb5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 380,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02631578947368421,
"grad_norm": 170.0,
"learning_rate": 5.263157894736842e-06,
"loss": 33.7533,
"step": 1
},
{
"epoch": 0.13157894736842105,
"grad_norm": 137.0,
"learning_rate": 2.6315789473684212e-05,
"loss": 34.5932,
"step": 5
},
{
"epoch": 0.2631578947368421,
"grad_norm": 67.5,
"learning_rate": 5.2631578947368424e-05,
"loss": 30.1164,
"step": 10
},
{
"epoch": 0.39473684210526316,
"grad_norm": 17.25,
"learning_rate": 7.894736842105263e-05,
"loss": 21.7002,
"step": 15
},
{
"epoch": 0.5263157894736842,
"grad_norm": 14.3125,
"learning_rate": 0.00010526315789473685,
"loss": 17.9593,
"step": 20
},
{
"epoch": 0.6578947368421053,
"grad_norm": 5.375,
"learning_rate": 0.00013157894736842108,
"loss": 15.8965,
"step": 25
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.203125,
"learning_rate": 0.00015789473684210527,
"loss": 14.3005,
"step": 30
},
{
"epoch": 0.9210526315789473,
"grad_norm": 3.90625,
"learning_rate": 0.00018421052631578948,
"loss": 13.7073,
"step": 35
},
{
"epoch": 1.0,
"eval_loss": 7.137722015380859,
"eval_runtime": 0.2537,
"eval_samples_per_second": 39.423,
"eval_steps_per_second": 3.942,
"step": 38
},
{
"epoch": 1.0526315789473684,
"grad_norm": 7.375,
"learning_rate": 0.00019998312416333227,
"loss": 12.9513,
"step": 40
},
{
"epoch": 1.1842105263157894,
"grad_norm": 12.0,
"learning_rate": 0.00019979333640833947,
"loss": 10.9396,
"step": 45
},
{
"epoch": 1.3157894736842106,
"grad_norm": 20.25,
"learning_rate": 0.00019939306773179497,
"loss": 8.1563,
"step": 50
},
{
"epoch": 1.4473684210526316,
"grad_norm": 13.8125,
"learning_rate": 0.00019878316236762196,
"loss": 4.3271,
"step": 55
},
{
"epoch": 1.5789473684210527,
"grad_norm": 5.84375,
"learning_rate": 0.0001979649067087574,
"loss": 2.3733,
"step": 60
},
{
"epoch": 1.7105263157894737,
"grad_norm": 4.96875,
"learning_rate": 0.00019694002659393305,
"loss": 2.0099,
"step": 65
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.171875,
"learning_rate": 0.00019571068366759143,
"loss": 1.7688,
"step": 70
},
{
"epoch": 1.973684210526316,
"grad_norm": 1.046875,
"learning_rate": 0.00019427947082061432,
"loss": 1.6173,
"step": 75
},
{
"epoch": 2.0,
"eval_loss": 2.9756951332092285,
"eval_runtime": 0.2386,
"eval_samples_per_second": 41.915,
"eval_steps_per_second": 4.191,
"step": 76
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.9453125,
"learning_rate": 0.00019264940672148018,
"loss": 1.5155,
"step": 80
},
{
"epoch": 2.236842105263158,
"grad_norm": 0.84765625,
"learning_rate": 0.00019082392944938466,
"loss": 1.4557,
"step": 85
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.7421875,
"learning_rate": 0.00018880688924275378,
"loss": 1.4092,
"step": 90
},
{
"epoch": 2.5,
"grad_norm": 0.43359375,
"learning_rate": 0.00018660254037844388,
"loss": 1.3523,
"step": 95
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.58203125,
"learning_rate": 0.00018421553219875658,
"loss": 1.3176,
"step": 100
},
{
"epoch": 2.763157894736842,
"grad_norm": 0.58203125,
"learning_rate": 0.0001816508993051943,
"loss": 1.2811,
"step": 105
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.82421875,
"learning_rate": 0.00017891405093963938,
"loss": 1.2585,
"step": 110
},
{
"epoch": 3.0,
"eval_loss": 2.7225849628448486,
"eval_runtime": 0.2375,
"eval_samples_per_second": 42.102,
"eval_steps_per_second": 4.21,
"step": 114
},
{
"epoch": 3.026315789473684,
"grad_norm": 0.9453125,
"learning_rate": 0.00017601075957535364,
"loss": 1.2406,
"step": 115
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.984375,
"learning_rate": 0.0001729471487418621,
"loss": 1.209,
"step": 120
},
{
"epoch": 3.2894736842105265,
"grad_norm": 0.85546875,
"learning_rate": 0.00016972968010939954,
"loss": 1.1876,
"step": 125
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.953125,
"learning_rate": 0.00016636513986016213,
"loss": 1.1913,
"step": 130
},
{
"epoch": 3.5526315789473686,
"grad_norm": 0.6015625,
"learning_rate": 0.0001628606243751082,
"loss": 1.17,
"step": 135
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.546875,
"learning_rate": 0.00015922352526649803,
"loss": 1.1573,
"step": 140
},
{
"epoch": 3.8157894736842106,
"grad_norm": 0.734375,
"learning_rate": 0.00015546151378774086,
"loss": 1.157,
"step": 145
},
{
"epoch": 3.9473684210526314,
"grad_norm": 0.81640625,
"learning_rate": 0.00015158252465343242,
"loss": 1.1493,
"step": 150
},
{
"epoch": 4.0,
"eval_loss": 2.631202220916748,
"eval_runtime": 0.238,
"eval_samples_per_second": 42.017,
"eval_steps_per_second": 4.202,
"step": 152
},
{
"epoch": 4.078947368421052,
"grad_norm": 1.21875,
"learning_rate": 0.00014759473930370736,
"loss": 1.1375,
"step": 155
},
{
"epoch": 4.2105263157894735,
"grad_norm": 1.1015625,
"learning_rate": 0.00014350656864820733,
"loss": 1.1162,
"step": 160
},
{
"epoch": 4.342105263157895,
"grad_norm": 1.2734375,
"learning_rate": 0.0001393266353260583,
"loss": 1.1148,
"step": 165
},
{
"epoch": 4.473684210526316,
"grad_norm": 1.0078125,
"learning_rate": 0.00013506375551927547,
"loss": 1.1125,
"step": 170
},
{
"epoch": 4.605263157894737,
"grad_norm": 0.625,
"learning_rate": 0.00013072692035795305,
"loss": 1.1057,
"step": 175
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.7421875,
"learning_rate": 0.00012632527695645993,
"loss": 1.1023,
"step": 180
},
{
"epoch": 4.868421052631579,
"grad_norm": 0.703125,
"learning_rate": 0.0001218681091206376,
"loss": 1.0983,
"step": 185
},
{
"epoch": 5.0,
"grad_norm": 0.6796875,
"learning_rate": 0.00011736481776669306,
"loss": 1.0934,
"step": 190
},
{
"epoch": 5.0,
"eval_loss": 2.6199984550476074,
"eval_runtime": 0.2365,
"eval_samples_per_second": 42.281,
"eval_steps_per_second": 4.228,
"step": 190
},
{
"epoch": 5.131578947368421,
"grad_norm": 0.74609375,
"learning_rate": 0.00011282490109308633,
"loss": 1.0826,
"step": 195
},
{
"epoch": 5.2631578947368425,
"grad_norm": 0.81640625,
"learning_rate": 0.00010825793454723325,
"loss": 1.0788,
"step": 200
},
{
"epoch": 5.394736842105263,
"grad_norm": 0.68359375,
"learning_rate": 0.00010367355062927726,
"loss": 1.0782,
"step": 205
},
{
"epoch": 5.526315789473684,
"grad_norm": 0.95703125,
"learning_rate": 9.908141857552737e-05,
"loss": 1.0606,
"step": 210
},
{
"epoch": 5.657894736842105,
"grad_norm": 0.65234375,
"learning_rate": 9.449122396441345e-05,
"loss": 1.0564,
"step": 215
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.5546875,
"learning_rate": 8.991264828797319e-05,
"loss": 1.0509,
"step": 220
},
{
"epoch": 5.921052631578947,
"grad_norm": 0.6328125,
"learning_rate": 8.535534853195786e-05,
"loss": 1.0587,
"step": 225
},
{
"epoch": 6.0,
"eval_loss": 2.6019885540008545,
"eval_runtime": 0.2397,
"eval_samples_per_second": 41.723,
"eval_steps_per_second": 4.172,
"step": 228
},
{
"epoch": 6.052631578947368,
"grad_norm": 0.6640625,
"learning_rate": 8.082893680762619e-05,
"loss": 1.0543,
"step": 230
},
{
"epoch": 6.184210526315789,
"grad_norm": 0.72265625,
"learning_rate": 7.634296007818576e-05,
"loss": 1.0452,
"step": 235
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.84375,
"learning_rate": 7.190688002264308e-05,
"loss": 1.042,
"step": 240
},
{
"epoch": 6.447368421052632,
"grad_norm": 0.66796875,
"learning_rate": 6.753005307953167e-05,
"loss": 1.0413,
"step": 245
},
{
"epoch": 6.578947368421053,
"grad_norm": 0.80078125,
"learning_rate": 6.322171071261071e-05,
"loss": 1.0436,
"step": 250
},
{
"epoch": 6.7105263157894735,
"grad_norm": 0.77734375,
"learning_rate": 5.8990939940156e-05,
"loss": 1.0367,
"step": 255
},
{
"epoch": 6.842105263157895,
"grad_norm": 0.5703125,
"learning_rate": 5.484666416891109e-05,
"loss": 1.0299,
"step": 260
},
{
"epoch": 6.973684210526316,
"grad_norm": 0.70703125,
"learning_rate": 5.079762437312219e-05,
"loss": 1.0289,
"step": 265
},
{
"epoch": 7.0,
"eval_loss": 2.5995545387268066,
"eval_runtime": 0.2387,
"eval_samples_per_second": 41.887,
"eval_steps_per_second": 4.189,
"step": 266
},
{
"epoch": 7.105263157894737,
"grad_norm": 0.53125,
"learning_rate": 4.685236065835443e-05,
"loss": 1.0249,
"step": 270
},
{
"epoch": 7.2368421052631575,
"grad_norm": 0.671875,
"learning_rate": 4.301919424897338e-05,
"loss": 1.0192,
"step": 275
},
{
"epoch": 7.368421052631579,
"grad_norm": 0.486328125,
"learning_rate": 3.9306209937284346e-05,
"loss": 1.0285,
"step": 280
},
{
"epoch": 7.5,
"grad_norm": 0.72265625,
"learning_rate": 3.5721239031346066e-05,
"loss": 1.025,
"step": 285
},
{
"epoch": 7.631578947368421,
"grad_norm": 0.58984375,
"learning_rate": 3.227184283742591e-05,
"loss": 1.0347,
"step": 290
},
{
"epoch": 7.7631578947368425,
"grad_norm": 0.51171875,
"learning_rate": 2.89652967119336e-05,
"loss": 1.0223,
"step": 295
},
{
"epoch": 7.894736842105263,
"grad_norm": 0.52734375,
"learning_rate": 2.5808574716471856e-05,
"loss": 1.0197,
"step": 300
},
{
"epoch": 8.0,
"eval_loss": 2.602214813232422,
"eval_runtime": 0.2375,
"eval_samples_per_second": 42.1,
"eval_steps_per_second": 4.21,
"step": 304
},
{
"epoch": 8.026315789473685,
"grad_norm": 0.5546875,
"learning_rate": 2.2808334908367914e-05,
"loss": 1.023,
"step": 305
},
{
"epoch": 8.157894736842104,
"grad_norm": 0.498046875,
"learning_rate": 1.9970905297711606e-05,
"loss": 1.0158,
"step": 310
},
{
"epoch": 8.289473684210526,
"grad_norm": 0.51953125,
"learning_rate": 1.7302270500518182e-05,
"loss": 1.0183,
"step": 315
},
{
"epoch": 8.421052631578947,
"grad_norm": 0.46875,
"learning_rate": 1.4808059116167305e-05,
"loss": 1.0081,
"step": 320
},
{
"epoch": 8.552631578947368,
"grad_norm": 0.53125,
"learning_rate": 1.2493531855740625e-05,
"loss": 1.0149,
"step": 325
},
{
"epoch": 8.68421052631579,
"grad_norm": 0.494140625,
"learning_rate": 1.0363570446297999e-05,
"loss": 1.0197,
"step": 330
},
{
"epoch": 8.81578947368421,
"grad_norm": 0.46484375,
"learning_rate": 8.422667334494249e-06,
"loss": 1.02,
"step": 335
},
{
"epoch": 8.947368421052632,
"grad_norm": 0.5,
"learning_rate": 6.674916211254289e-06,
"loss": 1.0221,
"step": 340
},
{
"epoch": 9.0,
"eval_loss": 2.605945110321045,
"eval_runtime": 0.2375,
"eval_samples_per_second": 42.099,
"eval_steps_per_second": 4.21,
"step": 342
},
{
"epoch": 9.078947368421053,
"grad_norm": 0.45703125,
"learning_rate": 5.124003377490582e-06,
"loss": 1.0218,
"step": 345
},
{
"epoch": 9.210526315789474,
"grad_norm": 0.46484375,
"learning_rate": 3.7731999690749585e-06,
"loss": 1.0087,
"step": 350
},
{
"epoch": 9.342105263157896,
"grad_norm": 0.4609375,
"learning_rate": 2.6253550574632303e-06,
"loss": 1.0178,
"step": 355
},
{
"epoch": 9.473684210526315,
"grad_norm": 0.453125,
"learning_rate": 1.6828896405244988e-06,
"loss": 1.0225,
"step": 360
},
{
"epoch": 9.605263157894736,
"grad_norm": 0.455078125,
"learning_rate": 9.477915362496758e-07,
"loss": 1.0206,
"step": 365
},
{
"epoch": 9.736842105263158,
"grad_norm": 0.490234375,
"learning_rate": 4.216111901092501e-07,
"loss": 1.017,
"step": 370
},
{
"epoch": 9.868421052631579,
"grad_norm": 0.46484375,
"learning_rate": 1.0545840490313596e-07,
"loss": 1.0248,
"step": 375
},
{
"epoch": 10.0,
"grad_norm": 0.57421875,
"learning_rate": 0.0,
"loss": 1.0175,
"step": 380
},
{
"epoch": 10.0,
"eval_loss": 2.6068081855773926,
"eval_runtime": 0.235,
"eval_samples_per_second": 42.55,
"eval_steps_per_second": 4.255,
"step": 380
},
{
"epoch": 10.0,
"step": 380,
"total_flos": 1.158687595912233e+18,
"train_loss": 3.413645140748275,
"train_runtime": 927.0586,
"train_samples_per_second": 26.201,
"train_steps_per_second": 0.41
}
],
"logging_steps": 5,
"max_steps": 380,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 1.158687595912233e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}