|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 51.54639175257732, |
|
"eval_steps": 500, |
|
"global_step": 25000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"grad_norm": 4.000246524810791, |
|
"learning_rate": 3.4364261168384886e-06, |
|
"loss": 8.264, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"grad_norm": 2.342319965362549, |
|
"learning_rate": 6.872852233676977e-06, |
|
"loss": 6.6664, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.0927835051546393, |
|
"grad_norm": 2.5340118408203125, |
|
"learning_rate": 1.0309278350515464e-05, |
|
"loss": 5.3366, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.123711340206185, |
|
"grad_norm": 2.5916709899902344, |
|
"learning_rate": 1.3745704467353954e-05, |
|
"loss": 4.4002, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.154639175257732, |
|
"grad_norm": 3.448418378829956, |
|
"learning_rate": 1.7182130584192442e-05, |
|
"loss": 3.8367, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.185567010309279, |
|
"grad_norm": 3.4657623767852783, |
|
"learning_rate": 2.0618556701030927e-05, |
|
"loss": 3.4293, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.216494845360825, |
|
"grad_norm": 4.080520153045654, |
|
"learning_rate": 2.4054982817869417e-05, |
|
"loss": 3.1112, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 8.24742268041237, |
|
"grad_norm": 3.7761707305908203, |
|
"learning_rate": 2.749140893470791e-05, |
|
"loss": 2.8723, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 9.278350515463918, |
|
"grad_norm": 3.7703592777252197, |
|
"learning_rate": 3.0927835051546395e-05, |
|
"loss": 2.6786, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 10.309278350515465, |
|
"grad_norm": 2.716610908508301, |
|
"learning_rate": 3.4364261168384884e-05, |
|
"loss": 2.5182, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 11.34020618556701, |
|
"grad_norm": 3.0980777740478516, |
|
"learning_rate": 3.7800687285223366e-05, |
|
"loss": 2.388, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 12.371134020618557, |
|
"grad_norm": 2.8109469413757324, |
|
"learning_rate": 4.1237113402061855e-05, |
|
"loss": 2.271, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 13.402061855670103, |
|
"grad_norm": 2.512942314147949, |
|
"learning_rate": 4.466666666666667e-05, |
|
"loss": 2.1741, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 14.43298969072165, |
|
"grad_norm": 2.585681200027466, |
|
"learning_rate": 4.810309278350515e-05, |
|
"loss": 2.0924, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 15.463917525773196, |
|
"grad_norm": 2.4047884941101074, |
|
"learning_rate": 5.153951890034364e-05, |
|
"loss": 2.011, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 16.49484536082474, |
|
"grad_norm": 2.4080662727355957, |
|
"learning_rate": 5.497594501718213e-05, |
|
"loss": 1.946, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 17.52577319587629, |
|
"grad_norm": 2.2564289569854736, |
|
"learning_rate": 5.840549828178694e-05, |
|
"loss": 1.8899, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 18.556701030927837, |
|
"grad_norm": 2.0903544425964355, |
|
"learning_rate": 6.184192439862543e-05, |
|
"loss": 1.8404, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 19.587628865979383, |
|
"grad_norm": 2.0401394367218018, |
|
"learning_rate": 6.527835051546391e-05, |
|
"loss": 1.7894, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 20.61855670103093, |
|
"grad_norm": 2.026660919189453, |
|
"learning_rate": 6.87147766323024e-05, |
|
"loss": 1.7505, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 21.649484536082475, |
|
"grad_norm": 1.9435638189315796, |
|
"learning_rate": 7.21512027491409e-05, |
|
"loss": 1.7097, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 22.68041237113402, |
|
"grad_norm": 1.9047859907150269, |
|
"learning_rate": 7.558075601374571e-05, |
|
"loss": 1.6748, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 23.711340206185568, |
|
"grad_norm": 1.8212696313858032, |
|
"learning_rate": 7.90171821305842e-05, |
|
"loss": 1.64, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 24.742268041237114, |
|
"grad_norm": 1.7629321813583374, |
|
"learning_rate": 8.245360824742269e-05, |
|
"loss": 1.6115, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 25.77319587628866, |
|
"grad_norm": 1.7129898071289062, |
|
"learning_rate": 8.589003436426117e-05, |
|
"loss": 1.5824, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 26.804123711340207, |
|
"grad_norm": 1.6478886604309082, |
|
"learning_rate": 8.932646048109967e-05, |
|
"loss": 1.5537, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 27.835051546391753, |
|
"grad_norm": 1.6928553581237793, |
|
"learning_rate": 9.276288659793815e-05, |
|
"loss": 1.528, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 28.8659793814433, |
|
"grad_norm": 1.5595742464065552, |
|
"learning_rate": 9.619931271477663e-05, |
|
"loss": 1.5037, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 29.896907216494846, |
|
"grad_norm": 1.6010233163833618, |
|
"learning_rate": 9.963573883161513e-05, |
|
"loss": 1.4804, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 30.927835051546392, |
|
"grad_norm": 1.4490132331848145, |
|
"learning_rate": 9.99574236655172e-05, |
|
"loss": 1.4564, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 31.95876288659794, |
|
"grad_norm": 1.449069857597351, |
|
"learning_rate": 9.980854570375779e-05, |
|
"loss": 1.4322, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 32.98969072164948, |
|
"grad_norm": 1.4955676794052124, |
|
"learning_rate": 9.955306053101556e-05, |
|
"loss": 1.4091, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 34.02061855670103, |
|
"grad_norm": 1.4652965068817139, |
|
"learning_rate": 9.919234343505417e-05, |
|
"loss": 1.3867, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 35.05154639175258, |
|
"grad_norm": 1.4383347034454346, |
|
"learning_rate": 9.872572097155327e-05, |
|
"loss": 1.367, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 36.08247422680412, |
|
"grad_norm": 1.3232207298278809, |
|
"learning_rate": 9.815480890990188e-05, |
|
"loss": 1.3452, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 37.11340206185567, |
|
"grad_norm": 1.3220405578613281, |
|
"learning_rate": 9.748082919588761e-05, |
|
"loss": 1.3265, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 38.144329896907216, |
|
"grad_norm": 1.2943527698516846, |
|
"learning_rate": 9.670522437509286e-05, |
|
"loss": 1.3091, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 39.175257731958766, |
|
"grad_norm": 1.3372294902801514, |
|
"learning_rate": 9.582965450535715e-05, |
|
"loss": 1.2913, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 40.20618556701031, |
|
"grad_norm": 1.359191656112671, |
|
"learning_rate": 9.485599360368925e-05, |
|
"loss": 1.2745, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 41.23711340206186, |
|
"grad_norm": 1.3238861560821533, |
|
"learning_rate": 9.378632563523418e-05, |
|
"loss": 1.2581, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 42.2680412371134, |
|
"grad_norm": 1.311515212059021, |
|
"learning_rate": 9.262535873205258e-05, |
|
"loss": 1.2448, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 43.29896907216495, |
|
"grad_norm": 1.3535444736480713, |
|
"learning_rate": 9.137092541559738e-05, |
|
"loss": 1.2271, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 44.329896907216494, |
|
"grad_norm": 1.3142534494400024, |
|
"learning_rate": 9.002794426238008e-05, |
|
"loss": 1.2132, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 45.36082474226804, |
|
"grad_norm": 1.3405091762542725, |
|
"learning_rate": 8.859928970836587e-05, |
|
"loss": 1.1984, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 46.391752577319586, |
|
"grad_norm": 1.339012861251831, |
|
"learning_rate": 8.709112242917366e-05, |
|
"loss": 1.1846, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 47.422680412371136, |
|
"grad_norm": 1.336722493171692, |
|
"learning_rate": 8.550062673710893e-05, |
|
"loss": 1.1725, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 48.45360824742268, |
|
"grad_norm": 1.2952606678009033, |
|
"learning_rate": 8.383414764197326e-05, |
|
"loss": 1.1577, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 49.48453608247423, |
|
"grad_norm": 1.2937185764312744, |
|
"learning_rate": 8.209525197524074e-05, |
|
"loss": 1.1451, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 50.51546391752577, |
|
"grad_norm": 1.3922706842422485, |
|
"learning_rate": 8.029134275478738e-05, |
|
"loss": 1.1334, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 51.54639175257732, |
|
"grad_norm": 1.2733055353164673, |
|
"learning_rate": 7.841905215460069e-05, |
|
"loss": 1.1203, |
|
"step": 25000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 48500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.046659236626432e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|