|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 870, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14367816091954022, |
|
"grad_norm": 1.424246072769165, |
|
"learning_rate": 5.517241379310345e-05, |
|
"loss": 2.0273, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.28735632183908044, |
|
"grad_norm": 0.9430229663848877, |
|
"learning_rate": 0.00011264367816091954, |
|
"loss": 1.8285, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 0.7553120851516724, |
|
"learning_rate": 0.00017011494252873563, |
|
"loss": 1.6756, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5747126436781609, |
|
"grad_norm": 0.7407281398773193, |
|
"learning_rate": 0.00019693486590038314, |
|
"loss": 1.5606, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7183908045977011, |
|
"grad_norm": 0.7509495615959167, |
|
"learning_rate": 0.0001905491698595147, |
|
"loss": 1.5438, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 0.7058537602424622, |
|
"learning_rate": 0.00018416347381864626, |
|
"loss": 1.4816, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0057471264367817, |
|
"grad_norm": 0.6202262043952942, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 1.5232, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"grad_norm": 0.7314026355743408, |
|
"learning_rate": 0.00017139208173690932, |
|
"loss": 1.4954, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.293103448275862, |
|
"grad_norm": 0.7846217155456543, |
|
"learning_rate": 0.00016500638569604087, |
|
"loss": 1.4199, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4367816091954024, |
|
"grad_norm": 0.751372754573822, |
|
"learning_rate": 0.00015862068965517243, |
|
"loss": 1.48, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5804597701149425, |
|
"grad_norm": 0.7667829990386963, |
|
"learning_rate": 0.00015223499361430396, |
|
"loss": 1.4346, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.7370414733886719, |
|
"learning_rate": 0.00014584929757343552, |
|
"loss": 1.4334, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.867816091954023, |
|
"grad_norm": 0.9422834515571594, |
|
"learning_rate": 0.00013946360153256705, |
|
"loss": 1.3969, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.0114942528735633, |
|
"grad_norm": 0.7596230506896973, |
|
"learning_rate": 0.0001330779054916986, |
|
"loss": 1.4241, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.1551724137931036, |
|
"grad_norm": 0.9657158851623535, |
|
"learning_rate": 0.00012669220945083016, |
|
"loss": 1.3115, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 0.9675273895263672, |
|
"learning_rate": 0.00012030651340996169, |
|
"loss": 1.3382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.442528735632184, |
|
"grad_norm": 0.9593296647071838, |
|
"learning_rate": 0.00011392081736909323, |
|
"loss": 1.3843, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 0.8661421537399292, |
|
"learning_rate": 0.00010753512132822479, |
|
"loss": 1.3161, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.7298850574712645, |
|
"grad_norm": 0.9521291851997375, |
|
"learning_rate": 0.00010114942528735633, |
|
"loss": 1.3355, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"grad_norm": 1.0091650485992432, |
|
"learning_rate": 9.476372924648788e-05, |
|
"loss": 1.3052, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.0172413793103448, |
|
"grad_norm": 0.9030539989471436, |
|
"learning_rate": 8.837803320561942e-05, |
|
"loss": 1.2682, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.160919540229885, |
|
"grad_norm": 1.0906364917755127, |
|
"learning_rate": 8.199233716475096e-05, |
|
"loss": 1.2097, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.3045977011494254, |
|
"grad_norm": 1.1765050888061523, |
|
"learning_rate": 7.56066411238825e-05, |
|
"loss": 1.2564, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 1.1693239212036133, |
|
"learning_rate": 6.922094508301405e-05, |
|
"loss": 1.2207, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.5919540229885056, |
|
"grad_norm": 1.0836124420166016, |
|
"learning_rate": 6.283524904214559e-05, |
|
"loss": 1.2338, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.735632183908046, |
|
"grad_norm": 1.0297603607177734, |
|
"learning_rate": 5.644955300127714e-05, |
|
"loss": 1.2179, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.8793103448275863, |
|
"grad_norm": 1.3402975797653198, |
|
"learning_rate": 5.0063856960408687e-05, |
|
"loss": 1.2363, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.022988505747127, |
|
"grad_norm": 1.1158450841903687, |
|
"learning_rate": 4.367816091954024e-05, |
|
"loss": 1.1844, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 1.3297624588012695, |
|
"learning_rate": 3.729246487867178e-05, |
|
"loss": 1.1176, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.310344827586207, |
|
"grad_norm": 1.3470237255096436, |
|
"learning_rate": 3.090676883780332e-05, |
|
"loss": 1.1639, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.454022988505747, |
|
"grad_norm": 1.293717861175537, |
|
"learning_rate": 2.4521072796934867e-05, |
|
"loss": 1.1668, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.597701149425287, |
|
"grad_norm": 1.400448203086853, |
|
"learning_rate": 1.8135376756066413e-05, |
|
"loss": 1.1751, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.741379310344827, |
|
"grad_norm": 1.1660029888153076, |
|
"learning_rate": 1.1749680715197957e-05, |
|
"loss": 1.1406, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.885057471264368, |
|
"grad_norm": 1.311295747756958, |
|
"learning_rate": 5.3639846743295025e-06, |
|
"loss": 1.1406, |
|
"step": 850 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 870, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.6060942324334592e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|