{
  "best_metric": 0.26356959342956543,
  "best_model_checkpoint": "./ryan_model3272024/checkpoint-1000",
  "epoch": 0.6496519721577726,
  "eval_steps": 100,
  "global_step": 1400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.38699468970298767,
      "learning_rate": 0.0001994199535962877,
      "loss": 0.4038,
      "step": 25
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.6787680387496948,
      "learning_rate": 0.00019883990719257543,
      "loss": 0.4003,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.5743306279182434,
      "learning_rate": 0.00019825986078886312,
      "loss": 0.3591,
      "step": 75
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.41705068945884705,
      "learning_rate": 0.00019767981438515082,
      "loss": 0.3524,
      "step": 100
    },
    {
      "epoch": 0.05,
      "eval_loss": 0.339992493391037,
      "eval_na_accuracy": 0.7586872577667236,
      "eval_ordinal_accuracy": 0.38746026158332825,
      "eval_ordinal_mae": 0.8904515504837036,
      "eval_runtime": 335.205,
      "eval_samples_per_second": 11.87,
      "eval_steps_per_second": 1.486,
      "step": 100
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.36200761795043945,
      "learning_rate": 0.0001970997679814385,
      "loss": 0.3071,
      "step": 125
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.24589791893959045,
      "learning_rate": 0.00019651972157772623,
      "loss": 0.3475,
      "step": 150
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.6089735627174377,
      "learning_rate": 0.00019593967517401393,
      "loss": 0.3072,
      "step": 175
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.5671761631965637,
      "learning_rate": 0.00019535962877030162,
      "loss": 0.2683,
      "step": 200
    },
    {
      "epoch": 0.09,
      "eval_loss": 0.36712726950645447,
      "eval_na_accuracy": 0.623552143573761,
      "eval_ordinal_accuracy": 0.48916497826576233,
      "eval_ordinal_mae": 0.7306416630744934,
      "eval_runtime": 155.9343,
      "eval_samples_per_second": 25.517,
      "eval_steps_per_second": 3.194,
      "step": 200
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.2764167785644531,
      "learning_rate": 0.00019477958236658932,
      "loss": 0.2953,
      "step": 225
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.9076497554779053,
      "learning_rate": 0.00019419953596287704,
      "loss": 0.3382,
      "step": 250
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.2747127115726471,
      "learning_rate": 0.00019361948955916474,
      "loss": 0.2752,
      "step": 275
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.9448749423027039,
      "learning_rate": 0.00019303944315545243,
      "loss": 0.3314,
      "step": 300
    },
    {
      "epoch": 0.14,
      "eval_loss": 0.3450469672679901,
      "eval_na_accuracy": 0.6969112157821655,
      "eval_ordinal_accuracy": 0.4013291001319885,
      "eval_ordinal_mae": 0.8077224493026733,
      "eval_runtime": 156.2328,
      "eval_samples_per_second": 25.468,
      "eval_steps_per_second": 3.188,
      "step": 300
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.2589721083641052,
      "learning_rate": 0.00019245939675174015,
      "loss": 0.3486,
      "step": 325
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.44286003708839417,
      "learning_rate": 0.00019187935034802785,
      "loss": 0.3386,
      "step": 350
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.3215602934360504,
      "learning_rate": 0.00019129930394431554,
      "loss": 0.3056,
      "step": 375
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9510051012039185,
      "learning_rate": 0.00019071925754060324,
      "loss": 0.2747,
      "step": 400
    },
    {
      "epoch": 0.19,
      "eval_loss": 0.28132036328315735,
      "eval_na_accuracy": 0.7895752787590027,
      "eval_ordinal_accuracy": 0.5423288345336914,
      "eval_ordinal_mae": 0.6105712056159973,
      "eval_runtime": 155.1965,
      "eval_samples_per_second": 25.638,
      "eval_steps_per_second": 3.209,
      "step": 400
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.5417093634605408,
      "learning_rate": 0.00019013921113689096,
      "loss": 0.2522,
      "step": 425
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.405881643295288,
      "learning_rate": 0.00018955916473317868,
      "loss": 0.3589,
      "step": 450
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.8319898843765259,
      "learning_rate": 0.00018897911832946638,
      "loss": 0.2991,
      "step": 475
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.9455621242523193,
      "learning_rate": 0.00018839907192575407,
      "loss": 0.3247,
      "step": 500
    },
    {
      "epoch": 0.23,
      "eval_loss": 0.3143959045410156,
      "eval_na_accuracy": 0.7104247212409973,
      "eval_ordinal_accuracy": 0.4524703919887543,
      "eval_ordinal_mae": 0.7256373763084412,
      "eval_runtime": 157.1141,
      "eval_samples_per_second": 25.326,
      "eval_steps_per_second": 3.17,
      "step": 500
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6339251399040222,
      "learning_rate": 0.00018781902552204177,
      "loss": 0.303,
      "step": 525
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.3713740408420563,
      "learning_rate": 0.0001872389791183295,
      "loss": 0.3035,
      "step": 550
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7050974369049072,
      "learning_rate": 0.00018665893271461718,
      "loss": 0.2609,
      "step": 575
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.791477620601654,
      "learning_rate": 0.00018607888631090488,
      "loss": 0.3612,
      "step": 600
    },
    {
      "epoch": 0.28,
      "eval_loss": 0.3074879050254822,
      "eval_na_accuracy": 0.7586872577667236,
      "eval_ordinal_accuracy": 0.4984108507633209,
      "eval_ordinal_mae": 0.6415887475013733,
      "eval_runtime": 154.2538,
      "eval_samples_per_second": 25.795,
      "eval_steps_per_second": 3.228,
      "step": 600
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.39196524024009705,
      "learning_rate": 0.0001854988399071926,
      "loss": 0.31,
      "step": 625
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.0753191709518433,
      "learning_rate": 0.0001849187935034803,
      "loss": 0.2722,
      "step": 650
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.8922611474990845,
      "learning_rate": 0.000184338747099768,
      "loss": 0.3132,
      "step": 675
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.6866246461868286,
      "learning_rate": 0.0001837587006960557,
      "loss": 0.3031,
      "step": 700
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.2784635126590729,
      "eval_na_accuracy": 0.7895752787590027,
      "eval_ordinal_accuracy": 0.5556197762489319,
      "eval_ordinal_mae": 0.5720168352127075,
      "eval_runtime": 154.421,
      "eval_samples_per_second": 25.767,
      "eval_steps_per_second": 3.225,
      "step": 700
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.713051676750183,
      "learning_rate": 0.0001831786542923434,
      "loss": 0.337,
      "step": 725
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.0872548818588257,
      "learning_rate": 0.0001825986078886311,
      "loss": 0.2918,
      "step": 750
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.5099256038665771,
      "learning_rate": 0.0001820185614849188,
      "loss": 0.2509,
      "step": 775
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.5774210691452026,
      "learning_rate": 0.0001814385150812065,
      "loss": 0.2866,
      "step": 800
    },
    {
      "epoch": 0.37,
      "eval_loss": 0.28780511021614075,
      "eval_na_accuracy": 0.7335907220840454,
      "eval_ordinal_accuracy": 0.5775787234306335,
      "eval_ordinal_mae": 0.5347856879234314,
      "eval_runtime": 154.6062,
      "eval_samples_per_second": 25.736,
      "eval_steps_per_second": 3.221,
      "step": 800
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.33059367537498474,
      "learning_rate": 0.00018085846867749422,
      "loss": 0.2626,
      "step": 825
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.45087730884552,
      "learning_rate": 0.0001802784222737819,
      "loss": 0.3485,
      "step": 850
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.195901870727539,
      "learning_rate": 0.0001796983758700696,
      "loss": 0.3007,
      "step": 875
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.26779890060424805,
      "learning_rate": 0.00017911832946635733,
      "loss": 0.2927,
      "step": 900
    },
    {
      "epoch": 0.42,
      "eval_loss": 0.2688673734664917,
      "eval_na_accuracy": 0.7972972989082336,
      "eval_ordinal_accuracy": 0.5573533773422241,
      "eval_ordinal_mae": 0.5855077505111694,
      "eval_runtime": 154.5178,
      "eval_samples_per_second": 25.751,
      "eval_steps_per_second": 3.223,
      "step": 900
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5635965466499329,
      "learning_rate": 0.00017853828306264502,
      "loss": 0.269,
      "step": 925
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.8135786056518555,
      "learning_rate": 0.00017795823665893272,
      "loss": 0.2677,
      "step": 950
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.49396631121635437,
      "learning_rate": 0.0001773781902552204,
      "loss": 0.3069,
      "step": 975
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.3267723321914673,
      "learning_rate": 0.00017679814385150814,
      "loss": 0.3003,
      "step": 1000
    },
    {
      "epoch": 0.46,
      "eval_loss": 0.26356959342956543,
      "eval_na_accuracy": 0.7915058135986328,
      "eval_ordinal_accuracy": 0.581045925617218,
      "eval_ordinal_mae": 0.5543876886367798,
      "eval_runtime": 157.946,
      "eval_samples_per_second": 25.192,
      "eval_steps_per_second": 3.153,
      "step": 1000
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9938157200813293,
      "learning_rate": 0.00017621809744779583,
      "loss": 0.2521,
      "step": 1025
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.45715010166168213,
      "learning_rate": 0.00017563805104408353,
      "loss": 0.2926,
      "step": 1050
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.9666409492492676,
      "learning_rate": 0.00017505800464037122,
      "loss": 0.2581,
      "step": 1075
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.5301055908203125,
      "learning_rate": 0.00017447795823665894,
      "loss": 0.2522,
      "step": 1100
    },
    {
      "epoch": 0.51,
      "eval_loss": 0.3009192943572998,
      "eval_na_accuracy": 0.8571428656578064,
      "eval_ordinal_accuracy": 0.54435133934021,
      "eval_ordinal_mae": 0.5650931596755981,
      "eval_runtime": 159.1216,
      "eval_samples_per_second": 25.006,
      "eval_steps_per_second": 3.13,
      "step": 1100
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.8192782998085022,
      "learning_rate": 0.00017389791183294664,
      "loss": 0.3584,
      "step": 1125
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.0657265186309814,
      "learning_rate": 0.00017331786542923433,
      "loss": 0.2547,
      "step": 1150
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.5887840390205383,
      "learning_rate": 0.00017273781902552203,
      "loss": 0.2335,
      "step": 1175
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8169906735420227,
      "learning_rate": 0.00017215777262180975,
      "loss": 0.262,
      "step": 1200
    },
    {
      "epoch": 0.56,
      "eval_loss": 0.279022216796875,
      "eval_na_accuracy": 0.8301158547401428,
      "eval_ordinal_accuracy": 0.5801791548728943,
      "eval_ordinal_mae": 0.5203233361244202,
      "eval_runtime": 159.9167,
      "eval_samples_per_second": 24.882,
      "eval_steps_per_second": 3.114,
      "step": 1200
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.5461835861206055,
      "learning_rate": 0.00017157772621809744,
      "loss": 0.2387,
      "step": 1225
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7304142117500305,
      "learning_rate": 0.00017099767981438517,
      "loss": 0.2366,
      "step": 1250
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.3845186233520508,
      "learning_rate": 0.00017041763341067286,
      "loss": 0.2309,
      "step": 1275
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5202885270118713,
      "learning_rate": 0.00016983758700696058,
      "loss": 0.2139,
      "step": 1300
    },
    {
      "epoch": 0.6,
      "eval_loss": 0.2653418481349945,
      "eval_na_accuracy": 0.7509652376174927,
      "eval_ordinal_accuracy": 0.5492632389068604,
      "eval_ordinal_mae": 0.562603771686554,
      "eval_runtime": 158.9921,
      "eval_samples_per_second": 25.026,
      "eval_steps_per_second": 3.132,
      "step": 1300
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.6506483554840088,
      "learning_rate": 0.00016925754060324828,
      "loss": 0.3071,
      "step": 1325
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.5789369940757751,
      "learning_rate": 0.00016867749419953597,
      "loss": 0.2689,
      "step": 1350
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.5665389895439148,
      "learning_rate": 0.00016809744779582367,
      "loss": 0.2598,
      "step": 1375
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6937847137451172,
      "learning_rate": 0.0001675174013921114,
      "loss": 0.2655,
      "step": 1400
    },
    {
      "epoch": 0.65,
      "eval_loss": 0.2760397493839264,
      "eval_na_accuracy": 0.7123551964759827,
      "eval_ordinal_accuracy": 0.5426177382469177,
      "eval_ordinal_mae": 0.6106911897659302,
      "eval_runtime": 160.1635,
      "eval_samples_per_second": 24.843,
      "eval_steps_per_second": 3.109,
      "step": 1400
    },
    {
      "epoch": 0.65,
      "step": 1400,
      "total_flos": 1.735882797809664e+18,
      "train_loss": 0.29669314997536794,
      "train_runtime": 4786.838,
      "train_samples_per_second": 28.807,
      "train_steps_per_second": 1.801
    }
  ],
  "logging_steps": 25,
  "max_steps": 8620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "total_flos": 1.735882797809664e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}