|
{ |
|
"best_metric": 0.4678440617843855, |
|
"best_model_checkpoint": "/fsx/anton/cosmopedia/edu_score/bert_snowflake_regression_4/checkpoint-7000", |
|
"epoch": 4.247572815533981, |
|
"eval_steps": 1000, |
|
"global_step": 7000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06067961165048544, |
|
"grad_norm": 0.5638211965560913, |
|
"learning_rate": 0.0002990898058252427, |
|
"loss": 0.4753, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12135922330097088, |
|
"grad_norm": 0.47830212116241455, |
|
"learning_rate": 0.0002981796116504854, |
|
"loss": 0.357, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1820388349514563, |
|
"grad_norm": 0.6941384077072144, |
|
"learning_rate": 0.0002972694174757281, |
|
"loss": 0.3542, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.24271844660194175, |
|
"grad_norm": 0.459163635969162, |
|
"learning_rate": 0.00029635922330097087, |
|
"loss": 0.3508, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.30339805825242716, |
|
"grad_norm": 1.1585971117019653, |
|
"learning_rate": 0.0002954490291262136, |
|
"loss": 0.3407, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3640776699029126, |
|
"grad_norm": 0.6505594849586487, |
|
"learning_rate": 0.0002945388349514563, |
|
"loss": 0.3394, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.42475728155339804, |
|
"grad_norm": 0.9804072976112366, |
|
"learning_rate": 0.000293628640776699, |
|
"loss": 0.3435, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4854368932038835, |
|
"grad_norm": 0.5816351175308228, |
|
"learning_rate": 0.0002927184466019417, |
|
"loss": 0.3323, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5461165048543689, |
|
"grad_norm": 0.6582027673721313, |
|
"learning_rate": 0.00029180825242718447, |
|
"loss": 0.3293, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6067961165048543, |
|
"grad_norm": 0.8432559370994568, |
|
"learning_rate": 0.0002908980582524271, |
|
"loss": 0.3337, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6067961165048543, |
|
"eval_accuracy": 0.6516312117268014, |
|
"eval_f1_macro": 0.388617225018415, |
|
"eval_loss": 0.32428401708602905, |
|
"eval_precision": 0.5273989146868351, |
|
"eval_recall": 0.3731209274235363, |
|
"eval_runtime": 63.5791, |
|
"eval_samples_per_second": 737.144, |
|
"eval_steps_per_second": 5.772, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6674757281553398, |
|
"grad_norm": 0.6387248039245605, |
|
"learning_rate": 0.0002899878640776699, |
|
"loss": 0.3292, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7281553398058253, |
|
"grad_norm": 2.0111730098724365, |
|
"learning_rate": 0.0002890776699029126, |
|
"loss": 0.3283, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7888349514563107, |
|
"grad_norm": 1.0319699048995972, |
|
"learning_rate": 0.0002881674757281553, |
|
"loss": 0.3236, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8495145631067961, |
|
"grad_norm": 1.194286584854126, |
|
"learning_rate": 0.000287257281553398, |
|
"loss": 0.3178, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9101941747572816, |
|
"grad_norm": 0.4329046308994293, |
|
"learning_rate": 0.00028634708737864073, |
|
"loss": 0.3234, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.970873786407767, |
|
"grad_norm": 0.4490291476249695, |
|
"learning_rate": 0.0002854368932038835, |
|
"loss": 0.3148, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0315533980582525, |
|
"grad_norm": 1.9341398477554321, |
|
"learning_rate": 0.0002845266990291262, |
|
"loss": 0.3157, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.0922330097087378, |
|
"grad_norm": 0.6705629825592041, |
|
"learning_rate": 0.0002836165048543689, |
|
"loss": 0.3144, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.1529126213592233, |
|
"grad_norm": 0.3708420395851135, |
|
"learning_rate": 0.0002827063106796116, |
|
"loss": 0.3106, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.2135922330097086, |
|
"grad_norm": 0.4166070222854614, |
|
"learning_rate": 0.00028179611650485433, |
|
"loss": 0.3065, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.2135922330097086, |
|
"eval_accuracy": 0.6756993193505025, |
|
"eval_f1_macro": 0.4177625223934091, |
|
"eval_loss": 0.3020932972431183, |
|
"eval_precision": 0.5208365532058327, |
|
"eval_recall": 0.3908095186846785, |
|
"eval_runtime": 63.8686, |
|
"eval_samples_per_second": 733.803, |
|
"eval_steps_per_second": 5.746, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.2742718446601942, |
|
"grad_norm": 0.949810266494751, |
|
"learning_rate": 0.00028088592233009704, |
|
"loss": 0.309, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.3349514563106797, |
|
"grad_norm": 0.6933236718177795, |
|
"learning_rate": 0.00027997572815533975, |
|
"loss": 0.3039, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.395631067961165, |
|
"grad_norm": 0.4874693751335144, |
|
"learning_rate": 0.0002790655339805825, |
|
"loss": 0.3016, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.4563106796116505, |
|
"grad_norm": 0.5307803750038147, |
|
"learning_rate": 0.0002781553398058252, |
|
"loss": 0.295, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.516990291262136, |
|
"grad_norm": 0.7260825634002686, |
|
"learning_rate": 0.00027724514563106793, |
|
"loss": 0.298, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.5776699029126213, |
|
"grad_norm": 1.3546072244644165, |
|
"learning_rate": 0.00027633495145631064, |
|
"loss": 0.2937, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.6383495145631068, |
|
"grad_norm": 0.7695233821868896, |
|
"learning_rate": 0.00027542475728155335, |
|
"loss": 0.2939, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.6990291262135924, |
|
"grad_norm": 0.46857160329818726, |
|
"learning_rate": 0.0002745145631067961, |
|
"loss": 0.2911, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.7597087378640777, |
|
"grad_norm": 0.521542489528656, |
|
"learning_rate": 0.0002736043689320388, |
|
"loss": 0.2936, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.820388349514563, |
|
"grad_norm": 1.1797749996185303, |
|
"learning_rate": 0.00027269417475728154, |
|
"loss": 0.291, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.820388349514563, |
|
"eval_accuracy": 0.6800947361683061, |
|
"eval_f1_macro": 0.43005056999542096, |
|
"eval_loss": 0.2904761731624603, |
|
"eval_precision": 0.5283372179121356, |
|
"eval_recall": 0.40012210038254903, |
|
"eval_runtime": 63.5697, |
|
"eval_samples_per_second": 737.253, |
|
"eval_steps_per_second": 5.773, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.8810679611650487, |
|
"grad_norm": 0.9966709613800049, |
|
"learning_rate": 0.00027178398058252425, |
|
"loss": 0.2965, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.941747572815534, |
|
"grad_norm": 0.40996024012565613, |
|
"learning_rate": 0.00027087378640776696, |
|
"loss": 0.2941, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.0024271844660193, |
|
"grad_norm": 0.5450060367584229, |
|
"learning_rate": 0.00026996359223300967, |
|
"loss": 0.2912, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.063106796116505, |
|
"grad_norm": 0.5307539701461792, |
|
"learning_rate": 0.0002690533980582524, |
|
"loss": 0.2872, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.1237864077669903, |
|
"grad_norm": 0.5863193273544312, |
|
"learning_rate": 0.00026814320388349514, |
|
"loss": 0.2929, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.1844660194174756, |
|
"grad_norm": 0.584078311920166, |
|
"learning_rate": 0.00026723300970873785, |
|
"loss": 0.2879, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.2451456310679614, |
|
"grad_norm": 0.6381602883338928, |
|
"learning_rate": 0.00026632281553398056, |
|
"loss": 0.2892, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.3058252427184467, |
|
"grad_norm": 0.4760149121284485, |
|
"learning_rate": 0.00026541262135922327, |
|
"loss": 0.2863, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.366504854368932, |
|
"grad_norm": 0.4088296890258789, |
|
"learning_rate": 0.000264502427184466, |
|
"loss": 0.2913, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.4271844660194173, |
|
"grad_norm": 1.3476176261901855, |
|
"learning_rate": 0.00026359223300970874, |
|
"loss": 0.2845, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.4271844660194173, |
|
"eval_accuracy": 0.6929182580493738, |
|
"eval_f1_macro": 0.4442664055274559, |
|
"eval_loss": 0.280377060174942, |
|
"eval_precision": 0.5328225535144124, |
|
"eval_recall": 0.41160029395774395, |
|
"eval_runtime": 64.199, |
|
"eval_samples_per_second": 730.027, |
|
"eval_steps_per_second": 5.717, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.487864077669903, |
|
"grad_norm": 0.6318752765655518, |
|
"learning_rate": 0.0002626820388349514, |
|
"loss": 0.2812, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.5485436893203883, |
|
"grad_norm": 0.49435973167419434, |
|
"learning_rate": 0.00026177184466019416, |
|
"loss": 0.2803, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.6092233009708736, |
|
"grad_norm": 0.4300900101661682, |
|
"learning_rate": 0.00026086165048543687, |
|
"loss": 0.2853, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.6699029126213594, |
|
"grad_norm": 0.9545436501502991, |
|
"learning_rate": 0.0002599514563106796, |
|
"loss": 0.2813, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.7305825242718447, |
|
"grad_norm": 0.5803716778755188, |
|
"learning_rate": 0.0002590412621359223, |
|
"loss": 0.2838, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.79126213592233, |
|
"grad_norm": 1.4714713096618652, |
|
"learning_rate": 0.000258131067961165, |
|
"loss": 0.2814, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.8519417475728153, |
|
"grad_norm": 0.6767821311950684, |
|
"learning_rate": 0.00025722087378640777, |
|
"loss": 0.2741, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.912621359223301, |
|
"grad_norm": 0.4653462767601013, |
|
"learning_rate": 0.0002563106796116505, |
|
"loss": 0.2783, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.9733009708737863, |
|
"grad_norm": 1.3012775182724, |
|
"learning_rate": 0.0002554004854368932, |
|
"loss": 0.283, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.033980582524272, |
|
"grad_norm": 0.4733451306819916, |
|
"learning_rate": 0.0002544902912621359, |
|
"loss": 0.2767, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.033980582524272, |
|
"eval_accuracy": 0.6949666076343696, |
|
"eval_f1_macro": 0.4524166681181929, |
|
"eval_loss": 0.2772601842880249, |
|
"eval_precision": 0.52914110464024, |
|
"eval_recall": 0.42261704559523156, |
|
"eval_runtime": 63.4257, |
|
"eval_samples_per_second": 738.927, |
|
"eval_steps_per_second": 5.786, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.0946601941747574, |
|
"grad_norm": 0.4103662371635437, |
|
"learning_rate": 0.0002535800970873786, |
|
"loss": 0.2796, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.1553398058252426, |
|
"grad_norm": 0.4195462763309479, |
|
"learning_rate": 0.0002526699029126213, |
|
"loss": 0.2757, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.216019417475728, |
|
"grad_norm": 1.2391552925109863, |
|
"learning_rate": 0.0002517597087378641, |
|
"loss": 0.2783, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.2766990291262137, |
|
"grad_norm": 1.2029412984848022, |
|
"learning_rate": 0.0002508495145631068, |
|
"loss": 0.2772, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.337378640776699, |
|
"grad_norm": 0.5050978660583496, |
|
"learning_rate": 0.0002499393203883495, |
|
"loss": 0.2776, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.3980582524271843, |
|
"grad_norm": 1.0107412338256836, |
|
"learning_rate": 0.0002490291262135922, |
|
"loss": 0.2766, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.45873786407767, |
|
"grad_norm": 0.4374917149543762, |
|
"learning_rate": 0.0002481189320388349, |
|
"loss": 0.2719, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.5194174757281553, |
|
"grad_norm": 1.6768765449523926, |
|
"learning_rate": 0.0002472087378640777, |
|
"loss": 0.2803, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.5800970873786406, |
|
"grad_norm": 0.8120823502540588, |
|
"learning_rate": 0.0002462985436893204, |
|
"loss": 0.2723, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.6407766990291264, |
|
"grad_norm": 1.3967177867889404, |
|
"learning_rate": 0.0002453883495145631, |
|
"loss": 0.2796, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.6407766990291264, |
|
"eval_accuracy": 0.7001301555465466, |
|
"eval_f1_macro": 0.46180498265852127, |
|
"eval_loss": 0.272257536649704, |
|
"eval_precision": 0.5281578618931982, |
|
"eval_recall": 0.4315295129889904, |
|
"eval_runtime": 63.8213, |
|
"eval_samples_per_second": 734.347, |
|
"eval_steps_per_second": 5.75, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.7014563106796117, |
|
"grad_norm": 0.6093985438346863, |
|
"learning_rate": 0.0002444781553398058, |
|
"loss": 0.2744, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 3.762135922330097, |
|
"grad_norm": 0.7282202243804932, |
|
"learning_rate": 0.00024356796116504852, |
|
"loss": 0.2715, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 3.8228155339805827, |
|
"grad_norm": 1.1341967582702637, |
|
"learning_rate": 0.00024265776699029123, |
|
"loss": 0.2709, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 3.883495145631068, |
|
"grad_norm": 0.8576841354370117, |
|
"learning_rate": 0.00024174757281553394, |
|
"loss": 0.275, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.9441747572815533, |
|
"grad_norm": 0.5656840205192566, |
|
"learning_rate": 0.00024083737864077668, |
|
"loss": 0.2676, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.004854368932039, |
|
"grad_norm": 0.6544743180274963, |
|
"learning_rate": 0.0002399271844660194, |
|
"loss": 0.2734, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 4.065533980582524, |
|
"grad_norm": 1.5159205198287964, |
|
"learning_rate": 0.0002390169902912621, |
|
"loss": 0.2666, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.12621359223301, |
|
"grad_norm": 0.9112799763679504, |
|
"learning_rate": 0.00023810679611650483, |
|
"loss": 0.2646, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.186893203883495, |
|
"grad_norm": 0.6971092224121094, |
|
"learning_rate": 0.00023719660194174754, |
|
"loss": 0.2681, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 4.247572815533981, |
|
"grad_norm": 0.5126680731773376, |
|
"learning_rate": 0.00023628640776699028, |
|
"loss": 0.2669, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.247572815533981, |
|
"eval_accuracy": 0.7024985597541981, |
|
"eval_f1_macro": 0.4678440617843855, |
|
"eval_loss": 0.26851192116737366, |
|
"eval_precision": 0.5330566032345198, |
|
"eval_recall": 0.43620949446199525, |
|
"eval_runtime": 63.9009, |
|
"eval_samples_per_second": 733.432, |
|
"eval_steps_per_second": 5.743, |
|
"step": 7000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 32960, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 1000, |
|
"total_flos": 4.713992160165028e+17, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|