{ "best_metric": 1.8574078837127486, "best_model_checkpoint": "./modernBERT-content-regression/run-0/checkpoint-124", "epoch": 1.0, "eval_steps": 500, "global_step": 124, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008064516129032258, "grad_norm": 312.78533935546875, "learning_rate": 5.891861273742105e-07, "loss": 21.3087, "step": 1 }, { "epoch": 0.016129032258064516, "grad_norm": 97.88153839111328, "learning_rate": 5.84395996257347e-07, "loss": 2.0402, "step": 2 }, { "epoch": 0.024193548387096774, "grad_norm": 538.2888793945312, "learning_rate": 5.796058651404834e-07, "loss": 155.6907, "step": 3 }, { "epoch": 0.03225806451612903, "grad_norm": 215.75390625, "learning_rate": 5.748157340236199e-07, "loss": 19.2115, "step": 4 }, { "epoch": 0.04032258064516129, "grad_norm": 125.33987426757812, "learning_rate": 5.700256029067565e-07, "loss": 4.8412, "step": 5 }, { "epoch": 0.04838709677419355, "grad_norm": 48.7938346862793, "learning_rate": 5.65235471789893e-07, "loss": 0.7595, "step": 6 }, { "epoch": 0.056451612903225805, "grad_norm": 199.99588012695312, "learning_rate": 5.604453406730294e-07, "loss": 10.9418, "step": 7 }, { "epoch": 0.06451612903225806, "grad_norm": 320.8084716796875, "learning_rate": 5.556552095561659e-07, "loss": 24.2806, "step": 8 }, { "epoch": 0.07258064516129033, "grad_norm": 86.29633331298828, "learning_rate": 5.508650784393024e-07, "loss": 1.3563, "step": 9 }, { "epoch": 0.08064516129032258, "grad_norm": 162.44317626953125, "learning_rate": 5.460749473224389e-07, "loss": 6.7617, "step": 10 }, { "epoch": 0.08870967741935484, "grad_norm": 277.6880187988281, "learning_rate": 5.412848162055754e-07, "loss": 22.2787, "step": 11 }, { "epoch": 0.0967741935483871, "grad_norm": 98.6792984008789, "learning_rate": 5.364946850887119e-07, "loss": 2.3356, "step": 12 }, { "epoch": 0.10483870967741936, "grad_norm": 46.1937141418457, "learning_rate": 5.317045539718485e-07, "loss": 0.4073, "step": 13 }, { "epoch": 0.11290322580645161, "grad_norm": 174.8550567626953, "learning_rate": 5.269144228549849e-07, "loss": 6.7447, "step": 14 }, { "epoch": 0.12096774193548387, "grad_norm": 98.10540771484375, "learning_rate": 5.221242917381214e-07, "loss": 1.3577, "step": 15 }, { "epoch": 0.12903225806451613, "grad_norm": 242.8153533935547, "learning_rate": 5.173341606212579e-07, "loss": 15.6496, "step": 16 }, { "epoch": 0.13709677419354838, "grad_norm": 128.5262908935547, "learning_rate": 5.125440295043945e-07, "loss": 3.0652, "step": 17 }, { "epoch": 0.14516129032258066, "grad_norm": 167.771484375, "learning_rate": 5.077538983875309e-07, "loss": 6.343, "step": 18 }, { "epoch": 0.1532258064516129, "grad_norm": 64.37162017822266, "learning_rate": 5.029637672706674e-07, "loss": 1.2451, "step": 19 }, { "epoch": 0.16129032258064516, "grad_norm": 340.9455261230469, "learning_rate": 4.98173636153804e-07, "loss": 18.9788, "step": 20 }, { "epoch": 0.1693548387096774, "grad_norm": 32.88615798950195, "learning_rate": 4.933835050369405e-07, "loss": 0.325, "step": 21 }, { "epoch": 0.1774193548387097, "grad_norm": 11.620039939880371, "learning_rate": 4.88593373920077e-07, "loss": 0.2452, "step": 22 }, { "epoch": 0.18548387096774194, "grad_norm": 162.82505798339844, "learning_rate": 4.838032428032134e-07, "loss": 20.8943, "step": 23 }, { "epoch": 0.1935483870967742, "grad_norm": 32.955055236816406, "learning_rate": 4.7901311168635e-07, "loss": 0.3191, "step": 24 }, { "epoch": 0.20161290322580644, "grad_norm": 58.342063903808594, "learning_rate": 4.7422298056948643e-07, "loss": 0.8499, "step": 25 }, { "epoch": 0.20967741935483872, "grad_norm": 48.5831413269043, "learning_rate": 4.694328494526229e-07, "loss": 0.6767, "step": 26 }, { "epoch": 0.21774193548387097, "grad_norm": 64.95976257324219, "learning_rate": 4.6464271833575944e-07, "loss": 3.5687, "step": 27 }, { "epoch": 0.22580645161290322, "grad_norm": 82.42206573486328, "learning_rate": 4.598525872188959e-07, "loss": 0.2793, "step": 28 }, { "epoch": 0.23387096774193547, "grad_norm": 24.026878356933594, "learning_rate": 4.5506245610203244e-07, "loss": 0.8235, "step": 29 }, { "epoch": 0.24193548387096775, "grad_norm": 58.60378646850586, "learning_rate": 4.502723249851689e-07, "loss": 0.8523, "step": 30 }, { "epoch": 0.25, "grad_norm": 103.00776672363281, "learning_rate": 4.4548219386830544e-07, "loss": 1.8451, "step": 31 }, { "epoch": 0.25806451612903225, "grad_norm": 105.24837493896484, "learning_rate": 4.4069206275144197e-07, "loss": 1.9855, "step": 32 }, { "epoch": 0.2661290322580645, "grad_norm": 31.479812622070312, "learning_rate": 4.3590193163457845e-07, "loss": 1.1242, "step": 33 }, { "epoch": 0.27419354838709675, "grad_norm": 96.84868621826172, "learning_rate": 4.3111180051771497e-07, "loss": 2.3358, "step": 34 }, { "epoch": 0.28225806451612906, "grad_norm": 397.0208740234375, "learning_rate": 4.2632166940085145e-07, "loss": 28.5264, "step": 35 }, { "epoch": 0.2903225806451613, "grad_norm": 31.28931999206543, "learning_rate": 4.21531538283988e-07, "loss": 0.4193, "step": 36 }, { "epoch": 0.29838709677419356, "grad_norm": 178.57113647460938, "learning_rate": 4.1674140716712445e-07, "loss": 2.5081, "step": 37 }, { "epoch": 0.3064516129032258, "grad_norm": 279.4121398925781, "learning_rate": 4.11951276050261e-07, "loss": 28.2095, "step": 38 }, { "epoch": 0.31451612903225806, "grad_norm": 36.0539436340332, "learning_rate": 4.071611449333974e-07, "loss": 0.2968, "step": 39 }, { "epoch": 0.3225806451612903, "grad_norm": 586.9475708007812, "learning_rate": 4.0237101381653393e-07, "loss": 36.0125, "step": 40 }, { "epoch": 0.33064516129032256, "grad_norm": 45.97328186035156, "learning_rate": 3.975808826996704e-07, "loss": 1.1626, "step": 41 }, { "epoch": 0.3387096774193548, "grad_norm": 190.89552307128906, "learning_rate": 3.9279075158280693e-07, "loss": 19.2772, "step": 42 }, { "epoch": 0.3467741935483871, "grad_norm": 199.61676025390625, "learning_rate": 3.8800062046594346e-07, "loss": 29.038, "step": 43 }, { "epoch": 0.3548387096774194, "grad_norm": 33.44911575317383, "learning_rate": 3.8321048934907993e-07, "loss": 0.5206, "step": 44 }, { "epoch": 0.3629032258064516, "grad_norm": 57.073036193847656, "learning_rate": 3.7842035823221646e-07, "loss": 1.4738, "step": 45 }, { "epoch": 0.3709677419354839, "grad_norm": 95.9813232421875, "learning_rate": 3.7363022711535294e-07, "loss": 3.1093, "step": 46 }, { "epoch": 0.3790322580645161, "grad_norm": 47.49271011352539, "learning_rate": 3.6884009599848947e-07, "loss": 2.8001, "step": 47 }, { "epoch": 0.3870967741935484, "grad_norm": 126.86119079589844, "learning_rate": 3.6404996488162594e-07, "loss": 1.2492, "step": 48 }, { "epoch": 0.3951612903225806, "grad_norm": 96.22097778320312, "learning_rate": 3.5925983376476247e-07, "loss": 1.7812, "step": 49 }, { "epoch": 0.4032258064516129, "grad_norm": 346.9049072265625, "learning_rate": 3.5446970264789894e-07, "loss": 9.2024, "step": 50 }, { "epoch": 0.4112903225806452, "grad_norm": 94.45950317382812, "learning_rate": 3.4967957153103547e-07, "loss": 0.7495, "step": 51 }, { "epoch": 0.41935483870967744, "grad_norm": 22.184967041015625, "learning_rate": 3.44889440414172e-07, "loss": 0.3148, "step": 52 }, { "epoch": 0.4274193548387097, "grad_norm": 354.94293212890625, "learning_rate": 3.400993092973085e-07, "loss": 16.7273, "step": 53 }, { "epoch": 0.43548387096774194, "grad_norm": 61.76173400878906, "learning_rate": 3.35309178180445e-07, "loss": 0.7872, "step": 54 }, { "epoch": 0.4435483870967742, "grad_norm": 65.9664077758789, "learning_rate": 3.305190470635814e-07, "loss": 5.1777, "step": 55 }, { "epoch": 0.45161290322580644, "grad_norm": 64.19781494140625, "learning_rate": 3.2572891594671795e-07, "loss": 1.3474, "step": 56 }, { "epoch": 0.4596774193548387, "grad_norm": 112.18177032470703, "learning_rate": 3.2093878482985443e-07, "loss": 1.0369, "step": 57 }, { "epoch": 0.46774193548387094, "grad_norm": 216.22451782226562, "learning_rate": 3.1614865371299096e-07, "loss": 9.7099, "step": 58 }, { "epoch": 0.47580645161290325, "grad_norm": 55.55582046508789, "learning_rate": 3.1135852259612743e-07, "loss": 0.5347, "step": 59 }, { "epoch": 0.4838709677419355, "grad_norm": 263.30804443359375, "learning_rate": 3.0656839147926396e-07, "loss": 15.46, "step": 60 }, { "epoch": 0.49193548387096775, "grad_norm": 42.56807327270508, "learning_rate": 3.0177826036240043e-07, "loss": 0.1513, "step": 61 }, { "epoch": 0.5, "grad_norm": 163.58816528320312, "learning_rate": 2.9698812924553696e-07, "loss": 22.7721, "step": 62 }, { "epoch": 0.5080645161290323, "grad_norm": 95.77588653564453, "learning_rate": 2.921979981286735e-07, "loss": 1.042, "step": 63 }, { "epoch": 0.5161290322580645, "grad_norm": 91.72923278808594, "learning_rate": 2.8740786701180996e-07, "loss": 3.4456, "step": 64 }, { "epoch": 0.5241935483870968, "grad_norm": 211.87564086914062, "learning_rate": 2.826177358949465e-07, "loss": 13.9779, "step": 65 }, { "epoch": 0.532258064516129, "grad_norm": 133.1850128173828, "learning_rate": 2.7782760477808297e-07, "loss": 1.6836, "step": 66 }, { "epoch": 0.5403225806451613, "grad_norm": 61.98344802856445, "learning_rate": 2.7303747366121944e-07, "loss": 1.0419, "step": 67 }, { "epoch": 0.5483870967741935, "grad_norm": 41.35300064086914, "learning_rate": 2.6824734254435597e-07, "loss": 0.7594, "step": 68 }, { "epoch": 0.5564516129032258, "grad_norm": 92.70877838134766, "learning_rate": 2.6345721142749245e-07, "loss": 1.1764, "step": 69 }, { "epoch": 0.5645161290322581, "grad_norm": 128.7194366455078, "learning_rate": 2.5866708031062897e-07, "loss": 1.6011, "step": 70 }, { "epoch": 0.5725806451612904, "grad_norm": 117.02701568603516, "learning_rate": 2.5387694919376545e-07, "loss": 2.4181, "step": 71 }, { "epoch": 0.5806451612903226, "grad_norm": 260.2272644042969, "learning_rate": 2.49086818076902e-07, "loss": 20.2532, "step": 72 }, { "epoch": 0.5887096774193549, "grad_norm": 120.9480972290039, "learning_rate": 2.442966869600385e-07, "loss": 1.3606, "step": 73 }, { "epoch": 0.5967741935483871, "grad_norm": 126.54743194580078, "learning_rate": 2.39506555843175e-07, "loss": 0.9655, "step": 74 }, { "epoch": 0.6048387096774194, "grad_norm": 133.0532684326172, "learning_rate": 2.3471642472631145e-07, "loss": 17.9774, "step": 75 }, { "epoch": 0.6129032258064516, "grad_norm": 20.1326961517334, "learning_rate": 2.2992629360944796e-07, "loss": 0.2602, "step": 76 }, { "epoch": 0.6209677419354839, "grad_norm": 27.283140182495117, "learning_rate": 2.2513616249258446e-07, "loss": 0.164, "step": 77 }, { "epoch": 0.6290322580645161, "grad_norm": 120.67879486083984, "learning_rate": 2.2034603137572099e-07, "loss": 14.4914, "step": 78 }, { "epoch": 0.6370967741935484, "grad_norm": 240.2298583984375, "learning_rate": 2.1555590025885749e-07, "loss": 22.8088, "step": 79 }, { "epoch": 0.6451612903225806, "grad_norm": 72.27034759521484, "learning_rate": 2.10765769141994e-07, "loss": 1.0051, "step": 80 }, { "epoch": 0.6532258064516129, "grad_norm": 13.931716918945312, "learning_rate": 2.059756380251305e-07, "loss": 0.2283, "step": 81 }, { "epoch": 0.6612903225806451, "grad_norm": 113.01272583007812, "learning_rate": 2.0118550690826696e-07, "loss": 2.5264, "step": 82 }, { "epoch": 0.6693548387096774, "grad_norm": 44.34663009643555, "learning_rate": 1.9639537579140347e-07, "loss": 0.2403, "step": 83 }, { "epoch": 0.6774193548387096, "grad_norm": 27.369699478149414, "learning_rate": 1.9160524467453997e-07, "loss": 1.3949, "step": 84 }, { "epoch": 0.6854838709677419, "grad_norm": 297.8616638183594, "learning_rate": 1.8681511355767647e-07, "loss": 18.9797, "step": 85 }, { "epoch": 0.6935483870967742, "grad_norm": 116.84327697753906, "learning_rate": 1.8202498244081297e-07, "loss": 13.8606, "step": 86 }, { "epoch": 0.7016129032258065, "grad_norm": 79.06473541259766, "learning_rate": 1.7723485132394947e-07, "loss": 2.3509, "step": 87 }, { "epoch": 0.7096774193548387, "grad_norm": 73.07052612304688, "learning_rate": 1.72444720207086e-07, "loss": 0.6984, "step": 88 }, { "epoch": 0.717741935483871, "grad_norm": 79.30596160888672, "learning_rate": 1.676545890902225e-07, "loss": 0.8937, "step": 89 }, { "epoch": 0.7258064516129032, "grad_norm": 100.33399963378906, "learning_rate": 1.6286445797335898e-07, "loss": 1.0283, "step": 90 }, { "epoch": 0.7338709677419355, "grad_norm": 344.4866638183594, "learning_rate": 1.5807432685649548e-07, "loss": 22.9303, "step": 91 }, { "epoch": 0.7419354838709677, "grad_norm": 167.5432891845703, "learning_rate": 1.5328419573963198e-07, "loss": 8.3142, "step": 92 }, { "epoch": 0.75, "grad_norm": 44.125980377197266, "learning_rate": 1.4849406462276848e-07, "loss": 0.4436, "step": 93 }, { "epoch": 0.7580645161290323, "grad_norm": 85.20245361328125, "learning_rate": 1.4370393350590498e-07, "loss": 0.9511, "step": 94 }, { "epoch": 0.7661290322580645, "grad_norm": 213.55230712890625, "learning_rate": 1.3891380238904148e-07, "loss": 11.2112, "step": 95 }, { "epoch": 0.7741935483870968, "grad_norm": 120.69640350341797, "learning_rate": 1.3412367127217799e-07, "loss": 1.8579, "step": 96 }, { "epoch": 0.782258064516129, "grad_norm": 91.34770965576172, "learning_rate": 1.2933354015531449e-07, "loss": 0.8711, "step": 97 }, { "epoch": 0.7903225806451613, "grad_norm": 113.83234405517578, "learning_rate": 1.24543409038451e-07, "loss": 1.3935, "step": 98 }, { "epoch": 0.7983870967741935, "grad_norm": 188.62608337402344, "learning_rate": 1.197532779215875e-07, "loss": 1.5678, "step": 99 }, { "epoch": 0.8064516129032258, "grad_norm": 275.4859619140625, "learning_rate": 1.1496314680472398e-07, "loss": 19.5526, "step": 100 }, { "epoch": 0.8145161290322581, "grad_norm": 79.26446533203125, "learning_rate": 1.1017301568786049e-07, "loss": 1.3286, "step": 101 }, { "epoch": 0.8225806451612904, "grad_norm": 99.26114654541016, "learning_rate": 1.05382884570997e-07, "loss": 1.1271, "step": 102 }, { "epoch": 0.8306451612903226, "grad_norm": 120.89157104492188, "learning_rate": 1.0059275345413348e-07, "loss": 10.8661, "step": 103 }, { "epoch": 0.8387096774193549, "grad_norm": 16.99560546875, "learning_rate": 9.580262233726998e-08, "loss": 0.4624, "step": 104 }, { "epoch": 0.8467741935483871, "grad_norm": 81.22891235351562, "learning_rate": 9.101249122040649e-08, "loss": 1.3276, "step": 105 }, { "epoch": 0.8548387096774194, "grad_norm": 139.22935485839844, "learning_rate": 8.6222360103543e-08, "loss": 4.7111, "step": 106 }, { "epoch": 0.8629032258064516, "grad_norm": 101.03129577636719, "learning_rate": 8.143222898667949e-08, "loss": 1.8655, "step": 107 }, { "epoch": 0.8709677419354839, "grad_norm": 84.7345199584961, "learning_rate": 7.664209786981599e-08, "loss": 1.102, "step": 108 }, { "epoch": 0.8790322580645161, "grad_norm": 139.8319091796875, "learning_rate": 7.185196675295249e-08, "loss": 1.837, "step": 109 }, { "epoch": 0.8870967741935484, "grad_norm": 62.02327346801758, "learning_rate": 6.706183563608899e-08, "loss": 0.976, "step": 110 }, { "epoch": 0.8951612903225806, "grad_norm": 280.1962890625, "learning_rate": 6.22717045192255e-08, "loss": 5.7555, "step": 111 }, { "epoch": 0.9032258064516129, "grad_norm": 26.91400718688965, "learning_rate": 5.748157340236199e-08, "loss": 1.115, "step": 112 }, { "epoch": 0.9112903225806451, "grad_norm": 68.3736801147461, "learning_rate": 5.26914422854985e-08, "loss": 0.582, "step": 113 }, { "epoch": 0.9193548387096774, "grad_norm": 28.068424224853516, "learning_rate": 4.790131116863499e-08, "loss": 1.0568, "step": 114 }, { "epoch": 0.9274193548387096, "grad_norm": 186.28965759277344, "learning_rate": 4.31111800517715e-08, "loss": 6.9815, "step": 115 }, { "epoch": 0.9354838709677419, "grad_norm": 76.79621124267578, "learning_rate": 3.8321048934907995e-08, "loss": 2.1168, "step": 116 }, { "epoch": 0.9435483870967742, "grad_norm": 115.89000701904297, "learning_rate": 3.3530917818044496e-08, "loss": 1.4588, "step": 117 }, { "epoch": 0.9516129032258065, "grad_norm": 286.4219665527344, "learning_rate": 2.8740786701180994e-08, "loss": 13.7741, "step": 118 }, { "epoch": 0.9596774193548387, "grad_norm": 75.13905334472656, "learning_rate": 2.3950655584317496e-08, "loss": 1.2502, "step": 119 }, { "epoch": 0.967741935483871, "grad_norm": 68.71749877929688, "learning_rate": 1.9160524467453997e-08, "loss": 1.7878, "step": 120 }, { "epoch": 0.9758064516129032, "grad_norm": 185.82862854003906, "learning_rate": 1.4370393350590497e-08, "loss": 1.735, "step": 121 }, { "epoch": 0.9838709677419355, "grad_norm": 36.008094787597656, "learning_rate": 9.580262233726999e-09, "loss": 0.4565, "step": 122 }, { "epoch": 0.9919354838709677, "grad_norm": 82.23770904541016, "learning_rate": 4.790131116863499e-09, "loss": 0.872, "step": 123 }, { "epoch": 1.0, "grad_norm": 0.47375962138175964, "learning_rate": 0.0, "loss": 0.0, "step": 124 }, { "epoch": 1.0, "eval_loss": 3.4499640464782715, "eval_mae": 1.183539628982544, "eval_mse": 3.4499640464782715, "eval_r2": 0.05545985698699951, "eval_rmse": 1.8574078837127486, "eval_runtime": 1.3199, "eval_samples_per_second": 41.669, "eval_smape": 46.03448510169983, "eval_steps_per_second": 10.607, "step": 124 } ], "logging_steps": 1, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 671969813753856.0, "train_batch_size": 4, "trial_name": null, "trial_params": { "learning_rate": 5.939762584910739e-07 } }