{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4186289900575615,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020931449502878076,
      "grad_norm": 2.813615083694458,
      "learning_rate": 5e-05,
      "loss": 3.0406,
      "step": 1
    },
    {
      "epoch": 0.004186289900575615,
      "grad_norm": 2.6805036067962646,
      "learning_rate": 0.0001,
      "loss": 2.9062,
      "step": 2
    },
    {
      "epoch": 0.006279434850863423,
      "grad_norm": 2.032153367996216,
      "learning_rate": 0.00015,
      "loss": 2.6258,
      "step": 3
    },
    {
      "epoch": 0.00837257980115123,
      "grad_norm": 1.7369762659072876,
      "learning_rate": 0.0002,
      "loss": 2.4318,
      "step": 4
    },
    {
      "epoch": 0.010465724751439037,
      "grad_norm": 1.5764646530151367,
      "learning_rate": 0.00025,
      "loss": 2.1646,
      "step": 5
    },
    {
      "epoch": 0.012558869701726845,
      "grad_norm": 1.1295244693756104,
      "learning_rate": 0.0003,
      "loss": 2.0415,
      "step": 6
    },
    {
      "epoch": 0.014652014652014652,
      "grad_norm": 0.9872271418571472,
      "learning_rate": 0.00035,
      "loss": 2.0001,
      "step": 7
    },
    {
      "epoch": 0.01674515960230246,
      "grad_norm": 1.6292792558670044,
      "learning_rate": 0.0004,
      "loss": 1.9348,
      "step": 8
    },
    {
      "epoch": 0.018838304552590265,
      "grad_norm": 0.6931092143058777,
      "learning_rate": 0.00045000000000000004,
      "loss": 1.8677,
      "step": 9
    },
    {
      "epoch": 0.020931449502878074,
      "grad_norm": 0.5845860242843628,
      "learning_rate": 0.0005,
      "loss": 1.8057,
      "step": 10
    },
    {
      "epoch": 0.023024594453165882,
      "grad_norm": 0.6705211997032166,
      "learning_rate": 0.0004989293361884369,
      "loss": 1.7457,
      "step": 11
    },
    {
      "epoch": 0.02511773940345369,
      "grad_norm": 0.6003366112709045,
      "learning_rate": 0.0004978586723768737,
      "loss": 1.7477,
      "step": 12
    },
    {
      "epoch": 0.027210884353741496,
      "grad_norm": 0.764362633228302,
      "learning_rate": 0.0004967880085653105,
      "loss": 1.6971,
      "step": 13
    },
    {
      "epoch": 0.029304029304029304,
      "grad_norm": 0.9480692148208618,
      "learning_rate": 0.0004957173447537474,
      "loss": 1.7387,
      "step": 14
    },
    {
      "epoch": 0.03139717425431711,
      "grad_norm": 0.8233430981636047,
      "learning_rate": 0.0004946466809421842,
      "loss": 1.679,
      "step": 15
    },
    {
      "epoch": 0.03349031920460492,
      "grad_norm": 0.4710249602794647,
      "learning_rate": 0.000493576017130621,
      "loss": 1.687,
      "step": 16
    },
    {
      "epoch": 0.035583464154892726,
      "grad_norm": 0.5001205801963806,
      "learning_rate": 0.0004925053533190578,
      "loss": 1.7042,
      "step": 17
    },
    {
      "epoch": 0.03767660910518053,
      "grad_norm": 0.37574276328086853,
      "learning_rate": 0.0004914346895074946,
      "loss": 1.6976,
      "step": 18
    },
    {
      "epoch": 0.03976975405546834,
      "grad_norm": 0.4445561170578003,
      "learning_rate": 0.0004903640256959315,
      "loss": 1.711,
      "step": 19
    },
    {
      "epoch": 0.04186289900575615,
      "grad_norm": 0.32608023285865784,
      "learning_rate": 0.0004892933618843683,
      "loss": 1.6491,
      "step": 20
    },
    {
      "epoch": 0.04395604395604396,
      "grad_norm": 0.34311097860336304,
      "learning_rate": 0.0004882226980728052,
      "loss": 1.6588,
      "step": 21
    },
    {
      "epoch": 0.046049188906331764,
      "grad_norm": 0.31974583864212036,
      "learning_rate": 0.000487152034261242,
      "loss": 1.6907,
      "step": 22
    },
    {
      "epoch": 0.04814233385661957,
      "grad_norm": 0.3181461989879608,
      "learning_rate": 0.0004860813704496788,
      "loss": 1.6776,
      "step": 23
    },
    {
      "epoch": 0.05023547880690738,
      "grad_norm": 0.31020134687423706,
      "learning_rate": 0.00048501070663811566,
      "loss": 1.6665,
      "step": 24
    },
    {
      "epoch": 0.052328623757195186,
      "grad_norm": 0.3187050521373749,
      "learning_rate": 0.00048394004282655245,
      "loss": 1.647,
      "step": 25
    },
    {
      "epoch": 0.05442176870748299,
      "grad_norm": 0.3272475600242615,
      "learning_rate": 0.0004828693790149893,
      "loss": 1.6769,
      "step": 26
    },
    {
      "epoch": 0.0565149136577708,
      "grad_norm": 0.29509666562080383,
      "learning_rate": 0.00048179871520342613,
      "loss": 1.6292,
      "step": 27
    },
    {
      "epoch": 0.05860805860805861,
      "grad_norm": 0.29900938272476196,
      "learning_rate": 0.00048072805139186297,
      "loss": 1.6318,
      "step": 28
    },
    {
      "epoch": 0.06070120355834641,
      "grad_norm": 0.3012602627277374,
      "learning_rate": 0.0004796573875802998,
      "loss": 1.5997,
      "step": 29
    },
    {
      "epoch": 0.06279434850863422,
      "grad_norm": 0.3533616065979004,
      "learning_rate": 0.00047858672376873665,
      "loss": 1.632,
      "step": 30
    },
    {
      "epoch": 0.06488749345892203,
      "grad_norm": 0.2721816599369049,
      "learning_rate": 0.00047751605995717344,
      "loss": 1.6432,
      "step": 31
    },
    {
      "epoch": 0.06698063840920984,
      "grad_norm": 0.29362842440605164,
      "learning_rate": 0.0004764453961456103,
      "loss": 1.6608,
      "step": 32
    },
    {
      "epoch": 0.06907378335949764,
      "grad_norm": 0.27665096521377563,
      "learning_rate": 0.0004753747323340471,
      "loss": 1.6286,
      "step": 33
    },
    {
      "epoch": 0.07116692830978545,
      "grad_norm": 0.28791311383247375,
      "learning_rate": 0.0004743040685224839,
      "loss": 1.6093,
      "step": 34
    },
    {
      "epoch": 0.07326007326007326,
      "grad_norm": 0.31565895676612854,
      "learning_rate": 0.0004732334047109208,
      "loss": 1.6672,
      "step": 35
    },
    {
      "epoch": 0.07535321821036106,
      "grad_norm": 0.26670706272125244,
      "learning_rate": 0.00047216274089935764,
      "loss": 1.6251,
      "step": 36
    },
    {
      "epoch": 0.07744636316064887,
      "grad_norm": 0.2804130017757416,
      "learning_rate": 0.0004710920770877944,
      "loss": 1.5653,
      "step": 37
    },
    {
      "epoch": 0.07953950811093669,
      "grad_norm": 0.27214744687080383,
      "learning_rate": 0.00047002141327623126,
      "loss": 1.5726,
      "step": 38
    },
    {
      "epoch": 0.08163265306122448,
      "grad_norm": 0.28986403346061707,
      "learning_rate": 0.0004689507494646681,
      "loss": 1.601,
      "step": 39
    },
    {
      "epoch": 0.0837257980115123,
      "grad_norm": 0.3080230951309204,
      "learning_rate": 0.0004678800856531049,
      "loss": 1.5939,
      "step": 40
    },
    {
      "epoch": 0.08581894296180011,
      "grad_norm": 0.2734631597995758,
      "learning_rate": 0.0004668094218415418,
      "loss": 1.5951,
      "step": 41
    },
    {
      "epoch": 0.08791208791208792,
      "grad_norm": 0.28978678584098816,
      "learning_rate": 0.0004657387580299786,
      "loss": 1.6146,
      "step": 42
    },
    {
      "epoch": 0.09000523286237572,
      "grad_norm": 0.27776286005973816,
      "learning_rate": 0.00046466809421841546,
      "loss": 1.6194,
      "step": 43
    },
    {
      "epoch": 0.09209837781266353,
      "grad_norm": 0.2763765752315521,
      "learning_rate": 0.00046359743040685225,
      "loss": 1.555,
      "step": 44
    },
    {
      "epoch": 0.09419152276295134,
      "grad_norm": 0.28580474853515625,
      "learning_rate": 0.0004625267665952891,
      "loss": 1.5959,
      "step": 45
    },
    {
      "epoch": 0.09628466771323914,
      "grad_norm": 0.27958357334136963,
      "learning_rate": 0.00046145610278372593,
      "loss": 1.5754,
      "step": 46
    },
    {
      "epoch": 0.09837781266352695,
      "grad_norm": 0.2925872504711151,
      "learning_rate": 0.0004603854389721627,
      "loss": 1.656,
      "step": 47
    },
    {
      "epoch": 0.10047095761381476,
      "grad_norm": 0.27339980006217957,
      "learning_rate": 0.0004593147751605996,
      "loss": 1.6213,
      "step": 48
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 0.2932608127593994,
      "learning_rate": 0.00045824411134903645,
      "loss": 1.5749,
      "step": 49
    },
    {
      "epoch": 0.10465724751439037,
      "grad_norm": 0.26638683676719666,
      "learning_rate": 0.00045717344753747323,
      "loss": 1.5951,
      "step": 50
    },
    {
      "epoch": 0.10675039246467818,
      "grad_norm": 0.28242963552474976,
      "learning_rate": 0.0004561027837259101,
      "loss": 1.5676,
      "step": 51
    },
    {
      "epoch": 0.10884353741496598,
      "grad_norm": 0.2681107819080353,
      "learning_rate": 0.0004550321199143469,
      "loss": 1.5513,
      "step": 52
    },
    {
      "epoch": 0.1109366823652538,
      "grad_norm": 0.27786529064178467,
      "learning_rate": 0.0004539614561027837,
      "loss": 1.4974,
      "step": 53
    },
    {
      "epoch": 0.1130298273155416,
      "grad_norm": 0.2654118835926056,
      "learning_rate": 0.00045289079229122054,
      "loss": 1.6262,
      "step": 54
    },
    {
      "epoch": 0.1151229722658294,
      "grad_norm": 0.30704954266548157,
      "learning_rate": 0.00045182012847965744,
      "loss": 1.564,
      "step": 55
    },
    {
      "epoch": 0.11721611721611722,
      "grad_norm": 0.27236270904541016,
      "learning_rate": 0.0004507494646680942,
      "loss": 1.6199,
      "step": 56
    },
    {
      "epoch": 0.11930926216640503,
      "grad_norm": 0.2929720878601074,
      "learning_rate": 0.00044967880085653106,
      "loss": 1.6008,
      "step": 57
    },
    {
      "epoch": 0.12140240711669283,
      "grad_norm": 0.28251537680625916,
      "learning_rate": 0.0004486081370449679,
      "loss": 1.5103,
      "step": 58
    },
    {
      "epoch": 0.12349555206698064,
      "grad_norm": 0.27533096075057983,
      "learning_rate": 0.0004475374732334047,
      "loss": 1.5274,
      "step": 59
    },
    {
      "epoch": 0.12558869701726844,
      "grad_norm": 0.30340835452079773,
      "learning_rate": 0.00044646680942184153,
      "loss": 1.5842,
      "step": 60
    },
    {
      "epoch": 0.12768184196755625,
      "grad_norm": 0.2956872880458832,
      "learning_rate": 0.00044539614561027837,
      "loss": 1.5504,
      "step": 61
    },
    {
      "epoch": 0.12977498691784406,
      "grad_norm": 0.2717457413673401,
      "learning_rate": 0.00044432548179871526,
      "loss": 1.5728,
      "step": 62
    },
    {
      "epoch": 0.13186813186813187,
      "grad_norm": 0.27269890904426575,
      "learning_rate": 0.00044325481798715205,
      "loss": 1.5891,
      "step": 63
    },
    {
      "epoch": 0.13396127681841968,
      "grad_norm": 0.294362872838974,
      "learning_rate": 0.0004421841541755889,
      "loss": 1.5494,
      "step": 64
    },
    {
      "epoch": 0.1360544217687075,
      "grad_norm": 0.3467015326023102,
      "learning_rate": 0.00044111349036402573,
      "loss": 1.6157,
      "step": 65
    },
    {
      "epoch": 0.13814756671899528,
      "grad_norm": 0.26985207200050354,
      "learning_rate": 0.0004400428265524625,
      "loss": 1.5489,
      "step": 66
    },
    {
      "epoch": 0.1402407116692831,
      "grad_norm": 0.30386754870414734,
      "learning_rate": 0.00043897216274089935,
      "loss": 1.547,
      "step": 67
    },
    {
      "epoch": 0.1423338566195709,
      "grad_norm": 0.2737506330013275,
      "learning_rate": 0.0004379014989293362,
      "loss": 1.541,
      "step": 68
    },
    {
      "epoch": 0.14442700156985872,
      "grad_norm": 0.2916475832462311,
      "learning_rate": 0.00043683083511777303,
      "loss": 1.6008,
      "step": 69
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 0.27791959047317505,
      "learning_rate": 0.0004357601713062099,
      "loss": 1.6368,
      "step": 70
    },
    {
      "epoch": 0.14861329147043434,
      "grad_norm": 0.2925644516944885,
      "learning_rate": 0.0004346895074946467,
      "loss": 1.5606,
      "step": 71
    },
    {
      "epoch": 0.15070643642072212,
      "grad_norm": 0.2825354039669037,
      "learning_rate": 0.0004336188436830835,
      "loss": 1.5759,
      "step": 72
    },
    {
      "epoch": 0.15279958137100993,
      "grad_norm": 0.27884945273399353,
      "learning_rate": 0.00043254817987152034,
      "loss": 1.624,
      "step": 73
    },
    {
      "epoch": 0.15489272632129775,
      "grad_norm": 0.29608336091041565,
      "learning_rate": 0.0004314775160599572,
      "loss": 1.5619,
      "step": 74
    },
    {
      "epoch": 0.15698587127158556,
      "grad_norm": 0.2830757200717926,
      "learning_rate": 0.00043040685224839397,
      "loss": 1.6233,
      "step": 75
    },
    {
      "epoch": 0.15907901622187337,
      "grad_norm": 0.3144885301589966,
      "learning_rate": 0.00042933618843683086,
      "loss": 1.567,
      "step": 76
    },
    {
      "epoch": 0.16117216117216118,
      "grad_norm": 0.29038679599761963,
      "learning_rate": 0.0004282655246252677,
      "loss": 1.5317,
      "step": 77
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.34527644515037537,
      "learning_rate": 0.0004271948608137045,
      "loss": 1.5485,
      "step": 78
    },
    {
      "epoch": 0.16535845107273678,
      "grad_norm": 0.2915840446949005,
      "learning_rate": 0.0004261241970021413,
      "loss": 1.5792,
      "step": 79
    },
    {
      "epoch": 0.1674515960230246,
      "grad_norm": 0.30239176750183105,
      "learning_rate": 0.00042505353319057817,
      "loss": 1.5533,
      "step": 80
    },
    {
      "epoch": 0.1695447409733124,
      "grad_norm": 0.28941529989242554,
      "learning_rate": 0.00042398286937901495,
      "loss": 1.5635,
      "step": 81
    },
    {
      "epoch": 0.17163788592360021,
      "grad_norm": 0.27628207206726074,
      "learning_rate": 0.0004229122055674518,
      "loss": 1.5542,
      "step": 82
    },
    {
      "epoch": 0.17373103087388803,
      "grad_norm": 0.28659799695014954,
      "learning_rate": 0.0004218415417558887,
      "loss": 1.5969,
      "step": 83
    },
    {
      "epoch": 0.17582417582417584,
      "grad_norm": 0.2995677888393402,
      "learning_rate": 0.00042077087794432553,
      "loss": 1.5339,
      "step": 84
    },
    {
      "epoch": 0.17791732077446362,
      "grad_norm": 0.28352785110473633,
      "learning_rate": 0.0004197002141327623,
      "loss": 1.5739,
      "step": 85
    },
    {
      "epoch": 0.18001046572475143,
      "grad_norm": 0.296410471200943,
      "learning_rate": 0.00041862955032119915,
      "loss": 1.5432,
      "step": 86
    },
    {
      "epoch": 0.18210361067503925,
      "grad_norm": 0.3075838088989258,
      "learning_rate": 0.000417558886509636,
      "loss": 1.5749,
      "step": 87
    },
    {
      "epoch": 0.18419675562532706,
      "grad_norm": 0.29746511578559875,
      "learning_rate": 0.0004164882226980728,
      "loss": 1.5509,
      "step": 88
    },
    {
      "epoch": 0.18628990057561487,
      "grad_norm": 0.3004538118839264,
      "learning_rate": 0.0004154175588865097,
      "loss": 1.5732,
      "step": 89
    },
    {
      "epoch": 0.18838304552590268,
      "grad_norm": 0.287615031003952,
      "learning_rate": 0.0004143468950749465,
      "loss": 1.5482,
      "step": 90
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.2988753020763397,
      "learning_rate": 0.0004132762312633833,
      "loss": 1.5656,
      "step": 91
    },
    {
      "epoch": 0.19256933542647828,
      "grad_norm": 0.29836592078208923,
      "learning_rate": 0.00041220556745182014,
      "loss": 1.5759,
      "step": 92
    },
    {
      "epoch": 0.1946624803767661,
      "grad_norm": 0.3167785406112671,
      "learning_rate": 0.000411134903640257,
      "loss": 1.5362,
      "step": 93
    },
    {
      "epoch": 0.1967556253270539,
      "grad_norm": 0.27747228741645813,
      "learning_rate": 0.00041006423982869377,
      "loss": 1.5212,
      "step": 94
    },
    {
      "epoch": 0.1988487702773417,
      "grad_norm": 0.30162835121154785,
      "learning_rate": 0.0004089935760171306,
      "loss": 1.5362,
      "step": 95
    },
    {
      "epoch": 0.20094191522762953,
      "grad_norm": 0.28325414657592773,
      "learning_rate": 0.0004079229122055675,
      "loss": 1.4925,
      "step": 96
    },
    {
      "epoch": 0.2030350601779173,
      "grad_norm": 0.28862977027893066,
      "learning_rate": 0.0004068522483940043,
      "loss": 1.5731,
      "step": 97
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.3084706962108612,
      "learning_rate": 0.0004057815845824411,
      "loss": 1.5632,
      "step": 98
    },
    {
      "epoch": 0.20722135007849293,
      "grad_norm": 0.29341885447502136,
      "learning_rate": 0.00040471092077087797,
      "loss": 1.5436,
      "step": 99
    },
    {
      "epoch": 0.20931449502878074,
      "grad_norm": 0.3143270015716553,
      "learning_rate": 0.00040364025695931475,
      "loss": 1.5839,
      "step": 100
    },
    {
      "epoch": 0.21140763997906856,
      "grad_norm": 0.30276885628700256,
      "learning_rate": 0.0004025695931477516,
      "loss": 1.6037,
      "step": 101
    },
    {
      "epoch": 0.21350078492935637,
      "grad_norm": 0.31467488408088684,
      "learning_rate": 0.00040149892933618843,
      "loss": 1.5227,
      "step": 102
    },
    {
      "epoch": 0.21559392987964415,
      "grad_norm": 0.28636667132377625,
      "learning_rate": 0.0004004282655246253,
      "loss": 1.5022,
      "step": 103
    },
    {
      "epoch": 0.21768707482993196,
      "grad_norm": 0.28783223032951355,
      "learning_rate": 0.0003993576017130621,
      "loss": 1.5432,
      "step": 104
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 0.3006027042865753,
      "learning_rate": 0.00039828693790149895,
      "loss": 1.5305,
      "step": 105
    },
    {
      "epoch": 0.2218733647305076,
      "grad_norm": 0.3113887310028076,
      "learning_rate": 0.0003972162740899358,
      "loss": 1.4946,
      "step": 106
    },
    {
      "epoch": 0.2239665096807954,
      "grad_norm": 0.3233683109283447,
      "learning_rate": 0.0003961456102783726,
      "loss": 1.5176,
      "step": 107
    },
    {
      "epoch": 0.2260596546310832,
      "grad_norm": 0.29007241129875183,
      "learning_rate": 0.0003950749464668094,
      "loss": 1.5661,
      "step": 108
    },
    {
      "epoch": 0.228152799581371,
      "grad_norm": 0.3270627558231354,
      "learning_rate": 0.00039400428265524626,
      "loss": 1.5414,
      "step": 109
    },
    {
      "epoch": 0.2302459445316588,
      "grad_norm": 0.2789075970649719,
      "learning_rate": 0.0003929336188436831,
      "loss": 1.5324,
      "step": 110
    },
    {
      "epoch": 0.23233908948194662,
      "grad_norm": 0.3245764672756195,
      "learning_rate": 0.00039186295503211994,
      "loss": 1.5541,
      "step": 111
    },
    {
      "epoch": 0.23443223443223443,
      "grad_norm": 0.28058871626853943,
      "learning_rate": 0.0003907922912205568,
      "loss": 1.5398,
      "step": 112
    },
    {
      "epoch": 0.23652537938252224,
      "grad_norm": 0.29182901978492737,
      "learning_rate": 0.00038972162740899356,
      "loss": 1.4795,
      "step": 113
    },
    {
      "epoch": 0.23861852433281006,
      "grad_norm": 0.3088870346546173,
      "learning_rate": 0.0003886509635974304,
      "loss": 1.5326,
      "step": 114
    },
    {
      "epoch": 0.24071166928309787,
      "grad_norm": 0.3134807050228119,
      "learning_rate": 0.00038758029978586725,
      "loss": 1.5346,
      "step": 115
    },
    {
      "epoch": 0.24280481423338565,
      "grad_norm": 0.30298101902008057,
      "learning_rate": 0.00038650963597430403,
      "loss": 1.5134,
      "step": 116
    },
    {
      "epoch": 0.24489795918367346,
      "grad_norm": 0.2860242426395416,
      "learning_rate": 0.0003854389721627409,
      "loss": 1.5306,
      "step": 117
    },
    {
      "epoch": 0.24699110413396128,
      "grad_norm": 0.2905466556549072,
      "learning_rate": 0.00038436830835117777,
      "loss": 1.5234,
      "step": 118
    },
    {
      "epoch": 0.2490842490842491,
      "grad_norm": 0.28561463952064514,
      "learning_rate": 0.00038329764453961455,
      "loss": 1.4752,
      "step": 119
    },
    {
      "epoch": 0.25117739403453687,
      "grad_norm": 0.2877683639526367,
      "learning_rate": 0.0003822269807280514,
      "loss": 1.598,
      "step": 120
    },
    {
      "epoch": 0.2532705389848247,
      "grad_norm": 0.2815863788127899,
      "learning_rate": 0.00038115631691648823,
      "loss": 1.4643,
      "step": 121
    },
    {
      "epoch": 0.2553636839351125,
      "grad_norm": 0.28134405612945557,
      "learning_rate": 0.000380085653104925,
      "loss": 1.5037,
      "step": 122
    },
    {
      "epoch": 0.25745682888540034,
      "grad_norm": 0.2940825819969177,
      "learning_rate": 0.00037901498929336186,
      "loss": 1.4963,
      "step": 123
    },
    {
      "epoch": 0.2595499738356881,
      "grad_norm": 0.28303319215774536,
      "learning_rate": 0.00037794432548179875,
      "loss": 1.4708,
      "step": 124
    },
    {
      "epoch": 0.2616431187859759,
      "grad_norm": 0.3112112581729889,
      "learning_rate": 0.0003768736616702356,
      "loss": 1.4981,
      "step": 125
    },
    {
      "epoch": 0.26373626373626374,
      "grad_norm": 0.2902218997478485,
      "learning_rate": 0.0003758029978586724,
      "loss": 1.4654,
      "step": 126
    },
    {
      "epoch": 0.2658294086865515,
      "grad_norm": 0.31582140922546387,
      "learning_rate": 0.0003747323340471092,
      "loss": 1.551,
      "step": 127
    },
    {
      "epoch": 0.26792255363683937,
      "grad_norm": 0.28578075766563416,
      "learning_rate": 0.00037366167023554606,
      "loss": 1.5018,
      "step": 128
    },
    {
      "epoch": 0.27001569858712715,
      "grad_norm": 0.33017498254776,
      "learning_rate": 0.00037259100642398284,
      "loss": 1.5177,
      "step": 129
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 0.2954592704772949,
      "learning_rate": 0.0003715203426124197,
      "loss": 1.5126,
      "step": 130
    },
    {
      "epoch": 0.2742019884877028,
      "grad_norm": 0.34393608570098877,
      "learning_rate": 0.0003704496788008566,
      "loss": 1.5061,
      "step": 131
    },
    {
      "epoch": 0.27629513343799056,
      "grad_norm": 0.3111407160758972,
      "learning_rate": 0.00036937901498929336,
      "loss": 1.5599,
      "step": 132
    },
    {
      "epoch": 0.2783882783882784,
      "grad_norm": 0.2914719581604004,
      "learning_rate": 0.0003683083511777302,
      "loss": 1.5165,
      "step": 133
    },
    {
      "epoch": 0.2804814233385662,
      "grad_norm": 0.29973331093788147,
      "learning_rate": 0.00036723768736616704,
      "loss": 1.4669,
      "step": 134
    },
    {
      "epoch": 0.282574568288854,
      "grad_norm": 0.3091066777706146,
      "learning_rate": 0.00036616702355460383,
      "loss": 1.5048,
      "step": 135
    },
    {
      "epoch": 0.2846677132391418,
      "grad_norm": 0.31531593203544617,
      "learning_rate": 0.00036509635974304067,
      "loss": 1.5086,
      "step": 136
    },
    {
      "epoch": 0.2867608581894296,
      "grad_norm": 0.29967445135116577,
      "learning_rate": 0.0003640256959314775,
      "loss": 1.546,
      "step": 137
    },
    {
      "epoch": 0.28885400313971743,
      "grad_norm": 0.33331945538520813,
      "learning_rate": 0.00036295503211991435,
      "loss": 1.4824,
      "step": 138
    },
    {
      "epoch": 0.2909471480900052,
      "grad_norm": 0.3064332902431488,
      "learning_rate": 0.0003618843683083512,
      "loss": 1.573,
      "step": 139
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 0.3276407718658447,
      "learning_rate": 0.00036081370449678803,
      "loss": 1.5517,
      "step": 140
    },
    {
      "epoch": 0.29513343799058084,
      "grad_norm": 0.2774730622768402,
      "learning_rate": 0.0003597430406852248,
      "loss": 1.4853,
      "step": 141
    },
    {
      "epoch": 0.2972265829408687,
      "grad_norm": 0.33176928758621216,
      "learning_rate": 0.00035867237687366166,
      "loss": 1.5346,
      "step": 142
    },
    {
      "epoch": 0.29931972789115646,
      "grad_norm": 0.29119884967803955,
      "learning_rate": 0.0003576017130620985,
      "loss": 1.4962,
      "step": 143
    },
    {
      "epoch": 0.30141287284144425,
      "grad_norm": 0.2862621545791626,
      "learning_rate": 0.0003565310492505354,
      "loss": 1.511,
      "step": 144
    },
    {
      "epoch": 0.3035060177917321,
      "grad_norm": 0.31120261549949646,
      "learning_rate": 0.0003554603854389722,
      "loss": 1.5114,
      "step": 145
    },
    {
      "epoch": 0.30559916274201987,
      "grad_norm": 0.3067992329597473,
      "learning_rate": 0.000354389721627409,
      "loss": 1.4757,
      "step": 146
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.30063048005104065,
      "learning_rate": 0.00035331905781584586,
      "loss": 1.5617,
      "step": 147
    },
    {
      "epoch": 0.3097854526425955,
      "grad_norm": 0.3026478588581085,
      "learning_rate": 0.00035224839400428264,
      "loss": 1.446,
      "step": 148
    },
    {
      "epoch": 0.31187859759288333,
      "grad_norm": 0.2846631705760956,
      "learning_rate": 0.0003511777301927195,
      "loss": 1.4821,
      "step": 149
    },
    {
      "epoch": 0.3139717425431711,
      "grad_norm": 0.3027445673942566,
      "learning_rate": 0.0003501070663811563,
      "loss": 1.5073,
      "step": 150
    },
    {
      "epoch": 0.3160648874934589,
      "grad_norm": 0.29627394676208496,
      "learning_rate": 0.00034903640256959316,
      "loss": 1.496,
      "step": 151
    },
    {
      "epoch": 0.31815803244374674,
      "grad_norm": 0.3188508450984955,
      "learning_rate": 0.00034796573875803,
      "loss": 1.5395,
      "step": 152
    },
    {
      "epoch": 0.3202511773940345,
      "grad_norm": 0.29025983810424805,
      "learning_rate": 0.00034689507494646684,
      "loss": 1.5177,
      "step": 153
    },
    {
      "epoch": 0.32234432234432236,
      "grad_norm": 0.31973665952682495,
      "learning_rate": 0.00034582441113490363,
      "loss": 1.4774,
      "step": 154
    },
    {
      "epoch": 0.32443746729461015,
      "grad_norm": 0.29737603664398193,
      "learning_rate": 0.00034475374732334047,
      "loss": 1.5141,
      "step": 155
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.31660401821136475,
      "learning_rate": 0.0003436830835117773,
      "loss": 1.4917,
      "step": 156
    },
    {
      "epoch": 0.3286237571951858,
      "grad_norm": 0.29376548528671265,
      "learning_rate": 0.0003426124197002141,
      "loss": 1.4832,
      "step": 157
    },
    {
      "epoch": 0.33071690214547356,
      "grad_norm": 0.3015284836292267,
      "learning_rate": 0.000341541755888651,
      "loss": 1.5408,
      "step": 158
    },
    {
      "epoch": 0.3328100470957614,
      "grad_norm": 0.3028362989425659,
      "learning_rate": 0.00034047109207708783,
      "loss": 1.5326,
      "step": 159
    },
    {
      "epoch": 0.3349031920460492,
      "grad_norm": 0.292458176612854,
      "learning_rate": 0.0003394004282655246,
      "loss": 1.5642,
      "step": 160
    },
    {
      "epoch": 0.336996336996337,
      "grad_norm": 0.29941415786743164,
      "learning_rate": 0.00033832976445396146,
      "loss": 1.5114,
      "step": 161
    },
    {
      "epoch": 0.3390894819466248,
      "grad_norm": 0.2882905602455139,
      "learning_rate": 0.0003372591006423983,
      "loss": 1.5531,
      "step": 162
    },
    {
      "epoch": 0.3411826268969126,
      "grad_norm": 0.294680655002594,
      "learning_rate": 0.00033618843683083514,
      "loss": 1.4926,
      "step": 163
    },
    {
      "epoch": 0.34327577184720043,
      "grad_norm": 0.3013262152671814,
      "learning_rate": 0.0003351177730192719,
      "loss": 1.5403,
      "step": 164
    },
    {
      "epoch": 0.3453689167974882,
      "grad_norm": 0.29358139634132385,
      "learning_rate": 0.0003340471092077088,
      "loss": 1.503,
      "step": 165
    },
    {
      "epoch": 0.34746206174777605,
      "grad_norm": 0.2902645766735077,
      "learning_rate": 0.00033297644539614566,
      "loss": 1.5107,
      "step": 166
    },
    {
      "epoch": 0.34955520669806384,
      "grad_norm": 0.2952733635902405,
      "learning_rate": 0.00033190578158458244,
      "loss": 1.525,
      "step": 167
    },
    {
      "epoch": 0.3516483516483517,
      "grad_norm": 0.29654550552368164,
      "learning_rate": 0.0003308351177730193,
      "loss": 1.4989,
      "step": 168
    },
    {
      "epoch": 0.35374149659863946,
      "grad_norm": 0.28225046396255493,
      "learning_rate": 0.0003297644539614561,
      "loss": 1.46,
      "step": 169
    },
    {
      "epoch": 0.35583464154892724,
      "grad_norm": 0.30447182059288025,
      "learning_rate": 0.0003286937901498929,
      "loss": 1.5389,
      "step": 170
    },
    {
      "epoch": 0.3579277864992151,
      "grad_norm": 0.29335105419158936,
      "learning_rate": 0.00032762312633832975,
      "loss": 1.4962,
      "step": 171
    },
    {
      "epoch": 0.36002093144950287,
      "grad_norm": 0.3066118061542511,
      "learning_rate": 0.00032655246252676664,
      "loss": 1.5122,
      "step": 172
    },
    {
      "epoch": 0.3621140763997907,
      "grad_norm": 0.3051617443561554,
      "learning_rate": 0.00032548179871520343,
      "loss": 1.4534,
      "step": 173
    },
    {
      "epoch": 0.3642072213500785,
      "grad_norm": 0.3032102584838867,
      "learning_rate": 0.00032441113490364027,
      "loss": 1.4937,
      "step": 174
    },
    {
      "epoch": 0.3663003663003663,
      "grad_norm": 0.28743627667427063,
      "learning_rate": 0.0003233404710920771,
      "loss": 1.5288,
      "step": 175
    },
    {
      "epoch": 0.3683935112506541,
      "grad_norm": 0.2957185208797455,
      "learning_rate": 0.0003222698072805139,
      "loss": 1.4617,
      "step": 176
    },
    {
      "epoch": 0.3704866562009419,
      "grad_norm": 0.30057474970817566,
      "learning_rate": 0.00032119914346895073,
      "loss": 1.5141,
      "step": 177
    },
    {
      "epoch": 0.37257980115122974,
      "grad_norm": 0.3146776556968689,
      "learning_rate": 0.0003201284796573876,
      "loss": 1.5033,
      "step": 178
    },
    {
      "epoch": 0.3746729461015175,
      "grad_norm": 0.3096458911895752,
      "learning_rate": 0.0003190578158458244,
      "loss": 1.5224,
      "step": 179
    },
    {
      "epoch": 0.37676609105180536,
      "grad_norm": 0.30057886242866516,
      "learning_rate": 0.00031798715203426126,
      "loss": 1.447,
      "step": 180
    },
    {
      "epoch": 0.37885923600209315,
      "grad_norm": 0.3033558428287506,
      "learning_rate": 0.0003169164882226981,
      "loss": 1.5301,
      "step": 181
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.31797683238983154,
      "learning_rate": 0.0003158458244111349,
      "loss": 1.467,
      "step": 182
    },
    {
      "epoch": 0.38304552590266877,
      "grad_norm": 0.28933098912239075,
      "learning_rate": 0.0003147751605995717,
      "loss": 1.4411,
      "step": 183
    },
    {
      "epoch": 0.38513867085295656,
      "grad_norm": 0.3075162470340729,
      "learning_rate": 0.00031370449678800856,
      "loss": 1.4969,
      "step": 184
    },
    {
      "epoch": 0.3872318158032444,
      "grad_norm": 0.2986271381378174,
      "learning_rate": 0.0003126338329764454,
      "loss": 1.4833,
      "step": 185
    },
    {
      "epoch": 0.3893249607535322,
      "grad_norm": 0.30813565850257874,
      "learning_rate": 0.00031156316916488224,
      "loss": 1.4931,
      "step": 186
    },
    {
      "epoch": 0.39141810570381996,
      "grad_norm": 0.30407366156578064,
      "learning_rate": 0.0003104925053533191,
      "loss": 1.5209,
      "step": 187
    },
    {
      "epoch": 0.3935112506541078,
      "grad_norm": 0.3050621747970581,
      "learning_rate": 0.0003094218415417559,
      "loss": 1.5341,
      "step": 188
    },
    {
      "epoch": 0.3956043956043956,
      "grad_norm": 0.29337289929389954,
      "learning_rate": 0.0003083511777301927,
      "loss": 1.4728,
      "step": 189
    },
    {
      "epoch": 0.3976975405546834,
      "grad_norm": 0.3019981384277344,
      "learning_rate": 0.00030728051391862955,
      "loss": 1.4761,
      "step": 190
    },
    {
      "epoch": 0.3997906855049712,
      "grad_norm": 0.30941662192344666,
      "learning_rate": 0.0003062098501070664,
      "loss": 1.5217,
      "step": 191
    },
    {
      "epoch": 0.40188383045525905,
      "grad_norm": 0.3021605312824249,
      "learning_rate": 0.00030513918629550323,
      "loss": 1.5251,
      "step": 192
    },
    {
      "epoch": 0.40397697540554683,
      "grad_norm": 0.30456283688545227,
      "learning_rate": 0.00030406852248394007,
      "loss": 1.5013,
      "step": 193
    },
    {
      "epoch": 0.4060701203558346,
      "grad_norm": 0.3142157793045044,
      "learning_rate": 0.0003029978586723769,
      "loss": 1.446,
      "step": 194
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 0.29888784885406494,
      "learning_rate": 0.0003019271948608137,
      "loss": 1.4682,
      "step": 195
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.29900768399238586,
      "learning_rate": 0.00030085653104925053,
      "loss": 1.4986,
      "step": 196
    },
    {
      "epoch": 0.4123495552066981,
      "grad_norm": 0.2862175405025482,
      "learning_rate": 0.0002997858672376874,
      "loss": 1.4253,
      "step": 197
    },
    {
      "epoch": 0.41444270015698587,
      "grad_norm": 0.323761910200119,
      "learning_rate": 0.00029871520342612416,
      "loss": 1.4583,
      "step": 198
    },
    {
      "epoch": 0.4165358451072737,
      "grad_norm": 0.2848580479621887,
      "learning_rate": 0.00029764453961456105,
      "loss": 1.4632,
      "step": 199
    },
    {
      "epoch": 0.4186289900575615,
      "grad_norm": 0.2909344434738159,
      "learning_rate": 0.0002965738758029979,
      "loss": 1.4738,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 477,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.592309546614784e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}