{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4186289900575615, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020931449502878076, "grad_norm": 2.813615083694458, "learning_rate": 5e-05, "loss": 3.0406, "step": 1 }, { "epoch": 0.004186289900575615, "grad_norm": 2.6805036067962646, "learning_rate": 0.0001, "loss": 2.9062, "step": 2 }, { "epoch": 0.006279434850863423, "grad_norm": 2.032153367996216, "learning_rate": 0.00015, "loss": 2.6258, "step": 3 }, { "epoch": 0.00837257980115123, "grad_norm": 1.7369762659072876, "learning_rate": 0.0002, "loss": 2.4318, "step": 4 }, { "epoch": 0.010465724751439037, "grad_norm": 1.5764646530151367, "learning_rate": 0.00025, "loss": 2.1646, "step": 5 }, { "epoch": 0.012558869701726845, "grad_norm": 1.1295244693756104, "learning_rate": 0.0003, "loss": 2.0415, "step": 6 }, { "epoch": 0.014652014652014652, "grad_norm": 0.9872271418571472, "learning_rate": 0.00035, "loss": 2.0001, "step": 7 }, { "epoch": 0.01674515960230246, "grad_norm": 1.6292792558670044, "learning_rate": 0.0004, "loss": 1.9348, "step": 8 }, { "epoch": 0.018838304552590265, "grad_norm": 0.6931092143058777, "learning_rate": 0.00045000000000000004, "loss": 1.8677, "step": 9 }, { "epoch": 0.020931449502878074, "grad_norm": 0.5845860242843628, "learning_rate": 0.0005, "loss": 1.8057, "step": 10 }, { "epoch": 0.023024594453165882, "grad_norm": 0.6705211997032166, "learning_rate": 0.0004989293361884369, "loss": 1.7457, "step": 11 }, { "epoch": 0.02511773940345369, "grad_norm": 0.6003366112709045, "learning_rate": 0.0004978586723768737, "loss": 1.7477, "step": 12 }, { "epoch": 0.027210884353741496, "grad_norm": 0.764362633228302, "learning_rate": 0.0004967880085653105, "loss": 1.6971, "step": 13 }, { "epoch": 0.029304029304029304, "grad_norm": 0.9480692148208618, "learning_rate": 0.0004957173447537474, "loss": 1.7387, "step": 14 }, { "epoch": 0.03139717425431711, "grad_norm": 0.8233430981636047, "learning_rate": 0.0004946466809421842, "loss": 1.679, "step": 15 }, { "epoch": 0.03349031920460492, "grad_norm": 0.4710249602794647, "learning_rate": 0.000493576017130621, "loss": 1.687, "step": 16 }, { "epoch": 0.035583464154892726, "grad_norm": 0.5001205801963806, "learning_rate": 0.0004925053533190578, "loss": 1.7042, "step": 17 }, { "epoch": 0.03767660910518053, "grad_norm": 0.37574276328086853, "learning_rate": 0.0004914346895074946, "loss": 1.6976, "step": 18 }, { "epoch": 0.03976975405546834, "grad_norm": 0.4445561170578003, "learning_rate": 0.0004903640256959315, "loss": 1.711, "step": 19 }, { "epoch": 0.04186289900575615, "grad_norm": 0.32608023285865784, "learning_rate": 0.0004892933618843683, "loss": 1.6491, "step": 20 }, { "epoch": 0.04395604395604396, "grad_norm": 0.34311097860336304, "learning_rate": 0.0004882226980728052, "loss": 1.6588, "step": 21 }, { "epoch": 0.046049188906331764, "grad_norm": 0.31974583864212036, "learning_rate": 0.000487152034261242, "loss": 1.6907, "step": 22 }, { "epoch": 0.04814233385661957, "grad_norm": 0.3181461989879608, "learning_rate": 0.0004860813704496788, "loss": 1.6776, "step": 23 }, { "epoch": 0.05023547880690738, "grad_norm": 0.31020134687423706, "learning_rate": 0.00048501070663811566, "loss": 1.6665, "step": 24 }, { "epoch": 0.052328623757195186, "grad_norm": 0.3187050521373749, "learning_rate": 0.00048394004282655245, "loss": 1.647, "step": 25 }, { "epoch": 0.05442176870748299, "grad_norm": 0.3272475600242615, "learning_rate": 0.0004828693790149893, "loss": 1.6769, "step": 26 }, { "epoch": 0.0565149136577708, "grad_norm": 0.29509666562080383, "learning_rate": 0.00048179871520342613, "loss": 1.6292, "step": 27 }, { "epoch": 0.05860805860805861, "grad_norm": 0.29900938272476196, "learning_rate": 0.00048072805139186297, "loss": 1.6318, "step": 28 }, { "epoch": 0.06070120355834641, "grad_norm": 0.3012602627277374, "learning_rate": 0.0004796573875802998, "loss": 1.5997, "step": 29 }, { "epoch": 0.06279434850863422, "grad_norm": 0.3533616065979004, "learning_rate": 0.00047858672376873665, "loss": 1.632, "step": 30 }, { "epoch": 0.06488749345892203, "grad_norm": 0.2721816599369049, "learning_rate": 0.00047751605995717344, "loss": 1.6432, "step": 31 }, { "epoch": 0.06698063840920984, "grad_norm": 0.29362842440605164, "learning_rate": 0.0004764453961456103, "loss": 1.6608, "step": 32 }, { "epoch": 0.06907378335949764, "grad_norm": 0.27665096521377563, "learning_rate": 0.0004753747323340471, "loss": 1.6286, "step": 33 }, { "epoch": 0.07116692830978545, "grad_norm": 0.28791311383247375, "learning_rate": 0.0004743040685224839, "loss": 1.6093, "step": 34 }, { "epoch": 0.07326007326007326, "grad_norm": 0.31565895676612854, "learning_rate": 0.0004732334047109208, "loss": 1.6672, "step": 35 }, { "epoch": 0.07535321821036106, "grad_norm": 0.26670706272125244, "learning_rate": 0.00047216274089935764, "loss": 1.6251, "step": 36 }, { "epoch": 0.07744636316064887, "grad_norm": 0.2804130017757416, "learning_rate": 0.0004710920770877944, "loss": 1.5653, "step": 37 }, { "epoch": 0.07953950811093669, "grad_norm": 0.27214744687080383, "learning_rate": 0.00047002141327623126, "loss": 1.5726, "step": 38 }, { "epoch": 0.08163265306122448, "grad_norm": 0.28986403346061707, "learning_rate": 0.0004689507494646681, "loss": 1.601, "step": 39 }, { "epoch": 0.0837257980115123, "grad_norm": 0.3080230951309204, "learning_rate": 0.0004678800856531049, "loss": 1.5939, "step": 40 }, { "epoch": 0.08581894296180011, "grad_norm": 0.2734631597995758, "learning_rate": 0.0004668094218415418, "loss": 1.5951, "step": 41 }, { "epoch": 0.08791208791208792, "grad_norm": 0.28978678584098816, "learning_rate": 0.0004657387580299786, "loss": 1.6146, "step": 42 }, { "epoch": 0.09000523286237572, "grad_norm": 0.27776286005973816, "learning_rate": 0.00046466809421841546, "loss": 1.6194, "step": 43 }, { "epoch": 0.09209837781266353, "grad_norm": 0.2763765752315521, "learning_rate": 0.00046359743040685225, "loss": 1.555, "step": 44 }, { "epoch": 0.09419152276295134, "grad_norm": 0.28580474853515625, "learning_rate": 0.0004625267665952891, "loss": 1.5959, "step": 45 }, { "epoch": 0.09628466771323914, "grad_norm": 0.27958357334136963, "learning_rate": 0.00046145610278372593, "loss": 1.5754, "step": 46 }, { "epoch": 0.09837781266352695, "grad_norm": 0.2925872504711151, "learning_rate": 0.0004603854389721627, "loss": 1.656, "step": 47 }, { "epoch": 0.10047095761381476, "grad_norm": 0.27339980006217957, "learning_rate": 0.0004593147751605996, "loss": 1.6213, "step": 48 }, { "epoch": 0.10256410256410256, "grad_norm": 0.2932608127593994, "learning_rate": 0.00045824411134903645, "loss": 1.5749, "step": 49 }, { "epoch": 0.10465724751439037, "grad_norm": 0.26638683676719666, "learning_rate": 0.00045717344753747323, "loss": 1.5951, "step": 50 }, { "epoch": 0.10675039246467818, "grad_norm": 0.28242963552474976, "learning_rate": 0.0004561027837259101, "loss": 1.5676, "step": 51 }, { "epoch": 0.10884353741496598, "grad_norm": 0.2681107819080353, "learning_rate": 0.0004550321199143469, "loss": 1.5513, "step": 52 }, { "epoch": 0.1109366823652538, "grad_norm": 0.27786529064178467, "learning_rate": 0.0004539614561027837, "loss": 1.4974, "step": 53 }, { "epoch": 0.1130298273155416, "grad_norm": 0.2654118835926056, "learning_rate": 0.00045289079229122054, "loss": 1.6262, "step": 54 }, { "epoch": 0.1151229722658294, "grad_norm": 0.30704954266548157, "learning_rate": 0.00045182012847965744, "loss": 1.564, "step": 55 }, { "epoch": 0.11721611721611722, "grad_norm": 0.27236270904541016, "learning_rate": 0.0004507494646680942, "loss": 1.6199, "step": 56 }, { "epoch": 0.11930926216640503, "grad_norm": 0.2929720878601074, "learning_rate": 0.00044967880085653106, "loss": 1.6008, "step": 57 }, { "epoch": 0.12140240711669283, "grad_norm": 0.28251537680625916, "learning_rate": 0.0004486081370449679, "loss": 1.5103, "step": 58 }, { "epoch": 0.12349555206698064, "grad_norm": 0.27533096075057983, "learning_rate": 0.0004475374732334047, "loss": 1.5274, "step": 59 }, { "epoch": 0.12558869701726844, "grad_norm": 0.30340835452079773, "learning_rate": 0.00044646680942184153, "loss": 1.5842, "step": 60 }, { "epoch": 0.12768184196755625, "grad_norm": 0.2956872880458832, "learning_rate": 0.00044539614561027837, "loss": 1.5504, "step": 61 }, { "epoch": 0.12977498691784406, "grad_norm": 0.2717457413673401, "learning_rate": 0.00044432548179871526, "loss": 1.5728, "step": 62 }, { "epoch": 0.13186813186813187, "grad_norm": 0.27269890904426575, "learning_rate": 0.00044325481798715205, "loss": 1.5891, "step": 63 }, { "epoch": 0.13396127681841968, "grad_norm": 0.294362872838974, "learning_rate": 0.0004421841541755889, "loss": 1.5494, "step": 64 }, { "epoch": 0.1360544217687075, "grad_norm": 0.3467015326023102, "learning_rate": 0.00044111349036402573, "loss": 1.6157, "step": 65 }, { "epoch": 0.13814756671899528, "grad_norm": 0.26985207200050354, "learning_rate": 0.0004400428265524625, "loss": 1.5489, "step": 66 }, { "epoch": 0.1402407116692831, "grad_norm": 0.30386754870414734, "learning_rate": 0.00043897216274089935, "loss": 1.547, "step": 67 }, { "epoch": 0.1423338566195709, "grad_norm": 0.2737506330013275, "learning_rate": 0.0004379014989293362, "loss": 1.541, "step": 68 }, { "epoch": 0.14442700156985872, "grad_norm": 0.2916475832462311, "learning_rate": 0.00043683083511777303, "loss": 1.6008, "step": 69 }, { "epoch": 0.14652014652014653, "grad_norm": 0.27791959047317505, "learning_rate": 0.0004357601713062099, "loss": 1.6368, "step": 70 }, { "epoch": 0.14861329147043434, "grad_norm": 0.2925644516944885, "learning_rate": 0.0004346895074946467, "loss": 1.5606, "step": 71 }, { "epoch": 0.15070643642072212, "grad_norm": 0.2825354039669037, "learning_rate": 0.0004336188436830835, "loss": 1.5759, "step": 72 }, { "epoch": 0.15279958137100993, "grad_norm": 0.27884945273399353, "learning_rate": 0.00043254817987152034, "loss": 1.624, "step": 73 }, { "epoch": 0.15489272632129775, "grad_norm": 0.29608336091041565, "learning_rate": 0.0004314775160599572, "loss": 1.5619, "step": 74 }, { "epoch": 0.15698587127158556, "grad_norm": 0.2830757200717926, "learning_rate": 0.00043040685224839397, "loss": 1.6233, "step": 75 }, { "epoch": 0.15907901622187337, "grad_norm": 0.3144885301589966, "learning_rate": 0.00042933618843683086, "loss": 1.567, "step": 76 }, { "epoch": 0.16117216117216118, "grad_norm": 0.29038679599761963, "learning_rate": 0.0004282655246252677, "loss": 1.5317, "step": 77 }, { "epoch": 0.16326530612244897, "grad_norm": 0.34527644515037537, "learning_rate": 0.0004271948608137045, "loss": 1.5485, "step": 78 }, { "epoch": 0.16535845107273678, "grad_norm": 0.2915840446949005, "learning_rate": 0.0004261241970021413, "loss": 1.5792, "step": 79 }, { "epoch": 0.1674515960230246, "grad_norm": 0.30239176750183105, "learning_rate": 0.00042505353319057817, "loss": 1.5533, "step": 80 }, { "epoch": 0.1695447409733124, "grad_norm": 0.28941529989242554, "learning_rate": 0.00042398286937901495, "loss": 1.5635, "step": 81 }, { "epoch": 0.17163788592360021, "grad_norm": 0.27628207206726074, "learning_rate": 0.0004229122055674518, "loss": 1.5542, "step": 82 }, { "epoch": 0.17373103087388803, "grad_norm": 0.28659799695014954, "learning_rate": 0.0004218415417558887, "loss": 1.5969, "step": 83 }, { "epoch": 0.17582417582417584, "grad_norm": 0.2995677888393402, "learning_rate": 0.00042077087794432553, "loss": 1.5339, "step": 84 }, { "epoch": 0.17791732077446362, "grad_norm": 0.28352785110473633, "learning_rate": 0.0004197002141327623, "loss": 1.5739, "step": 85 }, { "epoch": 0.18001046572475143, "grad_norm": 0.296410471200943, "learning_rate": 0.00041862955032119915, "loss": 1.5432, "step": 86 }, { "epoch": 0.18210361067503925, "grad_norm": 0.3075838088989258, "learning_rate": 0.000417558886509636, "loss": 1.5749, "step": 87 }, { "epoch": 0.18419675562532706, "grad_norm": 0.29746511578559875, "learning_rate": 0.0004164882226980728, "loss": 1.5509, "step": 88 }, { "epoch": 0.18628990057561487, "grad_norm": 0.3004538118839264, "learning_rate": 0.0004154175588865097, "loss": 1.5732, "step": 89 }, { "epoch": 0.18838304552590268, "grad_norm": 0.287615031003952, "learning_rate": 0.0004143468950749465, "loss": 1.5482, "step": 90 }, { "epoch": 0.19047619047619047, "grad_norm": 0.2988753020763397, "learning_rate": 0.0004132762312633833, "loss": 1.5656, "step": 91 }, { "epoch": 0.19256933542647828, "grad_norm": 0.29836592078208923, "learning_rate": 0.00041220556745182014, "loss": 1.5759, "step": 92 }, { "epoch": 0.1946624803767661, "grad_norm": 0.3167785406112671, "learning_rate": 0.000411134903640257, "loss": 1.5362, "step": 93 }, { "epoch": 0.1967556253270539, "grad_norm": 0.27747228741645813, "learning_rate": 0.00041006423982869377, "loss": 1.5212, "step": 94 }, { "epoch": 0.1988487702773417, "grad_norm": 0.30162835121154785, "learning_rate": 0.0004089935760171306, "loss": 1.5362, "step": 95 }, { "epoch": 0.20094191522762953, "grad_norm": 0.28325414657592773, "learning_rate": 0.0004079229122055675, "loss": 1.4925, "step": 96 }, { "epoch": 0.2030350601779173, "grad_norm": 0.28862977027893066, "learning_rate": 0.0004068522483940043, "loss": 1.5731, "step": 97 }, { "epoch": 0.20512820512820512, "grad_norm": 0.3084706962108612, "learning_rate": 0.0004057815845824411, "loss": 1.5632, "step": 98 }, { "epoch": 0.20722135007849293, "grad_norm": 0.29341885447502136, "learning_rate": 0.00040471092077087797, "loss": 1.5436, "step": 99 }, { "epoch": 0.20931449502878074, "grad_norm": 0.3143270015716553, "learning_rate": 0.00040364025695931475, "loss": 1.5839, "step": 100 }, { "epoch": 0.21140763997906856, "grad_norm": 0.30276885628700256, "learning_rate": 0.0004025695931477516, "loss": 1.6037, "step": 101 }, { "epoch": 0.21350078492935637, "grad_norm": 0.31467488408088684, "learning_rate": 0.00040149892933618843, "loss": 1.5227, "step": 102 }, { "epoch": 0.21559392987964415, "grad_norm": 0.28636667132377625, "learning_rate": 0.0004004282655246253, "loss": 1.5022, "step": 103 }, { "epoch": 0.21768707482993196, "grad_norm": 0.28783223032951355, "learning_rate": 0.0003993576017130621, "loss": 1.5432, "step": 104 }, { "epoch": 0.21978021978021978, "grad_norm": 0.3006027042865753, "learning_rate": 0.00039828693790149895, "loss": 1.5305, "step": 105 }, { "epoch": 0.2218733647305076, "grad_norm": 0.3113887310028076, "learning_rate": 0.0003972162740899358, "loss": 1.4946, "step": 106 }, { "epoch": 0.2239665096807954, "grad_norm": 0.3233683109283447, "learning_rate": 0.0003961456102783726, "loss": 1.5176, "step": 107 }, { "epoch": 0.2260596546310832, "grad_norm": 0.29007241129875183, "learning_rate": 0.0003950749464668094, "loss": 1.5661, "step": 108 }, { "epoch": 0.228152799581371, "grad_norm": 0.3270627558231354, "learning_rate": 0.00039400428265524626, "loss": 1.5414, "step": 109 }, { "epoch": 0.2302459445316588, "grad_norm": 0.2789075970649719, "learning_rate": 0.0003929336188436831, "loss": 1.5324, "step": 110 }, { "epoch": 0.23233908948194662, "grad_norm": 0.3245764672756195, "learning_rate": 0.00039186295503211994, "loss": 1.5541, "step": 111 }, { "epoch": 0.23443223443223443, "grad_norm": 0.28058871626853943, "learning_rate": 0.0003907922912205568, "loss": 1.5398, "step": 112 }, { "epoch": 0.23652537938252224, "grad_norm": 0.29182901978492737, "learning_rate": 0.00038972162740899356, "loss": 1.4795, "step": 113 }, { "epoch": 0.23861852433281006, "grad_norm": 0.3088870346546173, "learning_rate": 0.0003886509635974304, "loss": 1.5326, "step": 114 }, { "epoch": 0.24071166928309787, "grad_norm": 0.3134807050228119, "learning_rate": 0.00038758029978586725, "loss": 1.5346, "step": 115 }, { "epoch": 0.24280481423338565, "grad_norm": 0.30298101902008057, "learning_rate": 0.00038650963597430403, "loss": 1.5134, "step": 116 }, { "epoch": 0.24489795918367346, "grad_norm": 0.2860242426395416, "learning_rate": 0.0003854389721627409, "loss": 1.5306, "step": 117 }, { "epoch": 0.24699110413396128, "grad_norm": 0.2905466556549072, "learning_rate": 0.00038436830835117777, "loss": 1.5234, "step": 118 }, { "epoch": 0.2490842490842491, "grad_norm": 0.28561463952064514, "learning_rate": 0.00038329764453961455, "loss": 1.4752, "step": 119 }, { "epoch": 0.25117739403453687, "grad_norm": 0.2877683639526367, "learning_rate": 0.0003822269807280514, "loss": 1.598, "step": 120 }, { "epoch": 0.2532705389848247, "grad_norm": 0.2815863788127899, "learning_rate": 0.00038115631691648823, "loss": 1.4643, "step": 121 }, { "epoch": 0.2553636839351125, "grad_norm": 0.28134405612945557, "learning_rate": 0.000380085653104925, "loss": 1.5037, "step": 122 }, { "epoch": 0.25745682888540034, "grad_norm": 0.2940825819969177, "learning_rate": 0.00037901498929336186, "loss": 1.4963, "step": 123 }, { "epoch": 0.2595499738356881, "grad_norm": 0.28303319215774536, "learning_rate": 0.00037794432548179875, "loss": 1.4708, "step": 124 }, { "epoch": 0.2616431187859759, "grad_norm": 0.3112112581729889, "learning_rate": 0.0003768736616702356, "loss": 1.4981, "step": 125 }, { "epoch": 0.26373626373626374, "grad_norm": 0.2902218997478485, "learning_rate": 0.0003758029978586724, "loss": 1.4654, "step": 126 }, { "epoch": 0.2658294086865515, "grad_norm": 0.31582140922546387, "learning_rate": 0.0003747323340471092, "loss": 1.551, "step": 127 }, { "epoch": 0.26792255363683937, "grad_norm": 0.28578075766563416, "learning_rate": 0.00037366167023554606, "loss": 1.5018, "step": 128 }, { "epoch": 0.27001569858712715, "grad_norm": 0.33017498254776, "learning_rate": 0.00037259100642398284, "loss": 1.5177, "step": 129 }, { "epoch": 0.272108843537415, "grad_norm": 0.2954592704772949, "learning_rate": 0.0003715203426124197, "loss": 1.5126, "step": 130 }, { "epoch": 0.2742019884877028, "grad_norm": 0.34393608570098877, "learning_rate": 0.0003704496788008566, "loss": 1.5061, "step": 131 }, { "epoch": 0.27629513343799056, "grad_norm": 0.3111407160758972, "learning_rate": 0.00036937901498929336, "loss": 1.5599, "step": 132 }, { "epoch": 0.2783882783882784, "grad_norm": 0.2914719581604004, "learning_rate": 0.0003683083511777302, "loss": 1.5165, "step": 133 }, { "epoch": 0.2804814233385662, "grad_norm": 0.29973331093788147, "learning_rate": 0.00036723768736616704, "loss": 1.4669, "step": 134 }, { "epoch": 0.282574568288854, "grad_norm": 0.3091066777706146, "learning_rate": 0.00036616702355460383, "loss": 1.5048, "step": 135 }, { "epoch": 0.2846677132391418, "grad_norm": 0.31531593203544617, "learning_rate": 0.00036509635974304067, "loss": 1.5086, "step": 136 }, { "epoch": 0.2867608581894296, "grad_norm": 0.29967445135116577, "learning_rate": 0.0003640256959314775, "loss": 1.546, "step": 137 }, { "epoch": 0.28885400313971743, "grad_norm": 0.33331945538520813, "learning_rate": 0.00036295503211991435, "loss": 1.4824, "step": 138 }, { "epoch": 0.2909471480900052, "grad_norm": 0.3064332902431488, "learning_rate": 0.0003618843683083512, "loss": 1.573, "step": 139 }, { "epoch": 0.29304029304029305, "grad_norm": 0.3276407718658447, "learning_rate": 0.00036081370449678803, "loss": 1.5517, "step": 140 }, { "epoch": 0.29513343799058084, "grad_norm": 0.2774730622768402, "learning_rate": 0.0003597430406852248, "loss": 1.4853, "step": 141 }, { "epoch": 0.2972265829408687, "grad_norm": 0.33176928758621216, "learning_rate": 0.00035867237687366166, "loss": 1.5346, "step": 142 }, { "epoch": 0.29931972789115646, "grad_norm": 0.29119884967803955, "learning_rate": 0.0003576017130620985, "loss": 1.4962, "step": 143 }, { "epoch": 0.30141287284144425, "grad_norm": 0.2862621545791626, "learning_rate": 0.0003565310492505354, "loss": 1.511, "step": 144 }, { "epoch": 0.3035060177917321, "grad_norm": 0.31120261549949646, "learning_rate": 0.0003554603854389722, "loss": 1.5114, "step": 145 }, { "epoch": 0.30559916274201987, "grad_norm": 0.3067992329597473, "learning_rate": 0.000354389721627409, "loss": 1.4757, "step": 146 }, { "epoch": 0.3076923076923077, "grad_norm": 0.30063048005104065, "learning_rate": 0.00035331905781584586, "loss": 1.5617, "step": 147 }, { "epoch": 0.3097854526425955, "grad_norm": 0.3026478588581085, "learning_rate": 0.00035224839400428264, "loss": 1.446, "step": 148 }, { "epoch": 0.31187859759288333, "grad_norm": 0.2846631705760956, "learning_rate": 0.0003511777301927195, "loss": 1.4821, "step": 149 }, { "epoch": 0.3139717425431711, "grad_norm": 0.3027445673942566, "learning_rate": 0.0003501070663811563, "loss": 1.5073, "step": 150 }, { "epoch": 0.3160648874934589, "grad_norm": 0.29627394676208496, "learning_rate": 0.00034903640256959316, "loss": 1.496, "step": 151 }, { "epoch": 0.31815803244374674, "grad_norm": 0.3188508450984955, "learning_rate": 0.00034796573875803, "loss": 1.5395, "step": 152 }, { "epoch": 0.3202511773940345, "grad_norm": 0.29025983810424805, "learning_rate": 0.00034689507494646684, "loss": 1.5177, "step": 153 }, { "epoch": 0.32234432234432236, "grad_norm": 0.31973665952682495, "learning_rate": 0.00034582441113490363, "loss": 1.4774, "step": 154 }, { "epoch": 0.32443746729461015, "grad_norm": 0.29737603664398193, "learning_rate": 0.00034475374732334047, "loss": 1.5141, "step": 155 }, { "epoch": 0.32653061224489793, "grad_norm": 0.31660401821136475, "learning_rate": 0.0003436830835117773, "loss": 1.4917, "step": 156 }, { "epoch": 0.3286237571951858, "grad_norm": 0.29376548528671265, "learning_rate": 0.0003426124197002141, "loss": 1.4832, "step": 157 }, { "epoch": 0.33071690214547356, "grad_norm": 0.3015284836292267, "learning_rate": 0.000341541755888651, "loss": 1.5408, "step": 158 }, { "epoch": 0.3328100470957614, "grad_norm": 0.3028362989425659, "learning_rate": 0.00034047109207708783, "loss": 1.5326, "step": 159 }, { "epoch": 0.3349031920460492, "grad_norm": 0.292458176612854, "learning_rate": 0.0003394004282655246, "loss": 1.5642, "step": 160 }, { "epoch": 0.336996336996337, "grad_norm": 0.29941415786743164, "learning_rate": 0.00033832976445396146, "loss": 1.5114, "step": 161 }, { "epoch": 0.3390894819466248, "grad_norm": 0.2882905602455139, "learning_rate": 0.0003372591006423983, "loss": 1.5531, "step": 162 }, { "epoch": 0.3411826268969126, "grad_norm": 0.294680655002594, "learning_rate": 0.00033618843683083514, "loss": 1.4926, "step": 163 }, { "epoch": 0.34327577184720043, "grad_norm": 0.3013262152671814, "learning_rate": 0.0003351177730192719, "loss": 1.5403, "step": 164 }, { "epoch": 0.3453689167974882, "grad_norm": 0.29358139634132385, "learning_rate": 0.0003340471092077088, "loss": 1.503, "step": 165 }, { "epoch": 0.34746206174777605, "grad_norm": 0.2902645766735077, "learning_rate": 0.00033297644539614566, "loss": 1.5107, "step": 166 }, { "epoch": 0.34955520669806384, "grad_norm": 0.2952733635902405, "learning_rate": 0.00033190578158458244, "loss": 1.525, "step": 167 }, { "epoch": 0.3516483516483517, "grad_norm": 0.29654550552368164, "learning_rate": 0.0003308351177730193, "loss": 1.4989, "step": 168 }, { "epoch": 0.35374149659863946, "grad_norm": 0.28225046396255493, "learning_rate": 0.0003297644539614561, "loss": 1.46, "step": 169 }, { "epoch": 0.35583464154892724, "grad_norm": 0.30447182059288025, "learning_rate": 0.0003286937901498929, "loss": 1.5389, "step": 170 }, { "epoch": 0.3579277864992151, "grad_norm": 0.29335105419158936, "learning_rate": 0.00032762312633832975, "loss": 1.4962, "step": 171 }, { "epoch": 0.36002093144950287, "grad_norm": 0.3066118061542511, "learning_rate": 0.00032655246252676664, "loss": 1.5122, "step": 172 }, { "epoch": 0.3621140763997907, "grad_norm": 0.3051617443561554, "learning_rate": 0.00032548179871520343, "loss": 1.4534, "step": 173 }, { "epoch": 0.3642072213500785, "grad_norm": 0.3032102584838867, "learning_rate": 0.00032441113490364027, "loss": 1.4937, "step": 174 }, { "epoch": 0.3663003663003663, "grad_norm": 0.28743627667427063, "learning_rate": 0.0003233404710920771, "loss": 1.5288, "step": 175 }, { "epoch": 0.3683935112506541, "grad_norm": 0.2957185208797455, "learning_rate": 0.0003222698072805139, "loss": 1.4617, "step": 176 }, { "epoch": 0.3704866562009419, "grad_norm": 0.30057474970817566, "learning_rate": 0.00032119914346895073, "loss": 1.5141, "step": 177 }, { "epoch": 0.37257980115122974, "grad_norm": 0.3146776556968689, "learning_rate": 0.0003201284796573876, "loss": 1.5033, "step": 178 }, { "epoch": 0.3746729461015175, "grad_norm": 0.3096458911895752, "learning_rate": 0.0003190578158458244, "loss": 1.5224, "step": 179 }, { "epoch": 0.37676609105180536, "grad_norm": 0.30057886242866516, "learning_rate": 0.00031798715203426126, "loss": 1.447, "step": 180 }, { "epoch": 0.37885923600209315, "grad_norm": 0.3033558428287506, "learning_rate": 0.0003169164882226981, "loss": 1.5301, "step": 181 }, { "epoch": 0.38095238095238093, "grad_norm": 0.31797683238983154, "learning_rate": 0.0003158458244111349, "loss": 1.467, "step": 182 }, { "epoch": 0.38304552590266877, "grad_norm": 0.28933098912239075, "learning_rate": 0.0003147751605995717, "loss": 1.4411, "step": 183 }, { "epoch": 0.38513867085295656, "grad_norm": 0.3075162470340729, "learning_rate": 0.00031370449678800856, "loss": 1.4969, "step": 184 }, { "epoch": 0.3872318158032444, "grad_norm": 0.2986271381378174, "learning_rate": 0.0003126338329764454, "loss": 1.4833, "step": 185 }, { "epoch": 0.3893249607535322, "grad_norm": 0.30813565850257874, "learning_rate": 0.00031156316916488224, "loss": 1.4931, "step": 186 }, { "epoch": 0.39141810570381996, "grad_norm": 0.30407366156578064, "learning_rate": 0.0003104925053533191, "loss": 1.5209, "step": 187 }, { "epoch": 0.3935112506541078, "grad_norm": 0.3050621747970581, "learning_rate": 0.0003094218415417559, "loss": 1.5341, "step": 188 }, { "epoch": 0.3956043956043956, "grad_norm": 0.29337289929389954, "learning_rate": 0.0003083511777301927, "loss": 1.4728, "step": 189 }, { "epoch": 0.3976975405546834, "grad_norm": 0.3019981384277344, "learning_rate": 0.00030728051391862955, "loss": 1.4761, "step": 190 }, { "epoch": 0.3997906855049712, "grad_norm": 0.30941662192344666, "learning_rate": 0.0003062098501070664, "loss": 1.5217, "step": 191 }, { "epoch": 0.40188383045525905, "grad_norm": 0.3021605312824249, "learning_rate": 0.00030513918629550323, "loss": 1.5251, "step": 192 }, { "epoch": 0.40397697540554683, "grad_norm": 0.30456283688545227, "learning_rate": 0.00030406852248394007, "loss": 1.5013, "step": 193 }, { "epoch": 0.4060701203558346, "grad_norm": 0.3142157793045044, "learning_rate": 0.0003029978586723769, "loss": 1.446, "step": 194 }, { "epoch": 0.40816326530612246, "grad_norm": 0.29888784885406494, "learning_rate": 0.0003019271948608137, "loss": 1.4682, "step": 195 }, { "epoch": 0.41025641025641024, "grad_norm": 0.29900768399238586, "learning_rate": 0.00030085653104925053, "loss": 1.4986, "step": 196 }, { "epoch": 0.4123495552066981, "grad_norm": 0.2862175405025482, "learning_rate": 0.0002997858672376874, "loss": 1.4253, "step": 197 }, { "epoch": 0.41444270015698587, "grad_norm": 0.323761910200119, "learning_rate": 0.00029871520342612416, "loss": 1.4583, "step": 198 }, { "epoch": 0.4165358451072737, "grad_norm": 0.2848580479621887, "learning_rate": 0.00029764453961456105, "loss": 1.4632, "step": 199 }, { "epoch": 0.4186289900575615, "grad_norm": 0.2909344434738159, "learning_rate": 0.0002965738758029979, "loss": 1.4738, "step": 200 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.592309546614784e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }