{ "best_metric": 3.3320932388305664, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.37261294829995345, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018630647414997672, "grad_norm": 1.2902519702911377, "learning_rate": 5e-06, "loss": 3.6391, "step": 1 }, { "epoch": 0.0018630647414997672, "eval_loss": 5.516392230987549, "eval_runtime": 32.1445, "eval_samples_per_second": 28.154, "eval_steps_per_second": 14.093, "step": 1 }, { "epoch": 0.0037261294829995344, "grad_norm": 1.502539873123169, "learning_rate": 1e-05, "loss": 4.285, "step": 2 }, { "epoch": 0.0055891942244993015, "grad_norm": 1.6728174686431885, "learning_rate": 1.5e-05, "loss": 4.3833, "step": 3 }, { "epoch": 0.007452258965999069, "grad_norm": 1.809103012084961, "learning_rate": 2e-05, "loss": 4.5885, "step": 4 }, { "epoch": 0.009315323707498836, "grad_norm": 1.8019917011260986, "learning_rate": 2.5e-05, "loss": 4.6271, "step": 5 }, { "epoch": 0.011178388448998603, "grad_norm": 1.8416504859924316, "learning_rate": 3e-05, "loss": 4.6321, "step": 6 }, { "epoch": 0.01304145319049837, "grad_norm": 1.9281914234161377, "learning_rate": 3.5e-05, "loss": 4.3472, "step": 7 }, { "epoch": 0.014904517931998137, "grad_norm": 2.1244757175445557, "learning_rate": 4e-05, "loss": 4.8937, "step": 8 }, { "epoch": 0.016767582673497903, "grad_norm": 1.9028316736221313, "learning_rate": 4.5e-05, "loss": 5.0692, "step": 9 }, { "epoch": 0.018630647414997672, "grad_norm": 2.2906155586242676, "learning_rate": 5e-05, "loss": 4.3372, "step": 10 }, { "epoch": 0.020493712156497437, "grad_norm": 2.0029475688934326, "learning_rate": 5.500000000000001e-05, "loss": 4.4519, "step": 11 }, { "epoch": 0.022356776897997206, "grad_norm": 1.9513241052627563, "learning_rate": 6e-05, "loss": 4.3201, "step": 12 }, { "epoch": 0.02421984163949697, "grad_norm": 1.7343864440917969, "learning_rate": 6.500000000000001e-05, "loss": 4.0463, "step": 13 }, { "epoch": 0.02608290638099674, "grad_norm": 1.9337594509124756, "learning_rate": 7e-05, "loss": 4.0561, "step": 14 }, { "epoch": 0.027945971122496506, "grad_norm": 1.6187396049499512, "learning_rate": 7.500000000000001e-05, "loss": 3.5715, "step": 15 }, { "epoch": 0.029809035863996275, "grad_norm": 1.5559985637664795, "learning_rate": 8e-05, "loss": 4.0944, "step": 16 }, { "epoch": 0.031672100605496044, "grad_norm": 1.6261332035064697, "learning_rate": 8.5e-05, "loss": 3.9513, "step": 17 }, { "epoch": 0.033535165346995806, "grad_norm": 1.8501555919647217, "learning_rate": 9e-05, "loss": 3.9689, "step": 18 }, { "epoch": 0.035398230088495575, "grad_norm": 2.036672592163086, "learning_rate": 9.5e-05, "loss": 3.9789, "step": 19 }, { "epoch": 0.037261294829995344, "grad_norm": 2.047884702682495, "learning_rate": 0.0001, "loss": 3.663, "step": 20 }, { "epoch": 0.03912435957149511, "grad_norm": 1.90353262424469, "learning_rate": 9.999238475781957e-05, "loss": 3.5892, "step": 21 }, { "epoch": 0.040987424312994875, "grad_norm": 1.863972783088684, "learning_rate": 9.99695413509548e-05, "loss": 3.9369, "step": 22 }, { "epoch": 0.04285048905449464, "grad_norm": 1.4511252641677856, "learning_rate": 9.99314767377287e-05, "loss": 3.1318, "step": 23 }, { "epoch": 0.04471355379599441, "grad_norm": 1.4355757236480713, "learning_rate": 9.987820251299122e-05, "loss": 3.983, "step": 24 }, { "epoch": 0.04657661853749418, "grad_norm": 1.4540237188339233, "learning_rate": 9.980973490458728e-05, "loss": 4.1114, "step": 25 }, { "epoch": 0.04843968327899394, "grad_norm": 1.5578112602233887, "learning_rate": 9.972609476841367e-05, "loss": 3.8437, "step": 26 }, { "epoch": 0.05030274802049371, "grad_norm": 1.8616338968276978, "learning_rate": 9.962730758206611e-05, "loss": 3.516, "step": 27 }, { "epoch": 0.05216581276199348, "grad_norm": 1.6210098266601562, "learning_rate": 9.951340343707852e-05, "loss": 3.3105, "step": 28 }, { "epoch": 0.05402887750349324, "grad_norm": 1.6511797904968262, "learning_rate": 9.938441702975689e-05, "loss": 3.9478, "step": 29 }, { "epoch": 0.05589194224499301, "grad_norm": 1.6636693477630615, "learning_rate": 9.924038765061042e-05, "loss": 3.6779, "step": 30 }, { "epoch": 0.05775500698649278, "grad_norm": 1.4905117750167847, "learning_rate": 9.908135917238321e-05, "loss": 3.2574, "step": 31 }, { "epoch": 0.05961807172799255, "grad_norm": 1.6063334941864014, "learning_rate": 9.890738003669029e-05, "loss": 3.9851, "step": 32 }, { "epoch": 0.06148113646949231, "grad_norm": 1.7283437252044678, "learning_rate": 9.871850323926177e-05, "loss": 3.705, "step": 33 }, { "epoch": 0.06334420121099209, "grad_norm": 1.5343276262283325, "learning_rate": 9.851478631379982e-05, "loss": 3.5288, "step": 34 }, { "epoch": 0.06520726595249185, "grad_norm": 1.7014079093933105, "learning_rate": 9.829629131445342e-05, "loss": 3.8288, "step": 35 }, { "epoch": 0.06707033069399161, "grad_norm": 1.7944095134735107, "learning_rate": 9.806308479691595e-05, "loss": 3.8894, "step": 36 }, { "epoch": 0.06893339543549139, "grad_norm": 1.5314217805862427, "learning_rate": 9.781523779815179e-05, "loss": 3.2699, "step": 37 }, { "epoch": 0.07079646017699115, "grad_norm": 1.6168136596679688, "learning_rate": 9.755282581475769e-05, "loss": 3.8024, "step": 38 }, { "epoch": 0.07265952491849091, "grad_norm": 1.3477504253387451, "learning_rate": 9.727592877996585e-05, "loss": 3.0357, "step": 39 }, { "epoch": 0.07452258965999069, "grad_norm": 2.0698187351226807, "learning_rate": 9.698463103929542e-05, "loss": 3.7317, "step": 40 }, { "epoch": 0.07638565440149045, "grad_norm": 1.6671048402786255, "learning_rate": 9.667902132486009e-05, "loss": 3.4916, "step": 41 }, { "epoch": 0.07824871914299023, "grad_norm": 1.8396328687667847, "learning_rate": 9.635919272833938e-05, "loss": 3.5342, "step": 42 }, { "epoch": 0.08011178388448999, "grad_norm": 1.5536775588989258, "learning_rate": 9.602524267262203e-05, "loss": 3.8207, "step": 43 }, { "epoch": 0.08197484862598975, "grad_norm": 1.5929099321365356, "learning_rate": 9.567727288213005e-05, "loss": 3.8526, "step": 44 }, { "epoch": 0.08383791336748952, "grad_norm": 2.140622854232788, "learning_rate": 9.53153893518325e-05, "loss": 3.9536, "step": 45 }, { "epoch": 0.08570097810898929, "grad_norm": 2.204415798187256, "learning_rate": 9.493970231495835e-05, "loss": 3.8141, "step": 46 }, { "epoch": 0.08756404285048905, "grad_norm": 2.16359806060791, "learning_rate": 9.45503262094184e-05, "loss": 4.0383, "step": 47 }, { "epoch": 0.08942710759198882, "grad_norm": 2.2646803855895996, "learning_rate": 9.414737964294636e-05, "loss": 3.9917, "step": 48 }, { "epoch": 0.09129017233348859, "grad_norm": 2.4855337142944336, "learning_rate": 9.373098535696979e-05, "loss": 3.8354, "step": 49 }, { "epoch": 0.09315323707498836, "grad_norm": 2.349766254425049, "learning_rate": 9.330127018922194e-05, "loss": 3.9016, "step": 50 }, { "epoch": 0.09315323707498836, "eval_loss": 3.6060919761657715, "eval_runtime": 32.1835, "eval_samples_per_second": 28.12, "eval_steps_per_second": 14.076, "step": 50 }, { "epoch": 0.09501630181648812, "grad_norm": 2.1820530891418457, "learning_rate": 9.285836503510562e-05, "loss": 2.9137, "step": 51 }, { "epoch": 0.09687936655798789, "grad_norm": 2.1806118488311768, "learning_rate": 9.24024048078213e-05, "loss": 3.1755, "step": 52 }, { "epoch": 0.09874243129948766, "grad_norm": 1.8996943235397339, "learning_rate": 9.193352839727121e-05, "loss": 3.3476, "step": 53 }, { "epoch": 0.10060549604098742, "grad_norm": 1.8744741678237915, "learning_rate": 9.145187862775209e-05, "loss": 3.196, "step": 54 }, { "epoch": 0.10246856078248719, "grad_norm": 1.70234215259552, "learning_rate": 9.09576022144496e-05, "loss": 3.3386, "step": 55 }, { "epoch": 0.10433162552398696, "grad_norm": 1.7539249658584595, "learning_rate": 9.045084971874738e-05, "loss": 3.4391, "step": 56 }, { "epoch": 0.10619469026548672, "grad_norm": 1.4793843030929565, "learning_rate": 8.993177550236464e-05, "loss": 3.9207, "step": 57 }, { "epoch": 0.10805775500698649, "grad_norm": 1.1978991031646729, "learning_rate": 8.940053768033609e-05, "loss": 3.2176, "step": 58 }, { "epoch": 0.10992081974848626, "grad_norm": 1.1151164770126343, "learning_rate": 8.885729807284856e-05, "loss": 3.0396, "step": 59 }, { "epoch": 0.11178388448998602, "grad_norm": 1.0803781747817993, "learning_rate": 8.83022221559489e-05, "loss": 2.9356, "step": 60 }, { "epoch": 0.1136469492314858, "grad_norm": 1.1919324398040771, "learning_rate": 8.773547901113862e-05, "loss": 3.0124, "step": 61 }, { "epoch": 0.11551001397298556, "grad_norm": 1.0512971878051758, "learning_rate": 8.715724127386972e-05, "loss": 3.2844, "step": 62 }, { "epoch": 0.11737307871448532, "grad_norm": 1.3854892253875732, "learning_rate": 8.656768508095853e-05, "loss": 3.3558, "step": 63 }, { "epoch": 0.1192361434559851, "grad_norm": 1.2489628791809082, "learning_rate": 8.596699001693255e-05, "loss": 3.2082, "step": 64 }, { "epoch": 0.12109920819748486, "grad_norm": 1.0986685752868652, "learning_rate": 8.535533905932738e-05, "loss": 3.0937, "step": 65 }, { "epoch": 0.12296227293898462, "grad_norm": 1.2437379360198975, "learning_rate": 8.473291852294987e-05, "loss": 3.2174, "step": 66 }, { "epoch": 0.1248253376804844, "grad_norm": 1.1477092504501343, "learning_rate": 8.409991800312493e-05, "loss": 3.2221, "step": 67 }, { "epoch": 0.12668840242198418, "grad_norm": 1.4317594766616821, "learning_rate": 8.345653031794292e-05, "loss": 3.2315, "step": 68 }, { "epoch": 0.12855146716348392, "grad_norm": 1.2513046264648438, "learning_rate": 8.280295144952536e-05, "loss": 3.4117, "step": 69 }, { "epoch": 0.1304145319049837, "grad_norm": 1.2725088596343994, "learning_rate": 8.213938048432697e-05, "loss": 3.4154, "step": 70 }, { "epoch": 0.13227759664648348, "grad_norm": 1.301317572593689, "learning_rate": 8.146601955249188e-05, "loss": 3.2894, "step": 71 }, { "epoch": 0.13414066138798322, "grad_norm": 1.2711513042449951, "learning_rate": 8.07830737662829e-05, "loss": 3.352, "step": 72 }, { "epoch": 0.136003726129483, "grad_norm": 1.2225326299667358, "learning_rate": 8.009075115760243e-05, "loss": 3.2548, "step": 73 }, { "epoch": 0.13786679087098277, "grad_norm": 1.2163617610931396, "learning_rate": 7.938926261462366e-05, "loss": 3.5588, "step": 74 }, { "epoch": 0.13972985561248252, "grad_norm": 2.666508436203003, "learning_rate": 7.86788218175523e-05, "loss": 3.3621, "step": 75 }, { "epoch": 0.1415929203539823, "grad_norm": 1.1624727249145508, "learning_rate": 7.795964517353735e-05, "loss": 3.3753, "step": 76 }, { "epoch": 0.14345598509548207, "grad_norm": 1.2207733392715454, "learning_rate": 7.723195175075136e-05, "loss": 3.0322, "step": 77 }, { "epoch": 0.14531904983698182, "grad_norm": 1.153766393661499, "learning_rate": 7.649596321166024e-05, "loss": 3.3144, "step": 78 }, { "epoch": 0.1471821145784816, "grad_norm": 1.0764729976654053, "learning_rate": 7.575190374550272e-05, "loss": 2.8701, "step": 79 }, { "epoch": 0.14904517931998137, "grad_norm": 1.2744287252426147, "learning_rate": 7.500000000000001e-05, "loss": 3.5407, "step": 80 }, { "epoch": 0.15090824406148112, "grad_norm": 1.4500547647476196, "learning_rate": 7.424048101231686e-05, "loss": 3.3808, "step": 81 }, { "epoch": 0.1527713088029809, "grad_norm": 2.205737352371216, "learning_rate": 7.347357813929454e-05, "loss": 3.2512, "step": 82 }, { "epoch": 0.15463437354448067, "grad_norm": 1.1358613967895508, "learning_rate": 7.269952498697734e-05, "loss": 3.4559, "step": 83 }, { "epoch": 0.15649743828598045, "grad_norm": 1.6348944902420044, "learning_rate": 7.191855733945387e-05, "loss": 3.9512, "step": 84 }, { "epoch": 0.1583605030274802, "grad_norm": 1.229777455329895, "learning_rate": 7.113091308703498e-05, "loss": 2.9064, "step": 85 }, { "epoch": 0.16022356776897997, "grad_norm": 1.1657277345657349, "learning_rate": 7.033683215379002e-05, "loss": 3.0757, "step": 86 }, { "epoch": 0.16208663251047975, "grad_norm": 1.4670532941818237, "learning_rate": 6.953655642446368e-05, "loss": 3.4218, "step": 87 }, { "epoch": 0.1639496972519795, "grad_norm": 1.5080535411834717, "learning_rate": 6.873032967079561e-05, "loss": 3.3886, "step": 88 }, { "epoch": 0.16581276199347927, "grad_norm": 1.4212788343429565, "learning_rate": 6.7918397477265e-05, "loss": 3.6124, "step": 89 }, { "epoch": 0.16767582673497905, "grad_norm": 1.553883671760559, "learning_rate": 6.710100716628344e-05, "loss": 3.7264, "step": 90 }, { "epoch": 0.1695388914764788, "grad_norm": 1.4848854541778564, "learning_rate": 6.627840772285784e-05, "loss": 3.1516, "step": 91 }, { "epoch": 0.17140195621797857, "grad_norm": 1.2597936391830444, "learning_rate": 6.545084971874738e-05, "loss": 3.3816, "step": 92 }, { "epoch": 0.17326502095947835, "grad_norm": 1.4741157293319702, "learning_rate": 6.461858523613684e-05, "loss": 3.4027, "step": 93 }, { "epoch": 0.1751280857009781, "grad_norm": 1.5933159589767456, "learning_rate": 6.378186779084995e-05, "loss": 3.4609, "step": 94 }, { "epoch": 0.17699115044247787, "grad_norm": 1.6153674125671387, "learning_rate": 6.294095225512603e-05, "loss": 3.7559, "step": 95 }, { "epoch": 0.17885421518397765, "grad_norm": 1.5263915061950684, "learning_rate": 6.209609477998338e-05, "loss": 3.5021, "step": 96 }, { "epoch": 0.1807172799254774, "grad_norm": 2.1421830654144287, "learning_rate": 6.124755271719325e-05, "loss": 4.2071, "step": 97 }, { "epoch": 0.18258034466697717, "grad_norm": 1.6600545644760132, "learning_rate": 6.0395584540887963e-05, "loss": 3.4893, "step": 98 }, { "epoch": 0.18444340940847695, "grad_norm": 2.4432666301727295, "learning_rate": 5.9540449768827246e-05, "loss": 3.889, "step": 99 }, { "epoch": 0.18630647414997673, "grad_norm": 3.4110183715820312, "learning_rate": 5.868240888334653e-05, "loss": 4.2759, "step": 100 }, { "epoch": 0.18630647414997673, "eval_loss": 3.3999626636505127, "eval_runtime": 32.1259, "eval_samples_per_second": 28.17, "eval_steps_per_second": 14.101, "step": 100 }, { "epoch": 0.18816953889147647, "grad_norm": 1.373574137687683, "learning_rate": 5.782172325201155e-05, "loss": 3.0844, "step": 101 }, { "epoch": 0.19003260363297625, "grad_norm": 1.2612931728363037, "learning_rate": 5.695865504800327e-05, "loss": 3.4433, "step": 102 }, { "epoch": 0.19189566837447602, "grad_norm": 1.346120834350586, "learning_rate": 5.6093467170257374e-05, "loss": 3.3105, "step": 103 }, { "epoch": 0.19375873311597577, "grad_norm": 1.3103049993515015, "learning_rate": 5.522642316338268e-05, "loss": 3.7383, "step": 104 }, { "epoch": 0.19562179785747555, "grad_norm": 1.2798504829406738, "learning_rate": 5.435778713738292e-05, "loss": 3.0195, "step": 105 }, { "epoch": 0.19748486259897532, "grad_norm": 1.2848749160766602, "learning_rate": 5.348782368720626e-05, "loss": 2.9208, "step": 106 }, { "epoch": 0.19934792734047507, "grad_norm": 1.1921271085739136, "learning_rate": 5.26167978121472e-05, "loss": 3.238, "step": 107 }, { "epoch": 0.20121099208197485, "grad_norm": 1.2079963684082031, "learning_rate": 5.174497483512506e-05, "loss": 3.2653, "step": 108 }, { "epoch": 0.20307405682347462, "grad_norm": 1.150623083114624, "learning_rate": 5.0872620321864185e-05, "loss": 3.3047, "step": 109 }, { "epoch": 0.20493712156497437, "grad_norm": 1.12302565574646, "learning_rate": 5e-05, "loss": 2.9365, "step": 110 }, { "epoch": 0.20680018630647415, "grad_norm": 0.8901563882827759, "learning_rate": 4.912737967813583e-05, "loss": 2.5866, "step": 111 }, { "epoch": 0.20866325104797392, "grad_norm": 1.5094679594039917, "learning_rate": 4.825502516487497e-05, "loss": 3.5975, "step": 112 }, { "epoch": 0.21052631578947367, "grad_norm": 1.1921519041061401, "learning_rate": 4.738320218785281e-05, "loss": 3.0366, "step": 113 }, { "epoch": 0.21238938053097345, "grad_norm": 1.2302180528640747, "learning_rate": 4.6512176312793736e-05, "loss": 3.1477, "step": 114 }, { "epoch": 0.21425244527247322, "grad_norm": 0.9722185730934143, "learning_rate": 4.564221286261709e-05, "loss": 2.9138, "step": 115 }, { "epoch": 0.21611551001397297, "grad_norm": 1.2122098207473755, "learning_rate": 4.477357683661734e-05, "loss": 3.3952, "step": 116 }, { "epoch": 0.21797857475547275, "grad_norm": 1.0260534286499023, "learning_rate": 4.390653282974264e-05, "loss": 3.0228, "step": 117 }, { "epoch": 0.21984163949697252, "grad_norm": 1.134804606437683, "learning_rate": 4.3041344951996746e-05, "loss": 3.2426, "step": 118 }, { "epoch": 0.2217047042384723, "grad_norm": 1.2314471006393433, "learning_rate": 4.2178276747988446e-05, "loss": 3.1156, "step": 119 }, { "epoch": 0.22356776897997205, "grad_norm": 1.9310802221298218, "learning_rate": 4.131759111665349e-05, "loss": 3.2605, "step": 120 }, { "epoch": 0.22543083372147182, "grad_norm": 1.244099497795105, "learning_rate": 4.045955023117276e-05, "loss": 3.3355, "step": 121 }, { "epoch": 0.2272938984629716, "grad_norm": 1.1614549160003662, "learning_rate": 3.960441545911204e-05, "loss": 2.9461, "step": 122 }, { "epoch": 0.22915696320447135, "grad_norm": 1.1840636730194092, "learning_rate": 3.875244728280676e-05, "loss": 3.1649, "step": 123 }, { "epoch": 0.23102002794597112, "grad_norm": 1.1917698383331299, "learning_rate": 3.790390522001662e-05, "loss": 3.2872, "step": 124 }, { "epoch": 0.2328830926874709, "grad_norm": 1.3176980018615723, "learning_rate": 3.705904774487396e-05, "loss": 3.4799, "step": 125 }, { "epoch": 0.23474615742897065, "grad_norm": 1.1151647567749023, "learning_rate": 3.6218132209150045e-05, "loss": 2.9211, "step": 126 }, { "epoch": 0.23660922217047042, "grad_norm": 1.0221425294876099, "learning_rate": 3.5381414763863166e-05, "loss": 2.8233, "step": 127 }, { "epoch": 0.2384722869119702, "grad_norm": 1.9823639392852783, "learning_rate": 3.4549150281252636e-05, "loss": 3.0263, "step": 128 }, { "epoch": 0.24033535165346995, "grad_norm": 1.2429327964782715, "learning_rate": 3.372159227714218e-05, "loss": 3.3876, "step": 129 }, { "epoch": 0.24219841639496972, "grad_norm": 1.3118374347686768, "learning_rate": 3.289899283371657e-05, "loss": 3.3716, "step": 130 }, { "epoch": 0.2440614811364695, "grad_norm": 1.3129647970199585, "learning_rate": 3.2081602522734986e-05, "loss": 3.2332, "step": 131 }, { "epoch": 0.24592454587796925, "grad_norm": 1.2996779680252075, "learning_rate": 3.12696703292044e-05, "loss": 3.4357, "step": 132 }, { "epoch": 0.24778761061946902, "grad_norm": 1.2652643918991089, "learning_rate": 3.046344357553632e-05, "loss": 3.3381, "step": 133 }, { "epoch": 0.2496506753609688, "grad_norm": 1.2161061763763428, "learning_rate": 2.9663167846209998e-05, "loss": 3.4872, "step": 134 }, { "epoch": 0.2515137401024686, "grad_norm": 1.251867651939392, "learning_rate": 2.886908691296504e-05, "loss": 3.2131, "step": 135 }, { "epoch": 0.25337680484396835, "grad_norm": 1.3443360328674316, "learning_rate": 2.8081442660546125e-05, "loss": 3.1788, "step": 136 }, { "epoch": 0.25523986958546807, "grad_norm": 1.188568115234375, "learning_rate": 2.7300475013022663e-05, "loss": 3.4101, "step": 137 }, { "epoch": 0.25710293432696785, "grad_norm": 1.5125073194503784, "learning_rate": 2.6526421860705473e-05, "loss": 3.6851, "step": 138 }, { "epoch": 0.2589659990684676, "grad_norm": 1.3027081489562988, "learning_rate": 2.575951898768315e-05, "loss": 3.3351, "step": 139 }, { "epoch": 0.2608290638099674, "grad_norm": 1.2359957695007324, "learning_rate": 2.500000000000001e-05, "loss": 3.2338, "step": 140 }, { "epoch": 0.2626921285514672, "grad_norm": 1.3322489261627197, "learning_rate": 2.4248096254497288e-05, "loss": 2.8686, "step": 141 }, { "epoch": 0.26455519329296695, "grad_norm": 2.0788941383361816, "learning_rate": 2.350403678833976e-05, "loss": 3.6646, "step": 142 }, { "epoch": 0.26641825803446667, "grad_norm": 1.3164119720458984, "learning_rate": 2.2768048249248648e-05, "loss": 3.4545, "step": 143 }, { "epoch": 0.26828132277596645, "grad_norm": 1.652212381362915, "learning_rate": 2.2040354826462668e-05, "loss": 3.5121, "step": 144 }, { "epoch": 0.2701443875174662, "grad_norm": 1.5663607120513916, "learning_rate": 2.132117818244771e-05, "loss": 3.7725, "step": 145 }, { "epoch": 0.272007452258966, "grad_norm": 1.5914528369903564, "learning_rate": 2.061073738537635e-05, "loss": 3.7862, "step": 146 }, { "epoch": 0.2738705170004658, "grad_norm": 1.9115759134292603, "learning_rate": 1.9909248842397584e-05, "loss": 4.0243, "step": 147 }, { "epoch": 0.27573358174196555, "grad_norm": 1.703169584274292, "learning_rate": 1.9216926233717085e-05, "loss": 3.3129, "step": 148 }, { "epoch": 0.2775966464834653, "grad_norm": 1.8172022104263306, "learning_rate": 1.8533980447508137e-05, "loss": 3.9249, "step": 149 }, { "epoch": 0.27945971122496505, "grad_norm": 2.922093391418457, "learning_rate": 1.7860619515673033e-05, "loss": 4.2185, "step": 150 }, { "epoch": 0.27945971122496505, "eval_loss": 3.3465943336486816, "eval_runtime": 32.1695, "eval_samples_per_second": 28.132, "eval_steps_per_second": 14.082, "step": 150 }, { "epoch": 0.2813227759664648, "grad_norm": 1.0994431972503662, "learning_rate": 1.7197048550474643e-05, "loss": 2.7072, "step": 151 }, { "epoch": 0.2831858407079646, "grad_norm": 1.0657951831817627, "learning_rate": 1.6543469682057106e-05, "loss": 3.1659, "step": 152 }, { "epoch": 0.2850489054494644, "grad_norm": 1.3122929334640503, "learning_rate": 1.5900081996875083e-05, "loss": 3.1423, "step": 153 }, { "epoch": 0.28691197019096415, "grad_norm": 1.0391207933425903, "learning_rate": 1.526708147705013e-05, "loss": 2.8446, "step": 154 }, { "epoch": 0.2887750349324639, "grad_norm": 1.1318707466125488, "learning_rate": 1.4644660940672627e-05, "loss": 3.1408, "step": 155 }, { "epoch": 0.29063809967396365, "grad_norm": 1.162157416343689, "learning_rate": 1.4033009983067452e-05, "loss": 2.852, "step": 156 }, { "epoch": 0.2925011644154634, "grad_norm": 1.4153108596801758, "learning_rate": 1.3432314919041478e-05, "loss": 3.2088, "step": 157 }, { "epoch": 0.2943642291569632, "grad_norm": 1.2663054466247559, "learning_rate": 1.2842758726130283e-05, "loss": 3.3267, "step": 158 }, { "epoch": 0.296227293898463, "grad_norm": 1.0257394313812256, "learning_rate": 1.22645209888614e-05, "loss": 3.1659, "step": 159 }, { "epoch": 0.29809035863996275, "grad_norm": 1.4004676342010498, "learning_rate": 1.1697777844051105e-05, "loss": 3.1246, "step": 160 }, { "epoch": 0.2999534233814625, "grad_norm": 1.103905439376831, "learning_rate": 1.1142701927151456e-05, "loss": 3.2662, "step": 161 }, { "epoch": 0.30181648812296225, "grad_norm": 1.0684690475463867, "learning_rate": 1.0599462319663905e-05, "loss": 3.0609, "step": 162 }, { "epoch": 0.303679552864462, "grad_norm": 1.1447290182113647, "learning_rate": 1.006822449763537e-05, "loss": 2.9505, "step": 163 }, { "epoch": 0.3055426176059618, "grad_norm": 1.2237478494644165, "learning_rate": 9.549150281252633e-06, "loss": 3.2494, "step": 164 }, { "epoch": 0.3074056823474616, "grad_norm": 1.0637820959091187, "learning_rate": 9.042397785550405e-06, "loss": 2.9076, "step": 165 }, { "epoch": 0.30926874708896135, "grad_norm": 1.2343932390213013, "learning_rate": 8.548121372247918e-06, "loss": 3.3097, "step": 166 }, { "epoch": 0.3111318118304611, "grad_norm": 1.0267237424850464, "learning_rate": 8.066471602728803e-06, "loss": 2.92, "step": 167 }, { "epoch": 0.3129948765719609, "grad_norm": 1.1801772117614746, "learning_rate": 7.597595192178702e-06, "loss": 3.0281, "step": 168 }, { "epoch": 0.3148579413134606, "grad_norm": 1.3864883184432983, "learning_rate": 7.1416349648943894e-06, "loss": 3.2964, "step": 169 }, { "epoch": 0.3167210060549604, "grad_norm": 1.212416410446167, "learning_rate": 6.698729810778065e-06, "loss": 3.1213, "step": 170 }, { "epoch": 0.3185840707964602, "grad_norm": 1.1211936473846436, "learning_rate": 6.269014643030213e-06, "loss": 2.8045, "step": 171 }, { "epoch": 0.32044713553795995, "grad_norm": 1.1482990980148315, "learning_rate": 5.852620357053651e-06, "loss": 3.0468, "step": 172 }, { "epoch": 0.3223102002794597, "grad_norm": 1.3345028162002563, "learning_rate": 5.449673790581611e-06, "loss": 2.9481, "step": 173 }, { "epoch": 0.3241732650209595, "grad_norm": 1.0090028047561646, "learning_rate": 5.060297685041659e-06, "loss": 3.0541, "step": 174 }, { "epoch": 0.3260363297624592, "grad_norm": 1.2291425466537476, "learning_rate": 4.684610648167503e-06, "loss": 3.4524, "step": 175 }, { "epoch": 0.327899394503959, "grad_norm": 1.4188092947006226, "learning_rate": 4.322727117869951e-06, "loss": 3.387, "step": 176 }, { "epoch": 0.32976245924545877, "grad_norm": 1.2818740606307983, "learning_rate": 3.974757327377981e-06, "loss": 3.5364, "step": 177 }, { "epoch": 0.33162552398695855, "grad_norm": 1.3271952867507935, "learning_rate": 3.6408072716606346e-06, "loss": 3.1821, "step": 178 }, { "epoch": 0.3334885887284583, "grad_norm": 1.2895276546478271, "learning_rate": 3.3209786751399187e-06, "loss": 3.0432, "step": 179 }, { "epoch": 0.3353516534699581, "grad_norm": 1.2879928350448608, "learning_rate": 3.0153689607045845e-06, "loss": 3.367, "step": 180 }, { "epoch": 0.3372147182114579, "grad_norm": 1.3287880420684814, "learning_rate": 2.724071220034158e-06, "loss": 3.2295, "step": 181 }, { "epoch": 0.3390777829529576, "grad_norm": 1.314154863357544, "learning_rate": 2.4471741852423237e-06, "loss": 3.3232, "step": 182 }, { "epoch": 0.34094084769445737, "grad_norm": 1.3569366931915283, "learning_rate": 2.1847622018482283e-06, "loss": 2.9797, "step": 183 }, { "epoch": 0.34280391243595715, "grad_norm": 1.5291014909744263, "learning_rate": 1.9369152030840556e-06, "loss": 3.2835, "step": 184 }, { "epoch": 0.3446669771774569, "grad_norm": 1.3267407417297363, "learning_rate": 1.70370868554659e-06, "loss": 3.1526, "step": 185 }, { "epoch": 0.3465300419189567, "grad_norm": 1.36220383644104, "learning_rate": 1.4852136862001764e-06, "loss": 3.7465, "step": 186 }, { "epoch": 0.3483931066604565, "grad_norm": 1.2978960275650024, "learning_rate": 1.2814967607382432e-06, "loss": 3.7277, "step": 187 }, { "epoch": 0.3502561714019562, "grad_norm": 1.3940683603286743, "learning_rate": 1.0926199633097157e-06, "loss": 3.4685, "step": 188 }, { "epoch": 0.35211923614345597, "grad_norm": 1.4953304529190063, "learning_rate": 9.186408276168013e-07, "loss": 3.5472, "step": 189 }, { "epoch": 0.35398230088495575, "grad_norm": 1.4648510217666626, "learning_rate": 7.596123493895991e-07, "loss": 3.3383, "step": 190 }, { "epoch": 0.3558453656264555, "grad_norm": 1.5643545389175415, "learning_rate": 6.15582970243117e-07, "loss": 3.4377, "step": 191 }, { "epoch": 0.3577084303679553, "grad_norm": 1.4430068731307983, "learning_rate": 4.865965629214819e-07, "loss": 3.6607, "step": 192 }, { "epoch": 0.3595714951094551, "grad_norm": 1.5247834920883179, "learning_rate": 3.7269241793390085e-07, "loss": 3.8584, "step": 193 }, { "epoch": 0.3614345598509548, "grad_norm": 1.5962194204330444, "learning_rate": 2.7390523158633554e-07, "loss": 3.6967, "step": 194 }, { "epoch": 0.36329762459245457, "grad_norm": 3.414898633956909, "learning_rate": 1.9026509541272275e-07, "loss": 3.7934, "step": 195 }, { "epoch": 0.36516068933395435, "grad_norm": 1.7514005899429321, "learning_rate": 1.2179748700879012e-07, "loss": 3.9039, "step": 196 }, { "epoch": 0.3670237540754541, "grad_norm": 1.91195809841156, "learning_rate": 6.852326227130834e-08, "loss": 3.604, "step": 197 }, { "epoch": 0.3688868188169539, "grad_norm": 2.1014304161071777, "learning_rate": 3.04586490452119e-08, "loss": 3.6654, "step": 198 }, { "epoch": 0.3707498835584537, "grad_norm": 2.118112564086914, "learning_rate": 7.615242180436522e-09, "loss": 3.8243, "step": 199 }, { "epoch": 0.37261294829995345, "grad_norm": 3.0300862789154053, "learning_rate": 0.0, "loss": 3.6882, "step": 200 }, { "epoch": 0.37261294829995345, "eval_loss": 3.3320932388305664, "eval_runtime": 32.1296, "eval_samples_per_second": 28.167, "eval_steps_per_second": 14.099, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.4428946137088e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }