|
{ |
|
"best_metric": 1.3063520193099976, |
|
"best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Meta-Llama-3-8B-Instruct-miracl-mix-raft-sft-25th-apr-v1.0/checkpoint-2000", |
|
"epoch": 0.9996544972935621, |
|
"eval_steps": 200, |
|
"global_step": 2170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0262953847325709, |
|
"learning_rate": 4.608294930875576e-08, |
|
"loss": 1.7621, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.954880939596983, |
|
"learning_rate": 2.3041474654377884e-07, |
|
"loss": 1.7602, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0004644202041775, |
|
"learning_rate": 4.608294930875577e-07, |
|
"loss": 1.8162, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0280115397490373, |
|
"learning_rate": 6.912442396313365e-07, |
|
"loss": 1.7724, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8732665912953524, |
|
"learning_rate": 9.216589861751154e-07, |
|
"loss": 1.7378, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9441885280056963, |
|
"learning_rate": 1.1520737327188942e-06, |
|
"loss": 1.7507, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9845213443275476, |
|
"learning_rate": 1.382488479262673e-06, |
|
"loss": 1.7561, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9521516823054887, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 1.7479, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.0047303931298321, |
|
"learning_rate": 1.8433179723502307e-06, |
|
"loss": 1.7773, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9230639224993269, |
|
"learning_rate": 2.0737327188940094e-06, |
|
"loss": 1.7752, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9112137084493539, |
|
"learning_rate": 2.3041474654377884e-06, |
|
"loss": 1.7213, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8909029043612109, |
|
"learning_rate": 2.5345622119815673e-06, |
|
"loss": 1.7881, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7189360071641709, |
|
"learning_rate": 2.764976958525346e-06, |
|
"loss": 1.6636, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6692633878715751, |
|
"learning_rate": 2.9953917050691243e-06, |
|
"loss": 1.6508, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6188511577538577, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 1.7011, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6279410712564498, |
|
"learning_rate": 3.4562211981566825e-06, |
|
"loss": 1.6403, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.5342697084988228, |
|
"learning_rate": 3.6866359447004615e-06, |
|
"loss": 1.656, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.44402512139023, |
|
"learning_rate": 3.91705069124424e-06, |
|
"loss": 1.6617, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4644619191475398, |
|
"learning_rate": 4.147465437788019e-06, |
|
"loss": 1.6303, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.371289417426217, |
|
"learning_rate": 4.377880184331797e-06, |
|
"loss": 1.6453, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3309787275557066, |
|
"learning_rate": 4.608294930875577e-06, |
|
"loss": 1.5851, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.31477252449769144, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 1.604, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2612544823481869, |
|
"learning_rate": 5.0691244239631346e-06, |
|
"loss": 1.5725, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2605484332821398, |
|
"learning_rate": 5.299539170506913e-06, |
|
"loss": 1.5644, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.25148232016385397, |
|
"learning_rate": 5.529953917050692e-06, |
|
"loss": 1.5467, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2601998287824473, |
|
"learning_rate": 5.76036866359447e-06, |
|
"loss": 1.589, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2385121609938763, |
|
"learning_rate": 5.9907834101382485e-06, |
|
"loss": 1.6055, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.24629649793790628, |
|
"learning_rate": 6.221198156682028e-06, |
|
"loss": 1.5373, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2468655185850127, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 1.5614, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.22223391970377232, |
|
"learning_rate": 6.682027649769586e-06, |
|
"loss": 1.5624, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2062049955510628, |
|
"learning_rate": 6.912442396313365e-06, |
|
"loss": 1.5013, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.19876212655323225, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 1.575, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.19765238479942285, |
|
"learning_rate": 7.373271889400923e-06, |
|
"loss": 1.5167, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.19240215611308686, |
|
"learning_rate": 7.603686635944701e-06, |
|
"loss": 1.5071, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2001053295395004, |
|
"learning_rate": 7.83410138248848e-06, |
|
"loss": 1.4932, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.20863957442778325, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 1.5371, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.1880073967302754, |
|
"learning_rate": 8.294930875576038e-06, |
|
"loss": 1.492, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.18052519991071567, |
|
"learning_rate": 8.525345622119815e-06, |
|
"loss": 1.5039, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.17252733686762506, |
|
"learning_rate": 8.755760368663595e-06, |
|
"loss": 1.492, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.1754856007551659, |
|
"learning_rate": 8.986175115207374e-06, |
|
"loss": 1.4926, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.16521189205999245, |
|
"learning_rate": 9.216589861751153e-06, |
|
"loss": 1.4903, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.3960996866226196, |
|
"eval_runtime": 1753.0374, |
|
"eval_samples_per_second": 2.162, |
|
"eval_steps_per_second": 0.27, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.1618648599795676, |
|
"learning_rate": 9.447004608294931e-06, |
|
"loss": 1.4499, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.1609921643942943, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 1.4775, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.161807092383862, |
|
"learning_rate": 9.90783410138249e-06, |
|
"loss": 1.4779, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.15789342984068674, |
|
"learning_rate": 9.999941779365509e-06, |
|
"loss": 1.4064, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.17953421611953463, |
|
"learning_rate": 9.99958599150926e-06, |
|
"loss": 1.4216, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.17033046165043145, |
|
"learning_rate": 9.998906783581494e-06, |
|
"loss": 1.4872, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.15935058975437466, |
|
"learning_rate": 9.997904199519748e-06, |
|
"loss": 1.4473, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.16188090301066688, |
|
"learning_rate": 9.996578304180551e-06, |
|
"loss": 1.4484, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.1692826467439708, |
|
"learning_rate": 9.994929183335237e-06, |
|
"loss": 1.4576, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.16233371930510018, |
|
"learning_rate": 9.992956943664401e-06, |
|
"loss": 1.4674, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.1601701723529579, |
|
"learning_rate": 9.99066171275098e-06, |
|
"loss": 1.434, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.1515934744211598, |
|
"learning_rate": 9.988043639072021e-06, |
|
"loss": 1.469, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.1558360755989006, |
|
"learning_rate": 9.985102891989063e-06, |
|
"loss": 1.4688, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.20565866787405954, |
|
"learning_rate": 9.98183966173718e-06, |
|
"loss": 1.4794, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.16589082756750548, |
|
"learning_rate": 9.97825415941269e-06, |
|
"loss": 1.4514, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.16179243903344054, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 1.4802, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.15782689090886812, |
|
"learning_rate": 9.970117287154004e-06, |
|
"loss": 1.4356, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.15954652235182576, |
|
"learning_rate": 9.965566443588956e-06, |
|
"loss": 1.3886, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.16253218390477409, |
|
"learning_rate": 9.960694380655539e-06, |
|
"loss": 1.456, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.15571215758010262, |
|
"learning_rate": 9.955501413524438e-06, |
|
"loss": 1.4038, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.15764668416666167, |
|
"learning_rate": 9.949987878125427e-06, |
|
"loss": 1.4292, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.15565461047093965, |
|
"learning_rate": 9.944154131125643e-06, |
|
"loss": 1.3845, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.16274505869667716, |
|
"learning_rate": 9.938000549906509e-06, |
|
"loss": 1.4143, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.15771377018169355, |
|
"learning_rate": 9.93152753253932e-06, |
|
"loss": 1.414, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.15819595549860918, |
|
"learning_rate": 9.924735497759497e-06, |
|
"loss": 1.398, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.16836842547624103, |
|
"learning_rate": 9.917624884939495e-06, |
|
"loss": 1.415, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.16454346246919038, |
|
"learning_rate": 9.910196154060381e-06, |
|
"loss": 1.5025, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.15952712351687925, |
|
"learning_rate": 9.902449785682084e-06, |
|
"loss": 1.4602, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.15913354058980336, |
|
"learning_rate": 9.894386280912298e-06, |
|
"loss": 1.4437, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.16526633029775475, |
|
"learning_rate": 9.88600616137407e-06, |
|
"loss": 1.443, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.1570973278965336, |
|
"learning_rate": 9.877309969172065e-06, |
|
"loss": 1.4001, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.16323401596231638, |
|
"learning_rate": 9.868298266857477e-06, |
|
"loss": 1.4115, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.16999252145703372, |
|
"learning_rate": 9.858971637391662e-06, |
|
"loss": 1.431, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.1598876979960763, |
|
"learning_rate": 9.849330684108409e-06, |
|
"loss": 1.3925, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.16716694688842804, |
|
"learning_rate": 9.83937603067492e-06, |
|
"loss": 1.4369, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.17160997265475317, |
|
"learning_rate": 9.829108321051461e-06, |
|
"loss": 1.4236, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1709419213466886, |
|
"learning_rate": 9.818528219449705e-06, |
|
"loss": 1.4156, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1706660498176678, |
|
"learning_rate": 9.807636410289767e-06, |
|
"loss": 1.3531, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1654207117166432, |
|
"learning_rate": 9.796433598155928e-06, |
|
"loss": 1.4282, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.17005569486504588, |
|
"learning_rate": 9.784920507751052e-06, |
|
"loss": 1.465, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 1.3499208688735962, |
|
"eval_runtime": 1761.5948, |
|
"eval_samples_per_second": 2.151, |
|
"eval_steps_per_second": 0.269, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.16465910018813473, |
|
"learning_rate": 9.773097883849715e-06, |
|
"loss": 1.4856, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1687241699251977, |
|
"learning_rate": 9.760966491250018e-06, |
|
"loss": 1.4448, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.173826161389088, |
|
"learning_rate": 9.748527114724111e-06, |
|
"loss": 1.4588, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1664583424134419, |
|
"learning_rate": 9.735780558967434e-06, |
|
"loss": 1.3651, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.16305809613976227, |
|
"learning_rate": 9.72272764854666e-06, |
|
"loss": 1.386, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.16712240916605367, |
|
"learning_rate": 9.709369227846346e-06, |
|
"loss": 1.4249, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1685889272801495, |
|
"learning_rate": 9.695706161014322e-06, |
|
"loss": 1.4629, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.16213828032523545, |
|
"learning_rate": 9.681739331905784e-06, |
|
"loss": 1.4633, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1722162760562697, |
|
"learning_rate": 9.667469644026118e-06, |
|
"loss": 1.4147, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.16634245789106444, |
|
"learning_rate": 9.652898020472449e-06, |
|
"loss": 1.4254, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.17414333030226264, |
|
"learning_rate": 9.638025403873939e-06, |
|
"loss": 1.3734, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.17078620575685172, |
|
"learning_rate": 9.622852756330797e-06, |
|
"loss": 1.4313, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.17201073443780537, |
|
"learning_rate": 9.60738105935204e-06, |
|
"loss": 1.4412, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.1781118635582674, |
|
"learning_rate": 9.59161131379201e-06, |
|
"loss": 1.4102, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.17417835939253798, |
|
"learning_rate": 9.575544539785626e-06, |
|
"loss": 1.4311, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.17050610923384396, |
|
"learning_rate": 9.559181776682387e-06, |
|
"loss": 1.4627, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.176092363473546, |
|
"learning_rate": 9.542524082979138e-06, |
|
"loss": 1.4517, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.17498193119749225, |
|
"learning_rate": 9.525572536251608e-06, |
|
"loss": 1.3956, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.17003085925682157, |
|
"learning_rate": 9.50832823308468e-06, |
|
"loss": 1.4012, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.17064790853058265, |
|
"learning_rate": 9.490792289001476e-06, |
|
"loss": 1.3523, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.1799693810618531, |
|
"learning_rate": 9.472965838391187e-06, |
|
"loss": 1.4446, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.16868170325096435, |
|
"learning_rate": 9.454850034435679e-06, |
|
"loss": 1.3912, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.16981991686336434, |
|
"learning_rate": 9.436446049034913e-06, |
|
"loss": 1.3986, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1755120644725739, |
|
"learning_rate": 9.417755072731121e-06, |
|
"loss": 1.4117, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1757934575464805, |
|
"learning_rate": 9.398778314631801e-06, |
|
"loss": 1.3587, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.173876963970309, |
|
"learning_rate": 9.379517002331489e-06, |
|
"loss": 1.3862, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.17746114421749437, |
|
"learning_rate": 9.359972381832358e-06, |
|
"loss": 1.4309, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.16869333613216586, |
|
"learning_rate": 9.340145717463609e-06, |
|
"loss": 1.4118, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.17746743334088458, |
|
"learning_rate": 9.320038291799679e-06, |
|
"loss": 1.4433, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.17587914404814756, |
|
"learning_rate": 9.299651405577286e-06, |
|
"loss": 1.4421, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.18048857244310448, |
|
"learning_rate": 9.278986377611266e-06, |
|
"loss": 1.4221, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.17371693231140117, |
|
"learning_rate": 9.258044544709276e-06, |
|
"loss": 1.4131, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.17693122935797964, |
|
"learning_rate": 9.236827261585306e-06, |
|
"loss": 1.4205, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.18630601640696606, |
|
"learning_rate": 9.215335900772048e-06, |
|
"loss": 1.4067, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.17220695844147543, |
|
"learning_rate": 9.193571852532112e-06, |
|
"loss": 1.3834, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.18435801453629633, |
|
"learning_rate": 9.17153652476808e-06, |
|
"loss": 1.3485, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.1719982815834864, |
|
"learning_rate": 9.14923134293144e-06, |
|
"loss": 1.4265, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.17622887749418029, |
|
"learning_rate": 9.126657749930365e-06, |
|
"loss": 1.4242, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.1772303960409497, |
|
"learning_rate": 9.103817206036383e-06, |
|
"loss": 1.3901, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.18039291853040396, |
|
"learning_rate": 9.080711188789903e-06, |
|
"loss": 1.4193, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 1.333003282546997, |
|
"eval_runtime": 1759.7499, |
|
"eval_samples_per_second": 2.154, |
|
"eval_steps_per_second": 0.269, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.18468507528075692, |
|
"learning_rate": 9.057341192904641e-06, |
|
"loss": 1.4663, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17990669625911423, |
|
"learning_rate": 9.033708730170925e-06, |
|
"loss": 1.4289, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17925115015014306, |
|
"learning_rate": 9.009815329357893e-06, |
|
"loss": 1.4337, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.1742038025383068, |
|
"learning_rate": 8.985662536114614e-06, |
|
"loss": 1.4156, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18266353155608991, |
|
"learning_rate": 8.961251912870077e-06, |
|
"loss": 1.3896, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18057511349721395, |
|
"learning_rate": 8.936585038732143e-06, |
|
"loss": 1.3764, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18596859871805246, |
|
"learning_rate": 8.91166350938537e-06, |
|
"loss": 1.4193, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18986971270095016, |
|
"learning_rate": 8.886488936987817e-06, |
|
"loss": 1.3955, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.17418454140586195, |
|
"learning_rate": 8.861062950066723e-06, |
|
"loss": 1.427, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18149732001872015, |
|
"learning_rate": 8.835387193413185e-06, |
|
"loss": 1.4046, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18147198520857166, |
|
"learning_rate": 8.809463327975741e-06, |
|
"loss": 1.4058, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18099108296498378, |
|
"learning_rate": 8.783293030752932e-06, |
|
"loss": 1.4066, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.1794240687483909, |
|
"learning_rate": 8.756877994684818e-06, |
|
"loss": 1.3921, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.1839307276635119, |
|
"learning_rate": 8.730219928543458e-06, |
|
"loss": 1.4054, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.18145863325096048, |
|
"learning_rate": 8.703320556822375e-06, |
|
"loss": 1.4053, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.1808487553528171, |
|
"learning_rate": 8.676181619624996e-06, |
|
"loss": 1.4055, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1862476804387168, |
|
"learning_rate": 8.648804872552092e-06, |
|
"loss": 1.3841, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.19563150015616995, |
|
"learning_rate": 8.6211920865882e-06, |
|
"loss": 1.371, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.18923848385153544, |
|
"learning_rate": 8.593345047987069e-06, |
|
"loss": 1.3988, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.18078263758464272, |
|
"learning_rate": 8.565265558156101e-06, |
|
"loss": 1.4024, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.18258637947544226, |
|
"learning_rate": 8.536955433539824e-06, |
|
"loss": 1.371, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.19116331272141834, |
|
"learning_rate": 8.508416505502383e-06, |
|
"loss": 1.4456, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.17469959227839357, |
|
"learning_rate": 8.479650620209072e-06, |
|
"loss": 1.385, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.18410741548679613, |
|
"learning_rate": 8.450659638506908e-06, |
|
"loss": 1.4095, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.19634261946282605, |
|
"learning_rate": 8.421445435804255e-06, |
|
"loss": 1.3513, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.17826721350047323, |
|
"learning_rate": 8.3920099019495e-06, |
|
"loss": 1.3792, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1826653979119606, |
|
"learning_rate": 8.362354941108803e-06, |
|
"loss": 1.4448, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.18664731594802075, |
|
"learning_rate": 8.33248247164292e-06, |
|
"loss": 1.3751, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.18231556377003602, |
|
"learning_rate": 8.3023944259831e-06, |
|
"loss": 1.3773, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.18711137034484868, |
|
"learning_rate": 8.272092750506084e-06, |
|
"loss": 1.4096, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.1877955269203901, |
|
"learning_rate": 8.241579405408192e-06, |
|
"loss": 1.3902, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.18482019451091206, |
|
"learning_rate": 8.21085636457851e-06, |
|
"loss": 1.3734, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.19891330231660218, |
|
"learning_rate": 8.179925615471218e-06, |
|
"loss": 1.4061, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.18663983192529415, |
|
"learning_rate": 8.148789158977012e-06, |
|
"loss": 1.3326, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1874487096476331, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 1.3384, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.18710305973456598, |
|
"learning_rate": 8.085907193795745e-06, |
|
"loss": 1.3828, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.18416014945175566, |
|
"learning_rate": 8.05416575290344e-06, |
|
"loss": 1.3737, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.18615555988464447, |
|
"learning_rate": 8.022226739950587e-06, |
|
"loss": 1.4359, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18594902983475312, |
|
"learning_rate": 7.990092221051835e-06, |
|
"loss": 1.389, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18537070852284854, |
|
"learning_rate": 7.95776427496899e-06, |
|
"loss": 1.3593, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 1.323183298110962, |
|
"eval_runtime": 1742.9319, |
|
"eval_samples_per_second": 2.174, |
|
"eval_steps_per_second": 0.272, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.1908169492182471, |
|
"learning_rate": 7.925244992976538e-06, |
|
"loss": 1.3406, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18784810075497232, |
|
"learning_rate": 7.89253647872637e-06, |
|
"loss": 1.3842, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.19406647113841424, |
|
"learning_rate": 7.859640848111686e-06, |
|
"loss": 1.4286, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.19197603494160256, |
|
"learning_rate": 7.826560229130132e-06, |
|
"loss": 1.3928, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.19099716433921685, |
|
"learning_rate": 7.793296761746126e-06, |
|
"loss": 1.362, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.18788356013356616, |
|
"learning_rate": 7.759852597752447e-06, |
|
"loss": 1.4034, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.1921444320557867, |
|
"learning_rate": 7.726229900631015e-06, |
|
"loss": 1.3793, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.20734046130350145, |
|
"learning_rate": 7.692430845412946e-06, |
|
"loss": 1.4203, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.19179256662995678, |
|
"learning_rate": 7.658457618537853e-06, |
|
"loss": 1.4021, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.18555040743415147, |
|
"learning_rate": 7.624312417712403e-06, |
|
"loss": 1.423, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.19398963612254347, |
|
"learning_rate": 7.58999745176815e-06, |
|
"loss": 1.4367, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.19074855950766817, |
|
"learning_rate": 7.555514940518647e-06, |
|
"loss": 1.3695, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1893550054117395, |
|
"learning_rate": 7.520867114615844e-06, |
|
"loss": 1.3939, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.19196726666071628, |
|
"learning_rate": 7.486056215405797e-06, |
|
"loss": 1.3964, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2100904294893222, |
|
"learning_rate": 7.451084494783668e-06, |
|
"loss": 1.3753, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.1870023533707271, |
|
"learning_rate": 7.415954215048057e-06, |
|
"loss": 1.379, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.19635898208120364, |
|
"learning_rate": 7.38066764875465e-06, |
|
"loss": 1.4329, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.1896615635850299, |
|
"learning_rate": 7.345227078569218e-06, |
|
"loss": 1.357, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.19424130426015207, |
|
"learning_rate": 7.309634797119941e-06, |
|
"loss": 1.3774, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.22888693201104138, |
|
"learning_rate": 7.273893106849108e-06, |
|
"loss": 1.3976, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.1919456484613934, |
|
"learning_rate": 7.23800431986417e-06, |
|
"loss": 1.378, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19241540158105003, |
|
"learning_rate": 7.201970757788172e-06, |
|
"loss": 1.4094, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19333437065467562, |
|
"learning_rate": 7.165794751609569e-06, |
|
"loss": 1.3971, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19460601771644864, |
|
"learning_rate": 7.1294786415314336e-06, |
|
"loss": 1.3879, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.18754355788787921, |
|
"learning_rate": 7.093024776820076e-06, |
|
"loss": 1.3534, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.18648196033472134, |
|
"learning_rate": 7.056435515653059e-06, |
|
"loss": 1.3969, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.18580702737411495, |
|
"learning_rate": 7.019713224966664e-06, |
|
"loss": 1.4416, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.24918096880464727, |
|
"learning_rate": 6.9828602803027664e-06, |
|
"loss": 1.3814, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.19003799001704857, |
|
"learning_rate": 6.945879065655164e-06, |
|
"loss": 1.3581, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.19777328354162663, |
|
"learning_rate": 6.90877197331536e-06, |
|
"loss": 1.3883, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.1978053982444075, |
|
"learning_rate": 6.871541403717808e-06, |
|
"loss": 1.4298, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.19360877663534273, |
|
"learning_rate": 6.83418976528462e-06, |
|
"loss": 1.3623, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.19124472077513613, |
|
"learning_rate": 6.7967194742697866e-06, |
|
"loss": 1.3965, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2026314484251062, |
|
"learning_rate": 6.759132954602852e-06, |
|
"loss": 1.3889, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.20177996042623167, |
|
"learning_rate": 6.721432637732117e-06, |
|
"loss": 1.3987, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.19199095023199664, |
|
"learning_rate": 6.6836209624673575e-06, |
|
"loss": 1.3658, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.19092380360827313, |
|
"learning_rate": 6.64570037482205e-06, |
|
"loss": 1.3601, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.21156775391897173, |
|
"learning_rate": 6.607673327855149e-06, |
|
"loss": 1.4427, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.19213500215723073, |
|
"learning_rate": 6.569542281512388e-06, |
|
"loss": 1.3934, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.19675677797230362, |
|
"learning_rate": 6.531309702467159e-06, |
|
"loss": 1.3552, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.3166489601135254, |
|
"eval_runtime": 1748.0429, |
|
"eval_samples_per_second": 2.168, |
|
"eval_steps_per_second": 0.271, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.18781337537819495, |
|
"learning_rate": 6.492978063960942e-06, |
|
"loss": 1.3937, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1935060574506341, |
|
"learning_rate": 6.45454984564331e-06, |
|
"loss": 1.4284, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1936429054806515, |
|
"learning_rate": 6.41602753341152e-06, |
|
"loss": 1.3618, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.19582370428755932, |
|
"learning_rate": 6.377413619249713e-06, |
|
"loss": 1.3822, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.18931877193304708, |
|
"learning_rate": 6.338710601067691e-06, |
|
"loss": 1.3473, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1952618362908433, |
|
"learning_rate": 6.2999209825393445e-06, |
|
"loss": 1.369, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.196265797323174, |
|
"learning_rate": 6.2610472729406905e-06, |
|
"loss": 1.3679, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1909001830769802, |
|
"learning_rate": 6.222091986987534e-06, |
|
"loss": 1.3939, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1974314278029084, |
|
"learning_rate": 6.18305764467281e-06, |
|
"loss": 1.4111, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.19874201405315123, |
|
"learning_rate": 6.143946771103561e-06, |
|
"loss": 1.383, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20109125948229767, |
|
"learning_rate": 6.104761896337581e-06, |
|
"loss": 1.3548, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.18937099603698346, |
|
"learning_rate": 6.0655055552197616e-06, |
|
"loss": 1.4427, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20257668978871882, |
|
"learning_rate": 6.026180287218106e-06, |
|
"loss": 1.3773, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19836764097355777, |
|
"learning_rate": 5.986788636259453e-06, |
|
"loss": 1.3945, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19345413549116036, |
|
"learning_rate": 5.9473331505649125e-06, |
|
"loss": 1.4439, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19576415671480885, |
|
"learning_rate": 5.907816382485026e-06, |
|
"loss": 1.3432, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19643295445396305, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 1.3459, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19760654138918068, |
|
"learning_rate": 5.828609228227603e-06, |
|
"loss": 1.4334, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1907622356589435, |
|
"learning_rate": 5.788923965911028e-06, |
|
"loss": 1.3195, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.19974010922162466, |
|
"learning_rate": 5.749187668599574e-06, |
|
"loss": 1.3973, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.1899811113208983, |
|
"learning_rate": 5.709402906809307e-06, |
|
"loss": 1.3788, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.19125136139691074, |
|
"learning_rate": 5.669572254191431e-06, |
|
"loss": 1.3749, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.19773999696793723, |
|
"learning_rate": 5.6296982873658e-06, |
|
"loss": 1.3812, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.19522230435252472, |
|
"learning_rate": 5.5897835857542315e-06, |
|
"loss": 1.3639, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.19203158758627778, |
|
"learning_rate": 5.549830731413655e-06, |
|
"loss": 1.3988, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.19785365288187984, |
|
"learning_rate": 5.509842308869075e-06, |
|
"loss": 1.4031, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.20110593532241236, |
|
"learning_rate": 5.469820904946383e-06, |
|
"loss": 1.3447, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.20730230846403253, |
|
"learning_rate": 5.429769108605013e-06, |
|
"loss": 1.433, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19538915157122386, |
|
"learning_rate": 5.389689510770462e-06, |
|
"loss": 1.3751, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19657731321379315, |
|
"learning_rate": 5.3495847041666935e-06, |
|
"loss": 1.4427, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19885598934336826, |
|
"learning_rate": 5.30945728314841e-06, |
|
"loss": 1.3526, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19763223830130308, |
|
"learning_rate": 5.269309843533222e-06, |
|
"loss": 1.3792, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.1934600019166271, |
|
"learning_rate": 5.229144982433736e-06, |
|
"loss": 1.3827, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.19215817298202406, |
|
"learning_rate": 5.188965298089538e-06, |
|
"loss": 1.3609, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.1950665854099098, |
|
"learning_rate": 5.148773389699123e-06, |
|
"loss": 1.3728, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.20163181813335146, |
|
"learning_rate": 5.108571857251754e-06, |
|
"loss": 1.3937, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.19492701026725848, |
|
"learning_rate": 5.068363301359263e-06, |
|
"loss": 1.3976, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.1915934216230785, |
|
"learning_rate": 5.0281503230878304e-06, |
|
"loss": 1.3778, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.19100537033272963, |
|
"learning_rate": 4.98793552378971e-06, |
|
"loss": 1.4221, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.19909784027087774, |
|
"learning_rate": 4.947721504934966e-06, |
|
"loss": 1.3685, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 1.3122756481170654, |
|
"eval_runtime": 1747.9099, |
|
"eval_samples_per_second": 2.168, |
|
"eval_steps_per_second": 0.271, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.19577442673853007, |
|
"learning_rate": 4.907510867943167e-06, |
|
"loss": 1.3595, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.2063936713267001, |
|
"learning_rate": 4.867306214015117e-06, |
|
"loss": 1.4202, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.19879562103497236, |
|
"learning_rate": 4.8271101439645765e-06, |
|
"loss": 1.3934, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.19945548173818886, |
|
"learning_rate": 4.786925258050024e-06, |
|
"loss": 1.3395, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.209259697791073, |
|
"learning_rate": 4.746754155806437e-06, |
|
"loss": 1.4072, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.19280116611929857, |
|
"learning_rate": 4.706599435877143e-06, |
|
"loss": 1.3976, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.1990941406891665, |
|
"learning_rate": 4.666463695845701e-06, |
|
"loss": 1.3912, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.19455600992105357, |
|
"learning_rate": 4.626349532067879e-06, |
|
"loss": 1.4003, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.20315382252326522, |
|
"learning_rate": 4.586259539503687e-06, |
|
"loss": 1.3876, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.19085048019709233, |
|
"learning_rate": 4.546196311549515e-06, |
|
"loss": 1.415, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.19155602915410036, |
|
"learning_rate": 4.506162439870366e-06, |
|
"loss": 1.388, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.1962993457750995, |
|
"learning_rate": 4.466160514232206e-06, |
|
"loss": 1.4069, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.19440064625069065, |
|
"learning_rate": 4.426193122334433e-06, |
|
"loss": 1.3625, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.19907155435638502, |
|
"learning_rate": 4.386262849642474e-06, |
|
"loss": 1.3621, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.19846401001306227, |
|
"learning_rate": 4.346372279220543e-06, |
|
"loss": 1.3438, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.19990637435356196, |
|
"learning_rate": 4.306523991564536e-06, |
|
"loss": 1.3857, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1983045043565906, |
|
"learning_rate": 4.266720564435105e-06, |
|
"loss": 1.3477, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2008086079053878, |
|
"learning_rate": 4.226964572690905e-06, |
|
"loss": 1.4032, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2394515243143434, |
|
"learning_rate": 4.187258588122019e-06, |
|
"loss": 1.3757, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1971799914885616, |
|
"learning_rate": 4.147605179283604e-06, |
|
"loss": 1.4156, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2073477392149783, |
|
"learning_rate": 4.108006911329722e-06, |
|
"loss": 1.3881, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2073153166108959, |
|
"learning_rate": 4.068466345847409e-06, |
|
"loss": 1.3687, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.20054177344121227, |
|
"learning_rate": 4.028986040690963e-06, |
|
"loss": 1.3785, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.20604849012426923, |
|
"learning_rate": 3.989568549816479e-06, |
|
"loss": 1.4169, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.19467062948633831, |
|
"learning_rate": 3.9502164231166354e-06, |
|
"loss": 1.4168, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.21219636801396732, |
|
"learning_rate": 3.910932206255742e-06, |
|
"loss": 1.3772, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.20051941796299297, |
|
"learning_rate": 3.87171844050507e-06, |
|
"loss": 1.3864, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.20033592892185176, |
|
"learning_rate": 3.8325776625784464e-06, |
|
"loss": 1.3984, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.19559628194598214, |
|
"learning_rate": 3.793512404468162e-06, |
|
"loss": 1.3954, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.21009377333677687, |
|
"learning_rate": 3.7545251932811824e-06, |
|
"loss": 1.3799, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.2058943301550432, |
|
"learning_rate": 3.7156185510756613e-06, |
|
"loss": 1.3763, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.20090178955938068, |
|
"learning_rate": 3.6767949946978026e-06, |
|
"loss": 1.4162, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.2034374577879598, |
|
"learning_rate": 3.6380570356190346e-06, |
|
"loss": 1.402, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.19652020002950302, |
|
"learning_rate": 3.5994071797735513e-06, |
|
"loss": 1.3667, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.20049219114904884, |
|
"learning_rate": 3.560847927396206e-06, |
|
"loss": 1.419, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.20857502981558484, |
|
"learning_rate": 3.5223817728607675e-06, |
|
"loss": 1.4082, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.19767809519926405, |
|
"learning_rate": 3.484011204518568e-06, |
|
"loss": 1.3947, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1948689279113259, |
|
"learning_rate": 3.4457387045375255e-06, |
|
"loss": 1.3625, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.20429495021474384, |
|
"learning_rate": 3.4075667487415785e-06, |
|
"loss": 1.3978, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2022436664727946, |
|
"learning_rate": 3.3694978064505258e-06, |
|
"loss": 1.3487, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.3093819618225098, |
|
"eval_runtime": 1769.0297, |
|
"eval_samples_per_second": 2.142, |
|
"eval_steps_per_second": 0.268, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2010813579540422, |
|
"learning_rate": 3.331534340320287e-06, |
|
"loss": 1.3582, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.19489960246084403, |
|
"learning_rate": 3.293678806183596e-06, |
|
"loss": 1.42, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.20100567658267351, |
|
"learning_rate": 3.255933652891133e-06, |
|
"loss": 1.3887, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.198604846014806, |
|
"learning_rate": 3.218301322153111e-06, |
|
"loss": 1.3543, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.20073641523853392, |
|
"learning_rate": 3.180784248381322e-06, |
|
"loss": 1.3513, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.19424293274594104, |
|
"learning_rate": 3.1433848585316607e-06, |
|
"loss": 1.3885, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.2028571716247717, |
|
"learning_rate": 3.10610557194712e-06, |
|
"loss": 1.3824, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.1973272586923241, |
|
"learning_rate": 3.068948800201289e-06, |
|
"loss": 1.3332, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.20459650028827572, |
|
"learning_rate": 3.0319169469423487e-06, |
|
"loss": 1.3715, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1966290880736998, |
|
"learning_rate": 2.995012407737581e-06, |
|
"loss": 1.3985, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.20394208158347749, |
|
"learning_rate": 2.958237569918404e-06, |
|
"loss": 1.3867, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1971397504636877, |
|
"learning_rate": 2.9215948124259343e-06, |
|
"loss": 1.3739, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1984132646647921, |
|
"learning_rate": 2.885086505657094e-06, |
|
"loss": 1.4459, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.20375542452519896, |
|
"learning_rate": 2.848715011311271e-06, |
|
"loss": 1.3606, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.19875885515941272, |
|
"learning_rate": 2.8124826822375473e-06, |
|
"loss": 1.4034, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.20571233168262154, |
|
"learning_rate": 2.7763918622824903e-06, |
|
"loss": 1.4358, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.20325983672817444, |
|
"learning_rate": 2.7404448861385293e-06, |
|
"loss": 1.3271, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.204674596724046, |
|
"learning_rate": 2.7046440791929306e-06, |
|
"loss": 1.3656, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19864331126229412, |
|
"learning_rate": 2.6689917573773615e-06, |
|
"loss": 1.3712, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19759957217116436, |
|
"learning_rate": 2.633490227018092e-06, |
|
"loss": 1.4061, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.1963001810018751, |
|
"learning_rate": 2.5981417846867753e-06, |
|
"loss": 1.3753, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2037326390652298, |
|
"learning_rate": 2.5629487170518974e-06, |
|
"loss": 1.3468, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.20092378895163732, |
|
"learning_rate": 2.527913300730863e-06, |
|
"loss": 1.3831, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2075661167474541, |
|
"learning_rate": 2.4930378021426977e-06, |
|
"loss": 1.3786, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2002833079588797, |
|
"learning_rate": 2.4583244773614675e-06, |
|
"loss": 1.4058, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.20306778592495606, |
|
"learning_rate": 2.423775571970301e-06, |
|
"loss": 1.3704, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.22057110661612167, |
|
"learning_rate": 2.3893933209161465e-06, |
|
"loss": 1.3965, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.20413023154970353, |
|
"learning_rate": 2.3551799483651894e-06, |
|
"loss": 1.3935, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2013070052195424, |
|
"learning_rate": 2.321137667558965e-06, |
|
"loss": 1.3757, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.20133662802149876, |
|
"learning_rate": 2.2872686806712037e-06, |
|
"loss": 1.3533, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.19816355686283976, |
|
"learning_rate": 2.2535751786653476e-06, |
|
"loss": 1.4014, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.20020195119951492, |
|
"learning_rate": 2.220059341152837e-06, |
|
"loss": 1.3721, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.20298393289470038, |
|
"learning_rate": 2.1867233362521127e-06, |
|
"loss": 1.3255, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.20399564011279728, |
|
"learning_rate": 2.153569320448348e-06, |
|
"loss": 1.3928, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.20016486527706043, |
|
"learning_rate": 2.120599438453968e-06, |
|
"loss": 1.3769, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.20169522873545517, |
|
"learning_rate": 2.087815823069886e-06, |
|
"loss": 1.3745, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.1991770376256046, |
|
"learning_rate": 2.055220595047551e-06, |
|
"loss": 1.3542, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2083925008192503, |
|
"learning_rate": 2.022815862951751e-06, |
|
"loss": 1.4182, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.20086735172689546, |
|
"learning_rate": 1.990603723024213e-06, |
|
"loss": 1.3524, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.20621921540546848, |
|
"learning_rate": 1.9585862590480005e-06, |
|
"loss": 1.3891, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.3076461553573608, |
|
"eval_runtime": 1777.0113, |
|
"eval_samples_per_second": 2.133, |
|
"eval_steps_per_second": 0.267, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.20007769280547363, |
|
"learning_rate": 1.926765542212707e-06, |
|
"loss": 1.3856, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.1980821730251063, |
|
"learning_rate": 1.8951436309804766e-06, |
|
"loss": 1.383, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.20314121257047116, |
|
"learning_rate": 1.8637225709528506e-06, |
|
"loss": 1.3752, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2000738503413009, |
|
"learning_rate": 1.832504394738428e-06, |
|
"loss": 1.3501, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.19626992677384789, |
|
"learning_rate": 1.8014911218213832e-06, |
|
"loss": 1.3776, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2132143728962325, |
|
"learning_rate": 1.770684758430824e-06, |
|
"loss": 1.3641, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.20230145941791186, |
|
"learning_rate": 1.7400872974110088e-06, |
|
"loss": 1.3714, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.20322390315184813, |
|
"learning_rate": 1.7097007180924375e-06, |
|
"loss": 1.3559, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.2097270533204938, |
|
"learning_rate": 1.6795269861638041e-06, |
|
"loss": 1.3555, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.20409576963271045, |
|
"learning_rate": 1.6495680535448405e-06, |
|
"loss": 1.3376, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.209992582333707, |
|
"learning_rate": 1.6198258582600418e-06, |
|
"loss": 1.3393, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.20186485511811675, |
|
"learning_rate": 1.590302324313303e-06, |
|
"loss": 1.3476, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2039959334961091, |
|
"learning_rate": 1.5609993615634578e-06, |
|
"loss": 1.4172, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2032115658244104, |
|
"learning_rate": 1.531918865600725e-06, |
|
"loss": 1.3866, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.20663125873556282, |
|
"learning_rate": 1.5030627176240903e-06, |
|
"loss": 1.3413, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.21419285282068773, |
|
"learning_rate": 1.4744327843196043e-06, |
|
"loss": 1.3685, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.20427116148089472, |
|
"learning_rate": 1.446030917739633e-06, |
|
"loss": 1.3864, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.21267201464638189, |
|
"learning_rate": 1.4178589551830585e-06, |
|
"loss": 1.3578, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2021417320485247, |
|
"learning_rate": 1.3899187190764062e-06, |
|
"loss": 1.4034, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.19953428976865786, |
|
"learning_rate": 1.3622120168559656e-06, |
|
"loss": 1.3378, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20322168013048264, |
|
"learning_rate": 1.3347406408508695e-06, |
|
"loss": 1.4032, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.19617963419360818, |
|
"learning_rate": 1.3075063681671408e-06, |
|
"loss": 1.3815, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20338294719666272, |
|
"learning_rate": 1.280510960572745e-06, |
|
"loss": 1.376, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20462364560864363, |
|
"learning_rate": 1.2537561643836087e-06, |
|
"loss": 1.3866, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.1963149280229846, |
|
"learning_rate": 1.2272437103506596e-06, |
|
"loss": 1.372, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.19716113021249304, |
|
"learning_rate": 1.200975313547867e-06, |
|
"loss": 1.3599, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.20573469916350295, |
|
"learning_rate": 1.1749526732612842e-06, |
|
"loss": 1.3562, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.20684676012413714, |
|
"learning_rate": 1.1491774728791416e-06, |
|
"loss": 1.3296, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.2090768120512491, |
|
"learning_rate": 1.1236513797829285e-06, |
|
"loss": 1.4248, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.20571837930886522, |
|
"learning_rate": 1.0983760452395415e-06, |
|
"loss": 1.3609, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.20867562597707268, |
|
"learning_rate": 1.07335310429447e-06, |
|
"loss": 1.3848, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.1988423228103918, |
|
"learning_rate": 1.048584175666012e-06, |
|
"loss": 1.3712, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.21063039041467455, |
|
"learning_rate": 1.0240708616405788e-06, |
|
"loss": 1.3611, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.20306439311302277, |
|
"learning_rate": 9.998147479690251e-07, |
|
"loss": 1.3478, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2020964557722793, |
|
"learning_rate": 9.75817403764079e-07, |
|
"loss": 1.3433, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.20597190432737983, |
|
"learning_rate": 9.520803813988366e-07, |
|
"loss": 1.4058, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.20380209329114748, |
|
"learning_rate": 9.286052164063369e-07, |
|
"loss": 1.4028, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2041857004062742, |
|
"learning_rate": 9.053934273802312e-07, |
|
"loss": 1.383, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.20684812528414637, |
|
"learning_rate": 8.824465158765433e-07, |
|
"loss": 1.3512, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.20386591712113425, |
|
"learning_rate": 8.597659663165364e-07, |
|
"loss": 1.3858, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.306676983833313, |
|
"eval_runtime": 1760.6924, |
|
"eval_samples_per_second": 2.153, |
|
"eval_steps_per_second": 0.269, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.19511280510666812, |
|
"learning_rate": 8.373532458906897e-07, |
|
"loss": 1.3261, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.20356348959673148, |
|
"learning_rate": 8.15209804463783e-07, |
|
"loss": 1.3288, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2103018153717413, |
|
"learning_rate": 7.93337074481108e-07, |
|
"loss": 1.4425, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.20468220080909677, |
|
"learning_rate": 7.717364708758024e-07, |
|
"loss": 1.406, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2040164153992187, |
|
"learning_rate": 7.504093909773174e-07, |
|
"loss": 1.3601, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.19961500414001193, |
|
"learning_rate": 7.293572144210332e-07, |
|
"loss": 1.3777, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.20035042813237278, |
|
"learning_rate": 7.085813030590022e-07, |
|
"loss": 1.3944, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.20685428154054034, |
|
"learning_rate": 6.880830008718564e-07, |
|
"loss": 1.3778, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.19642667708352796, |
|
"learning_rate": 6.678636338818645e-07, |
|
"loss": 1.3458, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.20222774080494071, |
|
"learning_rate": 6.47924510067151e-07, |
|
"loss": 1.3655, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.20419539962584285, |
|
"learning_rate": 6.282669192770896e-07, |
|
"loss": 1.424, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.1975487856167091, |
|
"learning_rate": 6.088921331488568e-07, |
|
"loss": 1.3424, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.2094178118159778, |
|
"learning_rate": 5.898014050251765e-07, |
|
"loss": 1.3611, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.20636955576818874, |
|
"learning_rate": 5.709959698732359e-07, |
|
"loss": 1.3779, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.21747927560303068, |
|
"learning_rate": 5.524770442047978e-07, |
|
"loss": 1.3308, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.20050302654738292, |
|
"learning_rate": 5.342458259975147e-07, |
|
"loss": 1.3865, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.2392412132808601, |
|
"learning_rate": 5.163034946174161e-07, |
|
"loss": 1.3792, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.20914070749895322, |
|
"learning_rate": 4.986512107426283e-07, |
|
"loss": 1.3812, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.21320420478322508, |
|
"learning_rate": 4.812901162882871e-07, |
|
"loss": 1.443, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.20606583697965636, |
|
"learning_rate": 4.6422133433266513e-07, |
|
"loss": 1.3546, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.20597370559054526, |
|
"learning_rate": 4.474459690445293e-07, |
|
"loss": 1.3803, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.20609624124174958, |
|
"learning_rate": 4.309651056117009e-07, |
|
"loss": 1.3806, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.21011536963816732, |
|
"learning_rate": 4.1477981017086387e-07, |
|
"loss": 1.3857, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.20861991251552262, |
|
"learning_rate": 3.9889112973859554e-07, |
|
"loss": 1.4178, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.20565429912809985, |
|
"learning_rate": 3.8330009214363197e-07, |
|
"loss": 1.3485, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.213262370271027, |
|
"learning_rate": 3.680077059603876e-07, |
|
"loss": 1.3857, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.21006993775062202, |
|
"learning_rate": 3.530149604436983e-07, |
|
"loss": 1.3718, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.20971631358735854, |
|
"learning_rate": 3.3832282546483686e-07, |
|
"loss": 1.3401, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.20675053040706537, |
|
"learning_rate": 3.239322514487686e-07, |
|
"loss": 1.3976, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.2070605402743848, |
|
"learning_rate": 3.098441693126719e-07, |
|
"loss": 1.3801, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.2036473853339382, |
|
"learning_rate": 2.9605949040571456e-07, |
|
"loss": 1.3975, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.20446499651024982, |
|
"learning_rate": 2.8257910645009935e-07, |
|
"loss": 1.3932, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.20745728206982056, |
|
"learning_rate": 2.6940388948338057e-07, |
|
"loss": 1.4214, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.20628378675298625, |
|
"learning_rate": 2.565346918020534e-07, |
|
"loss": 1.3234, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.19884308459081187, |
|
"learning_rate": 2.4397234590641696e-07, |
|
"loss": 1.4086, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.196985413218799, |
|
"learning_rate": 2.3171766444672227e-07, |
|
"loss": 1.4203, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.20225024562843938, |
|
"learning_rate": 2.1977144017060027e-07, |
|
"loss": 1.3859, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20030763530819898, |
|
"learning_rate": 2.0813444587178156e-07, |
|
"loss": 1.3889, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20327002619311596, |
|
"learning_rate": 1.9680743434010385e-07, |
|
"loss": 1.3745, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20095518840823687, |
|
"learning_rate": 1.8579113831281525e-07, |
|
"loss": 1.3635, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 1.3063520193099976, |
|
"eval_runtime": 2013.9504, |
|
"eval_samples_per_second": 1.882, |
|
"eval_steps_per_second": 0.235, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20459938594623875, |
|
"learning_rate": 1.7508627042717387e-07, |
|
"loss": 1.4269, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.2004705031433909, |
|
"learning_rate": 1.6469352317434627e-07, |
|
"loss": 1.3789, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.19922047944368607, |
|
"learning_rate": 1.5461356885461077e-07, |
|
"loss": 1.3811, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.20506581374899654, |
|
"learning_rate": 1.4484705953386968e-07, |
|
"loss": 1.3677, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.20196871274072786, |
|
"learning_rate": 1.35394627001465e-07, |
|
"loss": 1.3871, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.21302376799020897, |
|
"learning_rate": 1.2625688272930925e-07, |
|
"loss": 1.3673, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.20437076323448575, |
|
"learning_rate": 1.174344178323289e-07, |
|
"loss": 1.3701, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.20259273082335125, |
|
"learning_rate": 1.0892780303022377e-07, |
|
"loss": 1.4004, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.19927961571470354, |
|
"learning_rate": 1.007375886105555e-07, |
|
"loss": 1.3781, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.19648191268958623, |
|
"learning_rate": 9.286430439313876e-08, |
|
"loss": 1.3719, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2081842753213948, |
|
"learning_rate": 8.530845969577594e-08, |
|
"loss": 1.3347, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.19777453793305202, |
|
"learning_rate": 7.80705433013046e-08, |
|
"loss": 1.3645, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2001901456526902, |
|
"learning_rate": 7.115102342598101e-08, |
|
"loss": 1.3549, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2085779076020987, |
|
"learning_rate": 6.455034768919288e-08, |
|
"loss": 1.395, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2007001569199437, |
|
"learning_rate": 5.826894308449904e-08, |
|
"loss": 1.3418, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.20019336918988379, |
|
"learning_rate": 5.230721595201049e-08, |
|
"loss": 1.3808, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.20532075618369505, |
|
"learning_rate": 4.666555195210365e-08, |
|
"loss": 1.3624, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.19955856360806487, |
|
"learning_rate": 4.134431604047195e-08, |
|
"loss": 1.3851, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.19995218707394016, |
|
"learning_rate": 3.63438524445181e-08, |
|
"loss": 1.404, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2027473195079258, |
|
"learning_rate": 3.166448464108629e-08, |
|
"loss": 1.3654, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.21316411475600813, |
|
"learning_rate": 2.7306515335532857e-08, |
|
"loss": 1.4004, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.19985677477921807, |
|
"learning_rate": 2.327022644215193e-08, |
|
"loss": 1.3813, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.20946530787894166, |
|
"learning_rate": 1.9555879065930038e-08, |
|
"loss": 1.4226, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.1975351975591122, |
|
"learning_rate": 1.6163713485662923e-08, |
|
"loss": 1.3792, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.20571768755825784, |
|
"learning_rate": 1.3093949138406892e-08, |
|
"loss": 1.3918, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.44915439425778153, |
|
"learning_rate": 1.03467846052846e-08, |
|
"loss": 1.3642, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.20214902180431968, |
|
"learning_rate": 7.922397598642551e-09, |
|
"loss": 1.3599, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.19767842515824202, |
|
"learning_rate": 5.820944950549745e-09, |
|
"loss": 1.3599, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.1991438801798082, |
|
"learning_rate": 4.042562602655231e-09, |
|
"loss": 1.3446, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.20173015884228035, |
|
"learning_rate": 2.5873655973945864e-09, |
|
"loss": 1.3461, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.20119946240562925, |
|
"learning_rate": 1.4554480705458729e-09, |
|
"loss": 1.3474, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.20468648298029762, |
|
"learning_rate": 6.468832451417273e-10, |
|
"loss": 1.3649, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.19998165788056904, |
|
"learning_rate": 1.617234267320411e-10, |
|
"loss": 1.3677, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.21026892718522863, |
|
"learning_rate": 0.0, |
|
"loss": 1.3748, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2170, |
|
"total_flos": 7055767844683776.0, |
|
"train_loss": 1.4133363889659056, |
|
"train_runtime": 112943.9974, |
|
"train_samples_per_second": 0.615, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2170, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"total_flos": 7055767844683776.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|