{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9968051118210862,
  "eval_steps": 59,
  "global_step": 234,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004259850905218318,
      "grad_norm": 0.42660918831825256,
      "learning_rate": 2e-05,
      "loss": 1.4194,
      "step": 1
    },
    {
      "epoch": 0.004259850905218318,
      "eval_loss": 1.3981385231018066,
      "eval_runtime": 17.5749,
      "eval_samples_per_second": 22.475,
      "eval_steps_per_second": 2.845,
      "step": 1
    },
    {
      "epoch": 0.008519701810436636,
      "grad_norm": 0.38132771849632263,
      "learning_rate": 4e-05,
      "loss": 1.4291,
      "step": 2
    },
    {
      "epoch": 0.012779552715654952,
      "grad_norm": 0.4677501916885376,
      "learning_rate": 6e-05,
      "loss": 1.606,
      "step": 3
    },
    {
      "epoch": 0.01703940362087327,
      "grad_norm": 0.4839603900909424,
      "learning_rate": 8e-05,
      "loss": 1.5193,
      "step": 4
    },
    {
      "epoch": 0.021299254526091587,
      "grad_norm": 0.52900630235672,
      "learning_rate": 0.0001,
      "loss": 1.7253,
      "step": 5
    },
    {
      "epoch": 0.025559105431309903,
      "grad_norm": 0.4611320495605469,
      "learning_rate": 0.00012,
      "loss": 1.4042,
      "step": 6
    },
    {
      "epoch": 0.029818956336528223,
      "grad_norm": 0.5078997611999512,
      "learning_rate": 0.00014,
      "loss": 1.8641,
      "step": 7
    },
    {
      "epoch": 0.03407880724174654,
      "grad_norm": 0.5692968368530273,
      "learning_rate": 0.00016,
      "loss": 1.0603,
      "step": 8
    },
    {
      "epoch": 0.038338658146964855,
      "grad_norm": 0.5424911379814148,
      "learning_rate": 0.00018,
      "loss": 0.9217,
      "step": 9
    },
    {
      "epoch": 0.042598509052183174,
      "grad_norm": 0.6595712304115295,
      "learning_rate": 0.0002,
      "loss": 1.0443,
      "step": 10
    },
    {
      "epoch": 0.046858359957401494,
      "grad_norm": 0.552948534488678,
      "learning_rate": 0.00019999016517595753,
      "loss": 0.9727,
      "step": 11
    },
    {
      "epoch": 0.051118210862619806,
      "grad_norm": 0.523713231086731,
      "learning_rate": 0.00019996066263830531,
      "loss": 1.0042,
      "step": 12
    },
    {
      "epoch": 0.055378061767838126,
      "grad_norm": 0.3326718807220459,
      "learning_rate": 0.0001999114981900887,
      "loss": 0.6851,
      "step": 13
    },
    {
      "epoch": 0.059637912673056445,
      "grad_norm": 0.40246546268463135,
      "learning_rate": 0.00019984268150178167,
      "loss": 0.6865,
      "step": 14
    },
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 0.3299888074398041,
      "learning_rate": 0.00019975422610938462,
      "loss": 0.6413,
      "step": 15
    },
    {
      "epoch": 0.06815761448349308,
      "grad_norm": 0.321532666683197,
      "learning_rate": 0.00019964614941176195,
      "loss": 0.6425,
      "step": 16
    },
    {
      "epoch": 0.0724174653887114,
      "grad_norm": 0.30551549792289734,
      "learning_rate": 0.0001995184726672197,
      "loss": 0.6573,
      "step": 17
    },
    {
      "epoch": 0.07667731629392971,
      "grad_norm": 0.3162730932235718,
      "learning_rate": 0.00019937122098932428,
      "loss": 0.7957,
      "step": 18
    },
    {
      "epoch": 0.08093716719914804,
      "grad_norm": 0.2646523714065552,
      "learning_rate": 0.00019920442334196248,
      "loss": 0.6842,
      "step": 19
    },
    {
      "epoch": 0.08519701810436635,
      "grad_norm": 0.35260164737701416,
      "learning_rate": 0.00019901811253364456,
      "loss": 0.7045,
      "step": 20
    },
    {
      "epoch": 0.08945686900958466,
      "grad_norm": 0.36994901299476624,
      "learning_rate": 0.00019881232521105089,
      "loss": 0.7506,
      "step": 21
    },
    {
      "epoch": 0.09371671991480299,
      "grad_norm": 0.3145638108253479,
      "learning_rate": 0.0001985871018518236,
      "loss": 0.6522,
      "step": 22
    },
    {
      "epoch": 0.0979765708200213,
      "grad_norm": 0.28740495443344116,
      "learning_rate": 0.00019834248675660486,
      "loss": 0.5763,
      "step": 23
    },
    {
      "epoch": 0.10223642172523961,
      "grad_norm": 0.29527685046195984,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.8533,
      "step": 24
    },
    {
      "epoch": 0.10649627263045794,
      "grad_norm": 0.3023378849029541,
      "learning_rate": 0.00019779527762272877,
      "loss": 0.738,
      "step": 25
    },
    {
      "epoch": 0.11075612353567625,
      "grad_norm": 0.2749658524990082,
      "learning_rate": 0.00019749279121818235,
      "loss": 0.6354,
      "step": 26
    },
    {
      "epoch": 0.11501597444089456,
      "grad_norm": 0.3914307951927185,
      "learning_rate": 0.0001971711283246951,
      "loss": 0.8604,
      "step": 27
    },
    {
      "epoch": 0.11927582534611289,
      "grad_norm": 0.47873714566230774,
      "learning_rate": 0.00019683035221222618,
      "loss": 0.7972,
      "step": 28
    },
    {
      "epoch": 0.1235356762513312,
      "grad_norm": 0.22174575924873352,
      "learning_rate": 0.0001964705299102376,
      "loss": 0.4385,
      "step": 29
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 0.244963139295578,
      "learning_rate": 0.00019609173219450998,
      "loss": 0.7168,
      "step": 30
    },
    {
      "epoch": 0.13205537806176784,
      "grad_norm": 0.32758575677871704,
      "learning_rate": 0.0001956940335732209,
      "loss": 0.7231,
      "step": 31
    },
    {
      "epoch": 0.13631522896698617,
      "grad_norm": 0.21992172300815582,
      "learning_rate": 0.00019527751227228963,
      "loss": 0.662,
      "step": 32
    },
    {
      "epoch": 0.14057507987220447,
      "grad_norm": 0.2899262309074402,
      "learning_rate": 0.0001948422502199903,
      "loss": 0.4651,
      "step": 33
    },
    {
      "epoch": 0.1448349307774228,
      "grad_norm": 0.23878340423107147,
      "learning_rate": 0.00019438833303083678,
      "loss": 0.5367,
      "step": 34
    },
    {
      "epoch": 0.14909478168264112,
      "grad_norm": 0.20475314557552338,
      "learning_rate": 0.0001939158499887428,
      "loss": 0.4024,
      "step": 35
    },
    {
      "epoch": 0.15335463258785942,
      "grad_norm": 0.25068745017051697,
      "learning_rate": 0.00019342489402945998,
      "loss": 0.6575,
      "step": 36
    },
    {
      "epoch": 0.15761448349307774,
      "grad_norm": 0.3811924159526825,
      "learning_rate": 0.00019291556172229785,
      "loss": 0.6405,
      "step": 37
    },
    {
      "epoch": 0.16187433439829607,
      "grad_norm": 0.2627577483654022,
      "learning_rate": 0.0001923879532511287,
      "loss": 0.6961,
      "step": 38
    },
    {
      "epoch": 0.16613418530351437,
      "grad_norm": 0.32665154337882996,
      "learning_rate": 0.00019184217239468212,
      "loss": 0.6983,
      "step": 39
    },
    {
      "epoch": 0.1703940362087327,
      "grad_norm": 0.24597743153572083,
      "learning_rate": 0.00019127832650613189,
      "loss": 0.54,
      "step": 40
    },
    {
      "epoch": 0.17465388711395102,
      "grad_norm": 0.2611660361289978,
      "learning_rate": 0.00019069652649198005,
      "loss": 0.6281,
      "step": 41
    },
    {
      "epoch": 0.17891373801916932,
      "grad_norm": 0.2969326078891754,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.6817,
      "step": 42
    },
    {
      "epoch": 0.18317358892438765,
      "grad_norm": 0.27561935782432556,
      "learning_rate": 0.00018947952534793661,
      "loss": 0.626,
      "step": 43
    },
    {
      "epoch": 0.18743343982960597,
      "grad_norm": 0.33468887209892273,
      "learning_rate": 0.00018884456359788724,
      "loss": 0.7383,
      "step": 44
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 0.2937297224998474,
      "learning_rate": 0.0001881921264348355,
      "loss": 0.6972,
      "step": 45
    },
    {
      "epoch": 0.1959531416400426,
      "grad_norm": 0.33218011260032654,
      "learning_rate": 0.00018752234219087538,
      "loss": 0.6749,
      "step": 46
    },
    {
      "epoch": 0.20021299254526093,
      "grad_norm": 0.2661404311656952,
      "learning_rate": 0.00018683534261021057,
      "loss": 0.4882,
      "step": 47
    },
    {
      "epoch": 0.20447284345047922,
      "grad_norm": 0.2451002150774002,
      "learning_rate": 0.00018613126282324092,
      "loss": 0.637,
      "step": 48
    },
    {
      "epoch": 0.20873269435569755,
      "grad_norm": 0.27517661452293396,
      "learning_rate": 0.00018541024131998274,
      "loss": 0.5483,
      "step": 49
    },
    {
      "epoch": 0.21299254526091588,
      "grad_norm": 0.24373459815979004,
      "learning_rate": 0.00018467241992282843,
      "loss": 0.5112,
      "step": 50
    },
    {
      "epoch": 0.21725239616613418,
      "grad_norm": 0.3239864408969879,
      "learning_rate": 0.00018391794375865024,
      "loss": 0.8005,
      "step": 51
    },
    {
      "epoch": 0.2215122470713525,
      "grad_norm": 0.29262682795524597,
      "learning_rate": 0.00018314696123025454,
      "loss": 0.6769,
      "step": 52
    },
    {
      "epoch": 0.22577209797657083,
      "grad_norm": 0.28277888894081116,
      "learning_rate": 0.00018235962398719147,
      "loss": 0.6892,
      "step": 53
    },
    {
      "epoch": 0.23003194888178913,
      "grad_norm": 0.41741546988487244,
      "learning_rate": 0.00018155608689592604,
      "loss": 0.6763,
      "step": 54
    },
    {
      "epoch": 0.23429179978700745,
      "grad_norm": 0.2734082043170929,
      "learning_rate": 0.00018073650800937624,
      "loss": 0.697,
      "step": 55
    },
    {
      "epoch": 0.23855165069222578,
      "grad_norm": 0.2646290957927704,
      "learning_rate": 0.00017990104853582493,
      "loss": 0.5936,
      "step": 56
    },
    {
      "epoch": 0.24281150159744408,
      "grad_norm": 0.27723610401153564,
      "learning_rate": 0.00017904987280721035,
      "loss": 0.5875,
      "step": 57
    },
    {
      "epoch": 0.2470713525026624,
      "grad_norm": 0.2668153643608093,
      "learning_rate": 0.000178183148246803,
      "loss": 0.5219,
      "step": 58
    },
    {
      "epoch": 0.25133120340788073,
      "grad_norm": 0.29033368825912476,
      "learning_rate": 0.0001773010453362737,
      "loss": 0.5997,
      "step": 59
    },
    {
      "epoch": 0.25133120340788073,
      "eval_loss": 0.5784963965415955,
      "eval_runtime": 17.4317,
      "eval_samples_per_second": 22.66,
      "eval_steps_per_second": 2.868,
      "step": 59
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 0.2783537209033966,
      "learning_rate": 0.00017640373758216077,
      "loss": 0.483,
      "step": 60
    },
    {
      "epoch": 0.2598509052183174,
      "grad_norm": 0.31082215905189514,
      "learning_rate": 0.0001754914014817416,
      "loss": 0.6473,
      "step": 61
    },
    {
      "epoch": 0.2641107561235357,
      "grad_norm": 0.3206618130207062,
      "learning_rate": 0.00017456421648831655,
      "loss": 0.6289,
      "step": 62
    },
    {
      "epoch": 0.268370607028754,
      "grad_norm": 0.2875254154205322,
      "learning_rate": 0.00017362236497591094,
      "loss": 0.594,
      "step": 63
    },
    {
      "epoch": 0.27263045793397234,
      "grad_norm": 0.22950579226016998,
      "learning_rate": 0.0001726660322034027,
      "loss": 0.3886,
      "step": 64
    },
    {
      "epoch": 0.27689030883919064,
      "grad_norm": 0.24293649196624756,
      "learning_rate": 0.00017169540627808274,
      "loss": 0.6129,
      "step": 65
    },
    {
      "epoch": 0.28115015974440893,
      "grad_norm": 0.2611636519432068,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.6891,
      "step": 66
    },
    {
      "epoch": 0.2854100106496273,
      "grad_norm": 0.284407377243042,
      "learning_rate": 0.00016971204141768233,
      "loss": 0.516,
      "step": 67
    },
    {
      "epoch": 0.2896698615548456,
      "grad_norm": 0.21485944092273712,
      "learning_rate": 0.00016869969260349018,
      "loss": 0.3826,
      "step": 68
    },
    {
      "epoch": 0.2939297124600639,
      "grad_norm": 0.29337963461875916,
      "learning_rate": 0.00016767383080152742,
      "loss": 0.5696,
      "step": 69
    },
    {
      "epoch": 0.29818956336528224,
      "grad_norm": 0.27099764347076416,
      "learning_rate": 0.0001666346577952004,
      "loss": 0.4708,
      "step": 70
    },
    {
      "epoch": 0.30244941427050054,
      "grad_norm": 0.29055824875831604,
      "learning_rate": 0.00016558237798618245,
      "loss": 0.5844,
      "step": 71
    },
    {
      "epoch": 0.30670926517571884,
      "grad_norm": 0.22874757647514343,
      "learning_rate": 0.00016451719835420877,
      "loss": 0.4412,
      "step": 72
    },
    {
      "epoch": 0.3109691160809372,
      "grad_norm": 0.2926221489906311,
      "learning_rate": 0.00016343932841636456,
      "loss": 0.5757,
      "step": 73
    },
    {
      "epoch": 0.3152289669861555,
      "grad_norm": 0.30070438981056213,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.6063,
      "step": 74
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 0.2475481927394867,
      "learning_rate": 0.00016124636813039502,
      "loss": 0.5056,
      "step": 75
    },
    {
      "epoch": 0.32374866879659214,
      "grad_norm": 0.2851349711418152,
      "learning_rate": 0.00016013170912984058,
      "loss": 0.7547,
      "step": 76
    },
    {
      "epoch": 0.32800851970181044,
      "grad_norm": 0.25569260120391846,
      "learning_rate": 0.00015900522243371282,
      "loss": 0.5168,
      "step": 77
    },
    {
      "epoch": 0.33226837060702874,
      "grad_norm": 0.3774610757827759,
      "learning_rate": 0.0001578671296179806,
      "loss": 0.6691,
      "step": 78
    },
    {
      "epoch": 0.3365282215122471,
      "grad_norm": 0.2339468151330948,
      "learning_rate": 0.00015671765454149559,
      "loss": 0.5021,
      "step": 79
    },
    {
      "epoch": 0.3407880724174654,
      "grad_norm": 0.3066350519657135,
      "learning_rate": 0.00015555702330196023,
      "loss": 0.6838,
      "step": 80
    },
    {
      "epoch": 0.3450479233226837,
      "grad_norm": 0.271908164024353,
      "learning_rate": 0.00015438546419145488,
      "loss": 0.4837,
      "step": 81
    },
    {
      "epoch": 0.34930777422790205,
      "grad_norm": 0.304290771484375,
      "learning_rate": 0.00015320320765153367,
      "loss": 0.6768,
      "step": 82
    },
    {
      "epoch": 0.35356762513312034,
      "grad_norm": 0.25685280561447144,
      "learning_rate": 0.00015201048622789747,
      "loss": 0.4335,
      "step": 83
    },
    {
      "epoch": 0.35782747603833864,
      "grad_norm": 0.3003567159175873,
      "learning_rate": 0.00015080753452465296,
      "loss": 0.5836,
      "step": 84
    },
    {
      "epoch": 0.362087326943557,
      "grad_norm": 0.2585873007774353,
      "learning_rate": 0.0001495945891581668,
      "loss": 0.5391,
      "step": 85
    },
    {
      "epoch": 0.3663471778487753,
      "grad_norm": 0.30791282653808594,
      "learning_rate": 0.000148371888710524,
      "loss": 0.5103,
      "step": 86
    },
    {
      "epoch": 0.3706070287539936,
      "grad_norm": 0.23016773164272308,
      "learning_rate": 0.0001471396736825998,
      "loss": 0.4269,
      "step": 87
    },
    {
      "epoch": 0.37486687965921195,
      "grad_norm": 0.3137454390525818,
      "learning_rate": 0.00014589818644675378,
      "loss": 0.5116,
      "step": 88
    },
    {
      "epoch": 0.37912673056443025,
      "grad_norm": 0.28078484535217285,
      "learning_rate": 0.00014464767119915629,
      "loss": 0.4388,
      "step": 89
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 0.3163893222808838,
      "learning_rate": 0.00014338837391175582,
      "loss": 0.6122,
      "step": 90
    },
    {
      "epoch": 0.3876464323748669,
      "grad_norm": 0.34674668312072754,
      "learning_rate": 0.0001421205422838971,
      "loss": 0.7114,
      "step": 91
    },
    {
      "epoch": 0.3919062832800852,
      "grad_norm": 0.2210942953824997,
      "learning_rate": 0.00014084442569359964,
      "loss": 0.3351,
      "step": 92
    },
    {
      "epoch": 0.3961661341853035,
      "grad_norm": 0.30586308240890503,
      "learning_rate": 0.0001395602751485059,
      "loss": 0.4845,
      "step": 93
    },
    {
      "epoch": 0.40042598509052185,
      "grad_norm": 0.2695784568786621,
      "learning_rate": 0.000138268343236509,
      "loss": 0.4992,
      "step": 94
    },
    {
      "epoch": 0.40468583599574015,
      "grad_norm": 0.2989813983440399,
      "learning_rate": 0.00013696888407606952,
      "loss": 0.585,
      "step": 95
    },
    {
      "epoch": 0.40894568690095845,
      "grad_norm": 0.2759920656681061,
      "learning_rate": 0.0001356621532662313,
      "loss": 0.4492,
      "step": 96
    },
    {
      "epoch": 0.4132055378061768,
      "grad_norm": 0.33117353916168213,
      "learning_rate": 0.0001343484078363461,
      "loss": 0.5606,
      "step": 97
    },
    {
      "epoch": 0.4174653887113951,
      "grad_norm": 0.24572253227233887,
      "learning_rate": 0.00013302790619551674,
      "loss": 0.3261,
      "step": 98
    },
    {
      "epoch": 0.4217252396166134,
      "grad_norm": 0.322480171918869,
      "learning_rate": 0.00013170090808176883,
      "loss": 0.5527,
      "step": 99
    },
    {
      "epoch": 0.42598509052183176,
      "grad_norm": 0.3101179301738739,
      "learning_rate": 0.00013036767451096148,
      "loss": 0.5419,
      "step": 100
    },
    {
      "epoch": 0.43024494142705005,
      "grad_norm": 0.3218703269958496,
      "learning_rate": 0.00012902846772544624,
      "loss": 0.5441,
      "step": 101
    },
    {
      "epoch": 0.43450479233226835,
      "grad_norm": 0.26214686036109924,
      "learning_rate": 0.00012768355114248494,
      "loss": 0.5388,
      "step": 102
    },
    {
      "epoch": 0.4387646432374867,
      "grad_norm": 0.421612411737442,
      "learning_rate": 0.00012633318930243648,
      "loss": 0.7557,
      "step": 103
    },
    {
      "epoch": 0.443024494142705,
      "grad_norm": 0.5120344758033752,
      "learning_rate": 0.0001249776478167227,
      "loss": 0.7028,
      "step": 104
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 0.27614736557006836,
      "learning_rate": 0.00012361719331558345,
      "loss": 0.3954,
      "step": 105
    },
    {
      "epoch": 0.45154419595314166,
      "grad_norm": 0.269520103931427,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.4851,
      "step": 106
    },
    {
      "epoch": 0.45580404685835996,
      "grad_norm": 0.2739225924015045,
      "learning_rate": 0.000120882616567217,
      "loss": 0.4907,
      "step": 107
    },
    {
      "epoch": 0.46006389776357826,
      "grad_norm": 0.33920663595199585,
      "learning_rate": 0.00011950903220161285,
      "loss": 0.6288,
      "step": 108
    },
    {
      "epoch": 0.4643237486687966,
      "grad_norm": 0.279832124710083,
      "learning_rate": 0.00011813161047802985,
      "loss": 0.447,
      "step": 109
    },
    {
      "epoch": 0.4685835995740149,
      "grad_norm": 0.31790605187416077,
      "learning_rate": 0.00011675062233047364,
      "loss": 0.5933,
      "step": 110
    },
    {
      "epoch": 0.4728434504792332,
      "grad_norm": 0.24926939606666565,
      "learning_rate": 0.000115366339394453,
      "loss": 0.4061,
      "step": 111
    },
    {
      "epoch": 0.47710330138445156,
      "grad_norm": 0.3327280282974243,
      "learning_rate": 0.00011397903395354996,
      "loss": 0.484,
      "step": 112
    },
    {
      "epoch": 0.48136315228966986,
      "grad_norm": 0.37822094559669495,
      "learning_rate": 0.00011258897888586255,
      "loss": 0.6416,
      "step": 113
    },
    {
      "epoch": 0.48562300319488816,
      "grad_norm": 0.35605669021606445,
      "learning_rate": 0.00011119644761033078,
      "loss": 0.6136,
      "step": 114
    },
    {
      "epoch": 0.4898828541001065,
      "grad_norm": 0.3513132929801941,
      "learning_rate": 0.0001098017140329561,
      "loss": 0.6299,
      "step": 115
    },
    {
      "epoch": 0.4941427050053248,
      "grad_norm": 0.3040708899497986,
      "learning_rate": 0.00010840505249292476,
      "loss": 0.4658,
      "step": 116
    },
    {
      "epoch": 0.4984025559105431,
      "grad_norm": 0.19006308913230896,
      "learning_rate": 0.00010700673770864673,
      "loss": 0.2694,
      "step": 117
    },
    {
      "epoch": 0.5026624068157615,
      "grad_norm": 0.30643633008003235,
      "learning_rate": 0.00010560704472371919,
      "loss": 0.4492,
      "step": 118
    },
    {
      "epoch": 0.5026624068157615,
      "eval_loss": 0.5326976180076599,
      "eval_runtime": 17.5872,
      "eval_samples_per_second": 22.46,
      "eval_steps_per_second": 2.843,
      "step": 118
    },
    {
      "epoch": 0.5069222577209798,
      "grad_norm": 0.3698013722896576,
      "learning_rate": 0.00010420624885282653,
      "loss": 0.6993,
      "step": 119
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 0.2801634967327118,
      "learning_rate": 0.0001028046256275869,
      "loss": 0.4059,
      "step": 120
    },
    {
      "epoch": 0.5154419595314164,
      "grad_norm": 0.2864643931388855,
      "learning_rate": 0.00010140245074235624,
      "loss": 0.5024,
      "step": 121
    },
    {
      "epoch": 0.5197018104366348,
      "grad_norm": 0.30105265974998474,
      "learning_rate": 0.0001,
      "loss": 0.6774,
      "step": 122
    },
    {
      "epoch": 0.5239616613418531,
      "grad_norm": 0.39152050018310547,
      "learning_rate": 9.859754925764378e-05,
      "loss": 0.625,
      "step": 123
    },
    {
      "epoch": 0.5282215122470714,
      "grad_norm": 0.3618883192539215,
      "learning_rate": 9.719537437241312e-05,
      "loss": 0.6978,
      "step": 124
    },
    {
      "epoch": 0.5324813631522897,
      "grad_norm": 0.23670899868011475,
      "learning_rate": 9.579375114717351e-05,
      "loss": 0.3379,
      "step": 125
    },
    {
      "epoch": 0.536741214057508,
      "grad_norm": 0.3124864101409912,
      "learning_rate": 9.439295527628081e-05,
      "loss": 0.525,
      "step": 126
    },
    {
      "epoch": 0.5410010649627263,
      "grad_norm": 0.3667398989200592,
      "learning_rate": 9.299326229135326e-05,
      "loss": 0.6164,
      "step": 127
    },
    {
      "epoch": 0.5452609158679447,
      "grad_norm": 0.2894105613231659,
      "learning_rate": 9.159494750707526e-05,
      "loss": 0.4335,
      "step": 128
    },
    {
      "epoch": 0.549520766773163,
      "grad_norm": 0.30680200457572937,
      "learning_rate": 9.019828596704394e-05,
      "loss": 0.4507,
      "step": 129
    },
    {
      "epoch": 0.5537806176783813,
      "grad_norm": 0.3676758110523224,
      "learning_rate": 8.880355238966923e-05,
      "loss": 0.5955,
      "step": 130
    },
    {
      "epoch": 0.5580404685835996,
      "grad_norm": 0.3194178342819214,
      "learning_rate": 8.741102111413748e-05,
      "loss": 0.5675,
      "step": 131
    },
    {
      "epoch": 0.5623003194888179,
      "grad_norm": 0.29750558733940125,
      "learning_rate": 8.602096604645009e-05,
      "loss": 0.5785,
      "step": 132
    },
    {
      "epoch": 0.5665601703940362,
      "grad_norm": 0.37204545736312866,
      "learning_rate": 8.463366060554698e-05,
      "loss": 0.612,
      "step": 133
    },
    {
      "epoch": 0.5708200212992546,
      "grad_norm": 0.36891940236091614,
      "learning_rate": 8.324937766952638e-05,
      "loss": 0.5463,
      "step": 134
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 0.2863575518131256,
      "learning_rate": 8.186838952197018e-05,
      "loss": 0.4884,
      "step": 135
    },
    {
      "epoch": 0.5793397231096912,
      "grad_norm": 0.354523241519928,
      "learning_rate": 8.049096779838719e-05,
      "loss": 0.7727,
      "step": 136
    },
    {
      "epoch": 0.5835995740149095,
      "grad_norm": 0.30339759588241577,
      "learning_rate": 7.911738343278304e-05,
      "loss": 0.5543,
      "step": 137
    },
    {
      "epoch": 0.5878594249201278,
      "grad_norm": 0.27778202295303345,
      "learning_rate": 7.774790660436858e-05,
      "loss": 0.4716,
      "step": 138
    },
    {
      "epoch": 0.5921192758253461,
      "grad_norm": 0.38618960976600647,
      "learning_rate": 7.63828066844166e-05,
      "loss": 0.6519,
      "step": 139
    },
    {
      "epoch": 0.5963791267305645,
      "grad_norm": 0.3573627769947052,
      "learning_rate": 7.502235218327731e-05,
      "loss": 0.5128,
      "step": 140
    },
    {
      "epoch": 0.6006389776357828,
      "grad_norm": 0.30529165267944336,
      "learning_rate": 7.366681069756352e-05,
      "loss": 0.5184,
      "step": 141
    },
    {
      "epoch": 0.6048988285410011,
      "grad_norm": 0.2819828987121582,
      "learning_rate": 7.231644885751507e-05,
      "loss": 0.4259,
      "step": 142
    },
    {
      "epoch": 0.6091586794462194,
      "grad_norm": 0.32307252287864685,
      "learning_rate": 7.097153227455379e-05,
      "loss": 0.6048,
      "step": 143
    },
    {
      "epoch": 0.6134185303514377,
      "grad_norm": 0.31262722611427307,
      "learning_rate": 6.963232548903853e-05,
      "loss": 0.4834,
      "step": 144
    },
    {
      "epoch": 0.617678381256656,
      "grad_norm": 0.318851500749588,
      "learning_rate": 6.829909191823121e-05,
      "loss": 0.5011,
      "step": 145
    },
    {
      "epoch": 0.6219382321618744,
      "grad_norm": 0.44246405363082886,
      "learning_rate": 6.697209380448333e-05,
      "loss": 0.4384,
      "step": 146
    },
    {
      "epoch": 0.6261980830670927,
      "grad_norm": 0.3459945023059845,
      "learning_rate": 6.565159216365389e-05,
      "loss": 0.5657,
      "step": 147
    },
    {
      "epoch": 0.630457933972311,
      "grad_norm": 0.33843329548835754,
      "learning_rate": 6.43378467337687e-05,
      "loss": 0.5711,
      "step": 148
    },
    {
      "epoch": 0.6347177848775293,
      "grad_norm": 0.3812694549560547,
      "learning_rate": 6.30311159239305e-05,
      "loss": 0.6142,
      "step": 149
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 0.29333916306495667,
      "learning_rate": 6.173165676349103e-05,
      "loss": 0.585,
      "step": 150
    },
    {
      "epoch": 0.6432374866879659,
      "grad_norm": 0.2884041666984558,
      "learning_rate": 6.043972485149414e-05,
      "loss": 0.4866,
      "step": 151
    },
    {
      "epoch": 0.6474973375931843,
      "grad_norm": 0.33954814076423645,
      "learning_rate": 5.9155574306400395e-05,
      "loss": 0.571,
      "step": 152
    },
    {
      "epoch": 0.6517571884984026,
      "grad_norm": 0.33935782313346863,
      "learning_rate": 5.787945771610296e-05,
      "loss": 0.5037,
      "step": 153
    },
    {
      "epoch": 0.6560170394036209,
      "grad_norm": 0.27371054887771606,
      "learning_rate": 5.6611626088244194e-05,
      "loss": 0.3322,
      "step": 154
    },
    {
      "epoch": 0.6602768903088392,
      "grad_norm": 0.30788496136665344,
      "learning_rate": 5.5352328800843724e-05,
      "loss": 0.4454,
      "step": 155
    },
    {
      "epoch": 0.6645367412140575,
      "grad_norm": 0.34366151690483093,
      "learning_rate": 5.410181355324622e-05,
      "loss": 0.5788,
      "step": 156
    },
    {
      "epoch": 0.6687965921192758,
      "grad_norm": 0.33698371052742004,
      "learning_rate": 5.286032631740023e-05,
      "loss": 0.4378,
      "step": 157
    },
    {
      "epoch": 0.6730564430244942,
      "grad_norm": 0.4181162416934967,
      "learning_rate": 5.162811128947602e-05,
      "loss": 0.5367,
      "step": 158
    },
    {
      "epoch": 0.6773162939297125,
      "grad_norm": 0.4480881690979004,
      "learning_rate": 5.0405410841833253e-05,
      "loss": 0.6633,
      "step": 159
    },
    {
      "epoch": 0.6815761448349308,
      "grad_norm": 0.37488028407096863,
      "learning_rate": 4.919246547534708e-05,
      "loss": 0.5402,
      "step": 160
    },
    {
      "epoch": 0.6858359957401491,
      "grad_norm": 0.2964366376399994,
      "learning_rate": 4.7989513772102537e-05,
      "loss": 0.4109,
      "step": 161
    },
    {
      "epoch": 0.6900958466453674,
      "grad_norm": 0.35376259684562683,
      "learning_rate": 4.6796792348466356e-05,
      "loss": 0.636,
      "step": 162
    },
    {
      "epoch": 0.6943556975505857,
      "grad_norm": 0.3158915638923645,
      "learning_rate": 4.561453580854516e-05,
      "loss": 0.4893,
      "step": 163
    },
    {
      "epoch": 0.6986155484558041,
      "grad_norm": 0.420785516500473,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.7147,
      "step": 164
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 0.3272782564163208,
      "learning_rate": 4.328234545850442e-05,
      "loss": 0.3444,
      "step": 165
    },
    {
      "epoch": 0.7071352502662407,
      "grad_norm": 0.30052492022514343,
      "learning_rate": 4.213287038201943e-05,
      "loss": 0.5209,
      "step": 166
    },
    {
      "epoch": 0.711395101171459,
      "grad_norm": 0.37648481130599976,
      "learning_rate": 4.0994777566287204e-05,
      "loss": 0.684,
      "step": 167
    },
    {
      "epoch": 0.7156549520766773,
      "grad_norm": 0.3135606646537781,
      "learning_rate": 3.9868290870159405e-05,
      "loss": 0.4871,
      "step": 168
    },
    {
      "epoch": 0.7199148029818956,
      "grad_norm": 0.33847576379776,
      "learning_rate": 3.875363186960499e-05,
      "loss": 0.5294,
      "step": 169
    },
    {
      "epoch": 0.724174653887114,
      "grad_norm": 0.3337070047855377,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 0.4425,
      "step": 170
    },
    {
      "epoch": 0.7284345047923323,
      "grad_norm": 0.4173165261745453,
      "learning_rate": 3.6560671583635467e-05,
      "loss": 0.637,
      "step": 171
    },
    {
      "epoch": 0.7326943556975506,
      "grad_norm": 0.41098451614379883,
      "learning_rate": 3.548280164579126e-05,
      "loss": 0.52,
      "step": 172
    },
    {
      "epoch": 0.7369542066027689,
      "grad_norm": 0.3789665699005127,
      "learning_rate": 3.4417622013817595e-05,
      "loss": 0.5995,
      "step": 173
    },
    {
      "epoch": 0.7412140575079872,
      "grad_norm": 0.3996846675872803,
      "learning_rate": 3.336534220479961e-05,
      "loss": 0.6237,
      "step": 174
    },
    {
      "epoch": 0.7454739084132055,
      "grad_norm": 0.3990687131881714,
      "learning_rate": 3.2326169198472556e-05,
      "loss": 0.555,
      "step": 175
    },
    {
      "epoch": 0.7497337593184239,
      "grad_norm": 0.32280924916267395,
      "learning_rate": 3.130030739650983e-05,
      "loss": 0.4742,
      "step": 176
    },
    {
      "epoch": 0.7539936102236422,
      "grad_norm": 0.4192362129688263,
      "learning_rate": 3.0287958582317676e-05,
      "loss": 0.6569,
      "step": 177
    },
    {
      "epoch": 0.7539936102236422,
      "eval_loss": 0.5110090970993042,
      "eval_runtime": 20.8066,
      "eval_samples_per_second": 18.984,
      "eval_steps_per_second": 2.403,
      "step": 177
    },
    {
      "epoch": 0.7582534611288605,
      "grad_norm": 0.35410746932029724,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.4828,
      "step": 178
    },
    {
      "epoch": 0.7625133120340788,
      "grad_norm": 0.4463326036930084,
      "learning_rate": 2.8304593721917285e-05,
      "loss": 0.6976,
      "step": 179
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 0.29797378182411194,
      "learning_rate": 2.7333967796597315e-05,
      "loss": 0.564,
      "step": 180
    },
    {
      "epoch": 0.7710330138445154,
      "grad_norm": 0.31337812542915344,
      "learning_rate": 2.6377635024089087e-05,
      "loss": 0.5607,
      "step": 181
    },
    {
      "epoch": 0.7752928647497338,
      "grad_norm": 0.40470513701438904,
      "learning_rate": 2.5435783511683443e-05,
      "loss": 0.6428,
      "step": 182
    },
    {
      "epoch": 0.7795527156549521,
      "grad_norm": 0.413817822933197,
      "learning_rate": 2.450859851825842e-05,
      "loss": 0.7303,
      "step": 183
    },
    {
      "epoch": 0.7838125665601704,
      "grad_norm": 0.2931414842605591,
      "learning_rate": 2.3596262417839255e-05,
      "loss": 0.4051,
      "step": 184
    },
    {
      "epoch": 0.7880724174653887,
      "grad_norm": 0.34086865186691284,
      "learning_rate": 2.26989546637263e-05,
      "loss": 0.4329,
      "step": 185
    },
    {
      "epoch": 0.792332268370607,
      "grad_norm": 0.40336307883262634,
      "learning_rate": 2.181685175319702e-05,
      "loss": 0.5791,
      "step": 186
    },
    {
      "epoch": 0.7965921192758253,
      "grad_norm": 0.30092838406562805,
      "learning_rate": 2.095012719278966e-05,
      "loss": 0.4491,
      "step": 187
    },
    {
      "epoch": 0.8008519701810437,
      "grad_norm": 0.31043168902397156,
      "learning_rate": 2.009895146417512e-05,
      "loss": 0.4681,
      "step": 188
    },
    {
      "epoch": 0.805111821086262,
      "grad_norm": 0.3712119162082672,
      "learning_rate": 1.926349199062376e-05,
      "loss": 0.549,
      "step": 189
    },
    {
      "epoch": 0.8093716719914803,
      "grad_norm": 0.3679051995277405,
      "learning_rate": 1.8443913104073983e-05,
      "loss": 0.4971,
      "step": 190
    },
    {
      "epoch": 0.8136315228966986,
      "grad_norm": 0.3244669735431671,
      "learning_rate": 1.7640376012808536e-05,
      "loss": 0.4732,
      "step": 191
    },
    {
      "epoch": 0.8178913738019169,
      "grad_norm": 0.28653696179389954,
      "learning_rate": 1.6853038769745467e-05,
      "loss": 0.3469,
      "step": 192
    },
    {
      "epoch": 0.8221512247071352,
      "grad_norm": 0.3144218325614929,
      "learning_rate": 1.6082056241349786e-05,
      "loss": 0.5127,
      "step": 193
    },
    {
      "epoch": 0.8264110756123536,
      "grad_norm": 0.3801470994949341,
      "learning_rate": 1.5327580077171587e-05,
      "loss": 0.5178,
      "step": 194
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 0.37223386764526367,
      "learning_rate": 1.4589758680017263e-05,
      "loss": 0.5114,
      "step": 195
    },
    {
      "epoch": 0.8349307774227902,
      "grad_norm": 0.4167802333831787,
      "learning_rate": 1.3868737176759106e-05,
      "loss": 0.6949,
      "step": 196
    },
    {
      "epoch": 0.8391906283280085,
      "grad_norm": 0.620794951915741,
      "learning_rate": 1.3164657389789458e-05,
      "loss": 0.7015,
      "step": 197
    },
    {
      "epoch": 0.8434504792332268,
      "grad_norm": 0.32053133845329285,
      "learning_rate": 1.2477657809124631e-05,
      "loss": 0.5328,
      "step": 198
    },
    {
      "epoch": 0.8477103301384451,
      "grad_norm": 0.41892528533935547,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 0.5929,
      "step": 199
    },
    {
      "epoch": 0.8519701810436635,
      "grad_norm": 0.2980664372444153,
      "learning_rate": 1.1155436402112785e-05,
      "loss": 0.4209,
      "step": 200
    },
    {
      "epoch": 0.8562300319488818,
      "grad_norm": 0.3290930986404419,
      "learning_rate": 1.0520474652063394e-05,
      "loss": 0.4423,
      "step": 201
    },
    {
      "epoch": 0.8604898828541001,
      "grad_norm": 0.3246372640132904,
      "learning_rate": 9.903113209758096e-06,
      "loss": 0.4631,
      "step": 202
    },
    {
      "epoch": 0.8647497337593184,
      "grad_norm": 0.3644905388355255,
      "learning_rate": 9.303473508019944e-06,
      "loss": 0.552,
      "step": 203
    },
    {
      "epoch": 0.8690095846645367,
      "grad_norm": 0.49974295496940613,
      "learning_rate": 8.72167349386811e-06,
      "loss": 0.7516,
      "step": 204
    },
    {
      "epoch": 0.873269435569755,
      "grad_norm": 0.3242340087890625,
      "learning_rate": 8.157827605317892e-06,
      "loss": 0.412,
      "step": 205
    },
    {
      "epoch": 0.8775292864749734,
      "grad_norm": 0.33690881729125977,
      "learning_rate": 7.612046748871327e-06,
      "loss": 0.4841,
      "step": 206
    },
    {
      "epoch": 0.8817891373801917,
      "grad_norm": 0.3246766924858093,
      "learning_rate": 7.084438277702188e-06,
      "loss": 0.4341,
      "step": 207
    },
    {
      "epoch": 0.88604898828541,
      "grad_norm": 0.4262131452560425,
      "learning_rate": 6.5751059705400295e-06,
      "loss": 0.6306,
      "step": 208
    },
    {
      "epoch": 0.8903088391906283,
      "grad_norm": 0.32158687710762024,
      "learning_rate": 6.084150011257239e-06,
      "loss": 0.4687,
      "step": 209
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 0.377208948135376,
      "learning_rate": 5.611666969163243e-06,
      "loss": 0.5849,
      "step": 210
    },
    {
      "epoch": 0.898828541001065,
      "grad_norm": 0.30956804752349854,
      "learning_rate": 5.157749780009735e-06,
      "loss": 0.4355,
      "step": 211
    },
    {
      "epoch": 0.9030883919062833,
      "grad_norm": 0.4885202944278717,
      "learning_rate": 4.722487727710368e-06,
      "loss": 0.6129,
      "step": 212
    },
    {
      "epoch": 0.9073482428115016,
      "grad_norm": 0.3384571075439453,
      "learning_rate": 4.305966426779118e-06,
      "loss": 0.4345,
      "step": 213
    },
    {
      "epoch": 0.9116080937167199,
      "grad_norm": 0.4629303514957428,
      "learning_rate": 3.908267805490051e-06,
      "loss": 0.6158,
      "step": 214
    },
    {
      "epoch": 0.9158679446219382,
      "grad_norm": 0.3206894099712372,
      "learning_rate": 3.529470089762421e-06,
      "loss": 0.4689,
      "step": 215
    },
    {
      "epoch": 0.9201277955271565,
      "grad_norm": 0.41424816846847534,
      "learning_rate": 3.169647787773866e-06,
      "loss": 0.5235,
      "step": 216
    },
    {
      "epoch": 0.9243876464323749,
      "grad_norm": 0.3189912736415863,
      "learning_rate": 2.8288716753049005e-06,
      "loss": 0.4262,
      "step": 217
    },
    {
      "epoch": 0.9286474973375932,
      "grad_norm": 0.3202993869781494,
      "learning_rate": 2.5072087818176382e-06,
      "loss": 0.493,
      "step": 218
    },
    {
      "epoch": 0.9329073482428115,
      "grad_norm": 0.32974228262901306,
      "learning_rate": 2.20472237727124e-06,
      "loss": 0.5512,
      "step": 219
    },
    {
      "epoch": 0.9371671991480298,
      "grad_norm": 0.27346375584602356,
      "learning_rate": 1.921471959676957e-06,
      "loss": 0.4154,
      "step": 220
    },
    {
      "epoch": 0.9414270500532481,
      "grad_norm": 0.29039615392684937,
      "learning_rate": 1.657513243395159e-06,
      "loss": 0.4367,
      "step": 221
    },
    {
      "epoch": 0.9456869009584664,
      "grad_norm": 0.3864074647426605,
      "learning_rate": 1.4128981481764115e-06,
      "loss": 0.5531,
      "step": 222
    },
    {
      "epoch": 0.9499467518636848,
      "grad_norm": 0.4020756185054779,
      "learning_rate": 1.1876747889491223e-06,
      "loss": 0.6631,
      "step": 223
    },
    {
      "epoch": 0.9542066027689031,
      "grad_norm": 0.37475478649139404,
      "learning_rate": 9.818874663554357e-07,
      "loss": 0.4979,
      "step": 224
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 0.3484041690826416,
      "learning_rate": 7.955766580375335e-07,
      "loss": 0.5207,
      "step": 225
    },
    {
      "epoch": 0.9627263045793397,
      "grad_norm": 0.3385999798774719,
      "learning_rate": 6.287790106757396e-07,
      "loss": 0.4832,
      "step": 226
    },
    {
      "epoch": 0.966986155484558,
      "grad_norm": 0.27909693121910095,
      "learning_rate": 4.815273327803182e-07,
      "loss": 0.3506,
      "step": 227
    },
    {
      "epoch": 0.9712460063897763,
      "grad_norm": 0.34196606278419495,
      "learning_rate": 3.5385058823809156e-07,
      "loss": 0.5283,
      "step": 228
    },
    {
      "epoch": 0.9755058572949947,
      "grad_norm": 0.39571547508239746,
      "learning_rate": 2.457738906153972e-07,
      "loss": 0.5099,
      "step": 229
    },
    {
      "epoch": 0.979765708200213,
      "grad_norm": 0.4107287526130676,
      "learning_rate": 1.5731849821833954e-07,
      "loss": 0.5042,
      "step": 230
    },
    {
      "epoch": 0.9840255591054313,
      "grad_norm": 0.3254135251045227,
      "learning_rate": 8.850180991131219e-08,
      "loss": 0.4929,
      "step": 231
    },
    {
      "epoch": 0.9882854100106496,
      "grad_norm": 0.2778495252132416,
      "learning_rate": 3.933736169471347e-08,
      "loss": 0.3571,
      "step": 232
    },
    {
      "epoch": 0.9925452609158679,
      "grad_norm": 0.4703886806964874,
      "learning_rate": 9.834824042498358e-09,
      "loss": 0.7013,
      "step": 233
    },
    {
      "epoch": 0.9968051118210862,
      "grad_norm": 0.3349379599094391,
      "learning_rate": 0.0,
      "loss": 0.4725,
      "step": 234
    }
  ],
  "logging_steps": 1,
  "max_steps": 234,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1191352754700288e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}