|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.821256038647343,
  "eval_steps": 500,
  "global_step": 170,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004830917874396135,
      "grad_norm": 0.7645118236541748,
      "learning_rate": 5e-06,
      "loss": 1.259,
      "step": 1
    },
    {
      "epoch": 0.00966183574879227,
      "grad_norm": 0.9345910549163818,
      "learning_rate": 1e-05,
      "loss": 1.4787,
      "step": 2
    },
    {
      "epoch": 0.014492753623188406,
      "grad_norm": 0.9917318224906921,
      "learning_rate": 1.5e-05,
      "loss": 1.5453,
      "step": 3
    },
    {
      "epoch": 0.01932367149758454,
      "grad_norm": 1.0239824056625366,
      "learning_rate": 2e-05,
      "loss": 1.5964,
      "step": 4
    },
    {
      "epoch": 0.024154589371980676,
      "grad_norm": 0.9726951718330383,
      "learning_rate": 2.5e-05,
      "loss": 1.5687,
      "step": 5
    },
    {
      "epoch": 0.028985507246376812,
      "grad_norm": 0.7599917650222778,
      "learning_rate": 3e-05,
      "loss": 1.5249,
      "step": 6
    },
    {
      "epoch": 0.033816425120772944,
      "grad_norm": 0.5268093347549438,
      "learning_rate": 3.5e-05,
      "loss": 1.4637,
      "step": 7
    },
    {
      "epoch": 0.03864734299516908,
      "grad_norm": 0.5739946365356445,
      "learning_rate": 4e-05,
      "loss": 1.4514,
      "step": 8
    },
    {
      "epoch": 0.043478260869565216,
      "grad_norm": 0.6630675792694092,
      "learning_rate": 4.5e-05,
      "loss": 1.442,
      "step": 9
    },
    {
      "epoch": 0.04830917874396135,
      "grad_norm": 0.5699703097343445,
      "learning_rate": 5e-05,
      "loss": 1.4091,
      "step": 10
    },
    {
      "epoch": 0.05314009661835749,
      "grad_norm": 0.4952673017978668,
      "learning_rate": 5.500000000000001e-05,
      "loss": 1.3877,
      "step": 11
    },
    {
      "epoch": 0.057971014492753624,
      "grad_norm": 0.5180989503860474,
      "learning_rate": 6e-05,
      "loss": 1.3707,
      "step": 12
    },
    {
      "epoch": 0.06280193236714976,
      "grad_norm": 0.41192442178726196,
      "learning_rate": 6.500000000000001e-05,
      "loss": 1.3599,
      "step": 13
    },
    {
      "epoch": 0.06763285024154589,
      "grad_norm": 0.28801196813583374,
      "learning_rate": 7e-05,
      "loss": 1.3484,
      "step": 14
    },
    {
      "epoch": 0.07246376811594203,
      "grad_norm": 0.2618640959262848,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.3382,
      "step": 15
    },
    {
      "epoch": 0.07729468599033816,
      "grad_norm": 0.2657703161239624,
      "learning_rate": 8e-05,
      "loss": 1.335,
      "step": 16
    },
    {
      "epoch": 0.0821256038647343,
      "grad_norm": 0.2432931512594223,
      "learning_rate": 8.5e-05,
      "loss": 1.323,
      "step": 17
    },
    {
      "epoch": 0.08695652173913043,
      "grad_norm": 0.24172987043857574,
      "learning_rate": 9e-05,
      "loss": 1.3317,
      "step": 18
    },
    {
      "epoch": 0.09178743961352658,
      "grad_norm": 0.26086804270744324,
      "learning_rate": 9.5e-05,
      "loss": 1.3163,
      "step": 19
    },
    {
      "epoch": 0.0966183574879227,
      "grad_norm": 0.2007642686367035,
      "learning_rate": 0.0001,
      "loss": 1.2877,
      "step": 20
    },
    {
      "epoch": 0.10144927536231885,
      "grad_norm": 0.2327784299850464,
      "learning_rate": 9.946524064171123e-05,
      "loss": 1.2899,
      "step": 21
    },
    {
      "epoch": 0.10628019323671498,
      "grad_norm": 0.20648740231990814,
      "learning_rate": 9.893048128342246e-05,
      "loss": 1.287,
      "step": 22
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.22094646096229553,
      "learning_rate": 9.83957219251337e-05,
      "loss": 1.2873,
      "step": 23
    },
    {
      "epoch": 0.11594202898550725,
      "grad_norm": 0.18131175637245178,
      "learning_rate": 9.786096256684493e-05,
      "loss": 1.2594,
      "step": 24
    },
    {
      "epoch": 0.12077294685990338,
      "grad_norm": 0.16657911241054535,
      "learning_rate": 9.732620320855615e-05,
      "loss": 1.2639,
      "step": 25
    },
    {
      "epoch": 0.12560386473429952,
      "grad_norm": 0.1740303933620453,
      "learning_rate": 9.679144385026739e-05,
      "loss": 1.2603,
      "step": 26
    },
    {
      "epoch": 0.13043478260869565,
      "grad_norm": 0.15640808641910553,
      "learning_rate": 9.625668449197861e-05,
      "loss": 1.274,
      "step": 27
    },
    {
      "epoch": 0.13526570048309178,
      "grad_norm": 0.16680403053760529,
      "learning_rate": 9.572192513368984e-05,
      "loss": 1.2724,
      "step": 28
    },
    {
      "epoch": 0.14009661835748793,
      "grad_norm": 0.15997755527496338,
      "learning_rate": 9.518716577540108e-05,
      "loss": 1.2672,
      "step": 29
    },
    {
      "epoch": 0.14492753623188406,
      "grad_norm": 0.15305301547050476,
      "learning_rate": 9.46524064171123e-05,
      "loss": 1.2718,
      "step": 30
    },
    {
      "epoch": 0.1497584541062802,
      "grad_norm": 0.14839769899845123,
      "learning_rate": 9.411764705882353e-05,
      "loss": 1.2707,
      "step": 31
    },
    {
      "epoch": 0.15458937198067632,
      "grad_norm": 0.14878958463668823,
      "learning_rate": 9.358288770053476e-05,
      "loss": 1.2648,
      "step": 32
    },
    {
      "epoch": 0.15942028985507245,
      "grad_norm": 0.17154482007026672,
      "learning_rate": 9.3048128342246e-05,
      "loss": 1.2641,
      "step": 33
    },
    {
      "epoch": 0.1642512077294686,
      "grad_norm": 0.1447138488292694,
      "learning_rate": 9.251336898395723e-05,
      "loss": 1.27,
      "step": 34
    },
    {
      "epoch": 0.16908212560386474,
      "grad_norm": 0.1631896197795868,
      "learning_rate": 9.197860962566846e-05,
      "loss": 1.276,
      "step": 35
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 0.14892889559268951,
      "learning_rate": 9.144385026737968e-05,
      "loss": 1.2747,
      "step": 36
    },
    {
      "epoch": 0.178743961352657,
      "grad_norm": 0.1588708907365799,
      "learning_rate": 9.090909090909092e-05,
      "loss": 1.276,
      "step": 37
    },
    {
      "epoch": 0.18357487922705315,
      "grad_norm": 0.151743546128273,
      "learning_rate": 9.037433155080214e-05,
      "loss": 1.2788,
      "step": 38
    },
    {
      "epoch": 0.18840579710144928,
      "grad_norm": 0.15703994035720825,
      "learning_rate": 8.983957219251337e-05,
      "loss": 1.2936,
      "step": 39
    },
    {
      "epoch": 0.1932367149758454,
      "grad_norm": 0.1660437434911728,
      "learning_rate": 8.930481283422461e-05,
      "loss": 1.2824,
      "step": 40
    },
    {
      "epoch": 0.19806763285024154,
      "grad_norm": 0.15268553793430328,
      "learning_rate": 8.877005347593583e-05,
      "loss": 1.3056,
      "step": 41
    },
    {
      "epoch": 0.2028985507246377,
      "grad_norm": 0.1577601134777069,
      "learning_rate": 8.823529411764706e-05,
      "loss": 1.3142,
      "step": 42
    },
    {
      "epoch": 0.20772946859903382,
      "grad_norm": 0.16757714748382568,
      "learning_rate": 8.770053475935829e-05,
      "loss": 1.3389,
      "step": 43
    },
    {
      "epoch": 0.21256038647342995,
      "grad_norm": 0.1712018847465515,
      "learning_rate": 8.716577540106952e-05,
      "loss": 1.3437,
      "step": 44
    },
    {
      "epoch": 0.21739130434782608,
      "grad_norm": 0.1829441487789154,
      "learning_rate": 8.663101604278076e-05,
      "loss": 1.358,
      "step": 45
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.20615732669830322,
      "learning_rate": 8.609625668449198e-05,
      "loss": 1.3878,
      "step": 46
    },
    {
      "epoch": 0.22705314009661837,
      "grad_norm": 0.23940807580947876,
      "learning_rate": 8.556149732620321e-05,
      "loss": 1.4699,
      "step": 47
    },
    {
      "epoch": 0.2318840579710145,
      "grad_norm": 0.41468575596809387,
      "learning_rate": 8.502673796791443e-05,
      "loss": 1.4893,
      "step": 48
    },
    {
      "epoch": 0.23671497584541062,
      "grad_norm": 0.5656126737594604,
      "learning_rate": 8.449197860962568e-05,
      "loss": 1.4967,
      "step": 49
    },
    {
      "epoch": 0.24154589371980675,
      "grad_norm": 4.2125325202941895,
      "learning_rate": 8.39572192513369e-05,
      "loss": 1.6295,
      "step": 50
    },
    {
      "epoch": 0.2463768115942029,
      "grad_norm": 1.100631833076477,
      "learning_rate": 8.342245989304814e-05,
      "loss": 1.0523,
      "step": 51
    },
    {
      "epoch": 0.25120772946859904,
      "grad_norm": 0.44898825883865356,
      "learning_rate": 8.288770053475936e-05,
      "loss": 1.106,
      "step": 52
    },
    {
      "epoch": 0.2560386473429952,
      "grad_norm": 0.2860921025276184,
      "learning_rate": 8.23529411764706e-05,
      "loss": 1.1398,
      "step": 53
    },
    {
      "epoch": 0.2608695652173913,
      "grad_norm": 0.28824105858802795,
      "learning_rate": 8.181818181818183e-05,
      "loss": 1.1547,
      "step": 54
    },
    {
      "epoch": 0.26570048309178745,
      "grad_norm": 0.32123416662216187,
      "learning_rate": 8.128342245989305e-05,
      "loss": 1.1646,
      "step": 55
    },
    {
      "epoch": 0.27053140096618356,
      "grad_norm": 0.2752850353717804,
      "learning_rate": 8.074866310160429e-05,
      "loss": 1.1752,
      "step": 56
    },
    {
      "epoch": 0.2753623188405797,
      "grad_norm": 0.22371803224086761,
      "learning_rate": 8.021390374331551e-05,
      "loss": 1.1934,
      "step": 57
    },
    {
      "epoch": 0.28019323671497587,
      "grad_norm": 0.23126192390918732,
      "learning_rate": 7.967914438502674e-05,
      "loss": 1.2057,
      "step": 58
    },
    {
      "epoch": 0.28502415458937197,
      "grad_norm": 0.24694480001926422,
      "learning_rate": 7.914438502673798e-05,
      "loss": 1.1858,
      "step": 59
    },
    {
      "epoch": 0.2898550724637681,
      "grad_norm": 0.20105589926242828,
      "learning_rate": 7.86096256684492e-05,
      "loss": 1.202,
      "step": 60
    },
    {
      "epoch": 0.2946859903381642,
      "grad_norm": 0.15975232422351837,
      "learning_rate": 7.807486631016043e-05,
      "loss": 1.2014,
      "step": 61
    },
    {
      "epoch": 0.2995169082125604,
      "grad_norm": 0.17269295454025269,
      "learning_rate": 7.754010695187165e-05,
      "loss": 1.1878,
      "step": 62
    },
    {
      "epoch": 0.30434782608695654,
      "grad_norm": 0.1990584284067154,
      "learning_rate": 7.700534759358289e-05,
      "loss": 1.1951,
      "step": 63
    },
    {
      "epoch": 0.30917874396135264,
      "grad_norm": 0.18062998354434967,
      "learning_rate": 7.647058823529411e-05,
      "loss": 1.1978,
      "step": 64
    },
    {
      "epoch": 0.3140096618357488,
      "grad_norm": 0.15606802701950073,
      "learning_rate": 7.593582887700536e-05,
      "loss": 1.2153,
      "step": 65
    },
    {
      "epoch": 0.3188405797101449,
      "grad_norm": 0.1434660404920578,
      "learning_rate": 7.540106951871658e-05,
      "loss": 1.2074,
      "step": 66
    },
    {
      "epoch": 0.32367149758454106,
      "grad_norm": 0.1473468840122223,
      "learning_rate": 7.486631016042782e-05,
      "loss": 1.1962,
      "step": 67
    },
    {
      "epoch": 0.3285024154589372,
      "grad_norm": 0.1452961415052414,
      "learning_rate": 7.433155080213904e-05,
      "loss": 1.209,
      "step": 68
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.14672888815402985,
      "learning_rate": 7.379679144385027e-05,
      "loss": 1.2126,
      "step": 69
    },
    {
      "epoch": 0.33816425120772947,
      "grad_norm": 0.14124266803264618,
      "learning_rate": 7.326203208556151e-05,
      "loss": 1.2111,
      "step": 70
    },
    {
      "epoch": 0.34299516908212563,
      "grad_norm": 0.13139352202415466,
      "learning_rate": 7.272727272727273e-05,
      "loss": 1.216,
      "step": 71
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 0.1385214626789093,
      "learning_rate": 7.219251336898396e-05,
      "loss": 1.2199,
      "step": 72
    },
    {
      "epoch": 0.3526570048309179,
      "grad_norm": 0.1288975179195404,
      "learning_rate": 7.165775401069518e-05,
      "loss": 1.201,
      "step": 73
    },
    {
      "epoch": 0.357487922705314,
      "grad_norm": 0.13003361225128174,
      "learning_rate": 7.112299465240642e-05,
      "loss": 1.2186,
      "step": 74
    },
    {
      "epoch": 0.36231884057971014,
      "grad_norm": 0.13762855529785156,
      "learning_rate": 7.058823529411765e-05,
      "loss": 1.2209,
      "step": 75
    },
    {
      "epoch": 0.3671497584541063,
      "grad_norm": 0.13935087621212006,
      "learning_rate": 7.005347593582889e-05,
      "loss": 1.2219,
      "step": 76
    },
    {
      "epoch": 0.3719806763285024,
      "grad_norm": 0.13384683430194855,
      "learning_rate": 6.951871657754011e-05,
      "loss": 1.2419,
      "step": 77
    },
    {
      "epoch": 0.37681159420289856,
      "grad_norm": 0.12453139573335648,
      "learning_rate": 6.898395721925133e-05,
      "loss": 1.2154,
      "step": 78
    },
    {
      "epoch": 0.38164251207729466,
      "grad_norm": 0.13903535902500153,
      "learning_rate": 6.844919786096257e-05,
      "loss": 1.2378,
      "step": 79
    },
    {
      "epoch": 0.3864734299516908,
      "grad_norm": 0.13833968341350555,
      "learning_rate": 6.79144385026738e-05,
      "loss": 1.2244,
      "step": 80
    },
    {
      "epoch": 0.391304347826087,
      "grad_norm": 0.13052114844322205,
      "learning_rate": 6.737967914438504e-05,
      "loss": 1.226,
      "step": 81
    },
    {
      "epoch": 0.3961352657004831,
      "grad_norm": 0.13437196612358093,
      "learning_rate": 6.684491978609626e-05,
      "loss": 1.2457,
      "step": 82
    },
    {
      "epoch": 0.40096618357487923,
      "grad_norm": 0.13693881034851074,
      "learning_rate": 6.631016042780749e-05,
      "loss": 1.2384,
      "step": 83
    },
    {
      "epoch": 0.4057971014492754,
      "grad_norm": 0.13426737487316132,
      "learning_rate": 6.577540106951871e-05,
      "loss": 1.2427,
      "step": 84
    },
    {
      "epoch": 0.4106280193236715,
      "grad_norm": 0.13844414055347443,
      "learning_rate": 6.524064171122995e-05,
      "loss": 1.2661,
      "step": 85
    },
    {
      "epoch": 0.41545893719806765,
      "grad_norm": 0.13957887887954712,
      "learning_rate": 6.470588235294118e-05,
      "loss": 1.2696,
      "step": 86
    },
    {
      "epoch": 0.42028985507246375,
      "grad_norm": 0.13465899229049683,
      "learning_rate": 6.41711229946524e-05,
      "loss": 1.2591,
      "step": 87
    },
    {
      "epoch": 0.4251207729468599,
      "grad_norm": 0.1441555917263031,
      "learning_rate": 6.363636363636364e-05,
      "loss": 1.2766,
      "step": 88
    },
    {
      "epoch": 0.42995169082125606,
      "grad_norm": 0.1500505656003952,
      "learning_rate": 6.310160427807486e-05,
      "loss": 1.2887,
      "step": 89
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 0.15137352049350739,
      "learning_rate": 6.25668449197861e-05,
      "loss": 1.2792,
      "step": 90
    },
    {
      "epoch": 0.4396135265700483,
      "grad_norm": 0.15071454644203186,
      "learning_rate": 6.203208556149733e-05,
      "loss": 1.2876,
      "step": 91
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.1570783108472824,
      "learning_rate": 6.149732620320857e-05,
      "loss": 1.3223,
      "step": 92
    },
    {
      "epoch": 0.4492753623188406,
      "grad_norm": 0.16483648121356964,
      "learning_rate": 6.096256684491979e-05,
      "loss": 1.3285,
      "step": 93
    },
    {
      "epoch": 0.45410628019323673,
      "grad_norm": 0.1700102537870407,
      "learning_rate": 6.0427807486631016e-05,
      "loss": 1.3349,
      "step": 94
    },
    {
      "epoch": 0.45893719806763283,
      "grad_norm": 0.1778935194015503,
      "learning_rate": 5.9893048128342244e-05,
      "loss": 1.3644,
      "step": 95
    },
    {
      "epoch": 0.463768115942029,
      "grad_norm": 0.19471201300621033,
      "learning_rate": 5.9358288770053486e-05,
      "loss": 1.4028,
      "step": 96
    },
    {
      "epoch": 0.46859903381642515,
      "grad_norm": 0.25646305084228516,
      "learning_rate": 5.882352941176471e-05,
      "loss": 1.4694,
      "step": 97
    },
    {
      "epoch": 0.47342995169082125,
      "grad_norm": 0.277474045753479,
      "learning_rate": 5.8288770053475936e-05,
      "loss": 1.5052,
      "step": 98
    },
    {
      "epoch": 0.4782608695652174,
      "grad_norm": 0.33792170882225037,
      "learning_rate": 5.7754010695187164e-05,
      "loss": 1.5414,
      "step": 99
    },
    {
      "epoch": 0.4830917874396135,
      "grad_norm": 0.6432341933250427,
      "learning_rate": 5.721925133689839e-05,
      "loss": 1.6046,
      "step": 100
    },
    {
      "epoch": 0.48792270531400966,
      "grad_norm": 0.5846565365791321,
      "learning_rate": 5.6684491978609634e-05,
      "loss": 0.9549,
      "step": 101
    },
    {
      "epoch": 0.4927536231884058,
      "grad_norm": 0.49914559721946716,
      "learning_rate": 5.614973262032086e-05,
      "loss": 1.0978,
      "step": 102
    },
    {
      "epoch": 0.4975845410628019,
      "grad_norm": 0.33365777134895325,
      "learning_rate": 5.561497326203209e-05,
      "loss": 1.1242,
      "step": 103
    },
    {
      "epoch": 0.5024154589371981,
      "grad_norm": 0.7763075828552246,
      "learning_rate": 5.508021390374332e-05,
      "loss": 1.1525,
      "step": 104
    },
    {
      "epoch": 0.5072463768115942,
      "grad_norm": 0.2733679413795471,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 1.1607,
      "step": 105
    },
    {
      "epoch": 0.5120772946859904,
      "grad_norm": 0.26070019602775574,
      "learning_rate": 5.401069518716578e-05,
      "loss": 1.1609,
      "step": 106
    },
    {
      "epoch": 0.5169082125603864,
      "grad_norm": 0.2708110809326172,
      "learning_rate": 5.347593582887701e-05,
      "loss": 1.1815,
      "step": 107
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 0.26362475752830505,
      "learning_rate": 5.294117647058824e-05,
      "loss": 1.1943,
      "step": 108
    },
    {
      "epoch": 0.5265700483091788,
      "grad_norm": 0.21209686994552612,
      "learning_rate": 5.2406417112299466e-05,
      "loss": 1.1944,
      "step": 109
    },
    {
      "epoch": 0.5314009661835749,
      "grad_norm": 0.21895644068717957,
      "learning_rate": 5.1871657754010694e-05,
      "loss": 1.192,
      "step": 110
    },
    {
      "epoch": 0.5362318840579711,
      "grad_norm": 0.20762521028518677,
      "learning_rate": 5.1336898395721935e-05,
      "loss": 1.1987,
      "step": 111
    },
    {
      "epoch": 0.5410628019323671,
      "grad_norm": 0.18395845592021942,
      "learning_rate": 5.0802139037433164e-05,
      "loss": 1.1925,
      "step": 112
    },
    {
      "epoch": 0.5458937198067633,
      "grad_norm": 0.1653558313846588,
      "learning_rate": 5.026737967914439e-05,
      "loss": 1.1961,
      "step": 113
    },
    {
      "epoch": 0.5507246376811594,
      "grad_norm": 0.1628160923719406,
      "learning_rate": 4.973262032085561e-05,
      "loss": 1.1993,
      "step": 114
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.17022235691547394,
      "learning_rate": 4.919786096256685e-05,
      "loss": 1.2101,
      "step": 115
    },
    {
      "epoch": 0.5603864734299517,
      "grad_norm": 0.1710771918296814,
      "learning_rate": 4.8663101604278076e-05,
      "loss": 1.2059,
      "step": 116
    },
    {
      "epoch": 0.5652173913043478,
      "grad_norm": 0.17160499095916748,
      "learning_rate": 4.8128342245989304e-05,
      "loss": 1.206,
      "step": 117
    },
    {
      "epoch": 0.5700483091787439,
      "grad_norm": 0.15871083736419678,
      "learning_rate": 4.759358288770054e-05,
      "loss": 1.2067,
      "step": 118
    },
    {
      "epoch": 0.5748792270531401,
      "grad_norm": 0.14880803227424622,
      "learning_rate": 4.705882352941177e-05,
      "loss": 1.218,
      "step": 119
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.14413942396640778,
      "learning_rate": 4.6524064171123e-05,
      "loss": 1.1958,
      "step": 120
    },
    {
      "epoch": 0.5845410628019324,
      "grad_norm": 0.14965958893299103,
      "learning_rate": 4.598930481283423e-05,
      "loss": 1.2206,
      "step": 121
    },
    {
      "epoch": 0.5893719806763285,
      "grad_norm": 0.14546802639961243,
      "learning_rate": 4.545454545454546e-05,
      "loss": 1.2107,
      "step": 122
    },
    {
      "epoch": 0.5942028985507246,
      "grad_norm": 0.14043672382831573,
      "learning_rate": 4.491978609625669e-05,
      "loss": 1.2059,
      "step": 123
    },
    {
      "epoch": 0.5990338164251208,
      "grad_norm": 0.1403893083333969,
      "learning_rate": 4.4385026737967915e-05,
      "loss": 1.2327,
      "step": 124
    },
    {
      "epoch": 0.6038647342995169,
      "grad_norm": 0.13266952335834503,
      "learning_rate": 4.385026737967914e-05,
      "loss": 1.194,
      "step": 125
    },
    {
      "epoch": 0.6086956521739131,
      "grad_norm": 0.1347023993730545,
      "learning_rate": 4.331550802139038e-05,
      "loss": 1.1951,
      "step": 126
    },
    {
      "epoch": 0.6135265700483091,
      "grad_norm": 0.13116984069347382,
      "learning_rate": 4.2780748663101606e-05,
      "loss": 1.2206,
      "step": 127
    },
    {
      "epoch": 0.6183574879227053,
      "grad_norm": 0.14027127623558044,
      "learning_rate": 4.224598930481284e-05,
      "loss": 1.2304,
      "step": 128
    },
    {
      "epoch": 0.6231884057971014,
      "grad_norm": 0.13990454375743866,
      "learning_rate": 4.171122994652407e-05,
      "loss": 1.2229,
      "step": 129
    },
    {
      "epoch": 0.6280193236714976,
      "grad_norm": 0.13515028357505798,
      "learning_rate": 4.11764705882353e-05,
      "loss": 1.2191,
      "step": 130
    },
    {
      "epoch": 0.6328502415458938,
      "grad_norm": 0.1329352706670761,
      "learning_rate": 4.0641711229946525e-05,
      "loss": 1.2308,
      "step": 131
    },
    {
      "epoch": 0.6376811594202898,
      "grad_norm": 0.13061358034610748,
      "learning_rate": 4.0106951871657754e-05,
      "loss": 1.2446,
      "step": 132
    },
    {
      "epoch": 0.642512077294686,
      "grad_norm": 0.13514551520347595,
      "learning_rate": 3.957219251336899e-05,
      "loss": 1.2425,
      "step": 133
    },
    {
      "epoch": 0.6473429951690821,
      "grad_norm": 0.13962285220623016,
      "learning_rate": 3.903743315508022e-05,
      "loss": 1.2574,
      "step": 134
    },
    {
      "epoch": 0.6521739130434783,
      "grad_norm": 0.14081381261348724,
      "learning_rate": 3.8502673796791445e-05,
      "loss": 1.247,
      "step": 135
    },
    {
      "epoch": 0.6570048309178744,
      "grad_norm": 0.1396479606628418,
      "learning_rate": 3.796791443850268e-05,
      "loss": 1.2379,
      "step": 136
    },
    {
      "epoch": 0.6618357487922706,
      "grad_norm": 0.13990682363510132,
      "learning_rate": 3.743315508021391e-05,
      "loss": 1.2637,
      "step": 137
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.1405869573354721,
      "learning_rate": 3.6898395721925136e-05,
      "loss": 1.2458,
      "step": 138
    },
    {
      "epoch": 0.6714975845410628,
      "grad_norm": 0.14569957554340363,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 1.2755,
      "step": 139
    },
    {
      "epoch": 0.6763285024154589,
      "grad_norm": 0.14868015050888062,
      "learning_rate": 3.582887700534759e-05,
      "loss": 1.2836,
      "step": 140
    },
    {
      "epoch": 0.6811594202898551,
      "grad_norm": 0.1531868726015091,
      "learning_rate": 3.529411764705883e-05,
      "loss": 1.295,
      "step": 141
    },
    {
      "epoch": 0.6859903381642513,
      "grad_norm": 0.16108393669128418,
      "learning_rate": 3.4759358288770055e-05,
      "loss": 1.3239,
      "step": 142
    },
    {
      "epoch": 0.6908212560386473,
      "grad_norm": 0.1609143316745758,
      "learning_rate": 3.4224598930481284e-05,
      "loss": 1.3301,
      "step": 143
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 0.16705213487148285,
      "learning_rate": 3.368983957219252e-05,
      "loss": 1.3484,
      "step": 144
    },
    {
      "epoch": 0.7004830917874396,
      "grad_norm": 0.18058659136295319,
      "learning_rate": 3.3155080213903747e-05,
      "loss": 1.3794,
      "step": 145
    },
    {
      "epoch": 0.7053140096618358,
      "grad_norm": 0.20221418142318726,
      "learning_rate": 3.2620320855614975e-05,
      "loss": 1.4019,
      "step": 146
    },
    {
      "epoch": 0.7101449275362319,
      "grad_norm": 0.24968379735946655,
      "learning_rate": 3.20855614973262e-05,
      "loss": 1.4674,
      "step": 147
    },
    {
      "epoch": 0.714975845410628,
      "grad_norm": 0.3043461740016937,
      "learning_rate": 3.155080213903743e-05,
      "loss": 1.4972,
      "step": 148
    },
    {
      "epoch": 0.7198067632850241,
      "grad_norm": 0.33808770775794983,
      "learning_rate": 3.1016042780748666e-05,
      "loss": 1.5237,
      "step": 149
    },
    {
      "epoch": 0.7246376811594203,
      "grad_norm": 0.5125290155410767,
      "learning_rate": 3.0481283422459894e-05,
      "loss": 1.5533,
      "step": 150
    },
    {
      "epoch": 0.7294685990338164,
      "grad_norm": 0.25133025646209717,
      "learning_rate": 2.9946524064171122e-05,
      "loss": 0.8978,
      "step": 151
    },
    {
      "epoch": 0.7342995169082126,
      "grad_norm": 0.30135515332221985,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 1.0278,
      "step": 152
    },
    {
      "epoch": 0.7391304347826086,
      "grad_norm": 0.2961145043373108,
      "learning_rate": 2.8877005347593582e-05,
      "loss": 1.1024,
      "step": 153
    },
    {
      "epoch": 0.7439613526570048,
      "grad_norm": 0.273294597864151,
      "learning_rate": 2.8342245989304817e-05,
      "loss": 1.1373,
      "step": 154
    },
    {
      "epoch": 0.748792270531401,
      "grad_norm": 0.23799936473369598,
      "learning_rate": 2.7807486631016045e-05,
      "loss": 1.1419,
      "step": 155
    },
    {
      "epoch": 0.7536231884057971,
      "grad_norm": 0.21061824262142181,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 1.1512,
      "step": 156
    },
    {
      "epoch": 0.7584541062801933,
      "grad_norm": 0.21470795571804047,
      "learning_rate": 2.6737967914438505e-05,
      "loss": 1.1616,
      "step": 157
    },
    {
      "epoch": 0.7632850241545893,
      "grad_norm": 0.21231332421302795,
      "learning_rate": 2.6203208556149733e-05,
      "loss": 1.1662,
      "step": 158
    },
    {
      "epoch": 0.7681159420289855,
      "grad_norm": 0.20699279010295868,
      "learning_rate": 2.5668449197860968e-05,
      "loss": 1.1716,
      "step": 159
    },
    {
      "epoch": 0.7729468599033816,
      "grad_norm": 0.20150579512119293,
      "learning_rate": 2.5133689839572196e-05,
      "loss": 1.1836,
      "step": 160
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.1880892962217331,
      "learning_rate": 2.4598930481283424e-05,
      "loss": 1.1794,
      "step": 161
    },
    {
      "epoch": 0.782608695652174,
      "grad_norm": 0.17840184271335602,
      "learning_rate": 2.4064171122994652e-05,
      "loss": 1.1757,
      "step": 162
    },
    {
      "epoch": 0.7874396135265701,
      "grad_norm": 0.18409621715545654,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 1.1935,
      "step": 163
    },
    {
      "epoch": 0.7922705314009661,
      "grad_norm": 0.18802137672901154,
      "learning_rate": 2.2994652406417115e-05,
      "loss": 1.1795,
      "step": 164
    },
    {
      "epoch": 0.7971014492753623,
      "grad_norm": 0.1941538006067276,
      "learning_rate": 2.2459893048128343e-05,
      "loss": 1.2069,
      "step": 165
    },
    {
      "epoch": 0.8019323671497585,
      "grad_norm": 0.18578243255615234,
      "learning_rate": 2.192513368983957e-05,
      "loss": 1.1965,
      "step": 166
    },
    {
      "epoch": 0.8067632850241546,
      "grad_norm": 0.17622320353984833,
      "learning_rate": 2.1390374331550803e-05,
      "loss": 1.1839,
      "step": 167
    },
    {
      "epoch": 0.8115942028985508,
      "grad_norm": 0.16080059111118317,
      "learning_rate": 2.0855614973262035e-05,
      "loss": 1.1926,
      "step": 168
    },
    {
      "epoch": 0.8164251207729468,
      "grad_norm": 0.14835765957832336,
      "learning_rate": 2.0320855614973263e-05,
      "loss": 1.1958,
      "step": 169
    },
    {
      "epoch": 0.821256038647343,
      "grad_norm": 0.14560697972774506,
      "learning_rate": 1.9786096256684494e-05,
      "loss": 1.2047,
      "step": 170
    }
  ],
  "logging_steps": 1,
  "max_steps": 207,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.493930894749139e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}