diff --git "a/adapter_info/trainer_state.json" "b/adapter_info/trainer_state.json" new file mode 100644--- /dev/null +++ "b/adapter_info/trainer_state.json" @@ -0,0 +1,10948 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1558, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006418485237483953, + "grad_norm": 0.32000207901000977, + "learning_rate": 2.1276595744680853e-06, + "loss": 9.1875, + "step": 1 + }, + { + "epoch": 0.0012836970474967907, + "grad_norm": 0.31870341300964355, + "learning_rate": 4.255319148936171e-06, + "loss": 9.25, + "step": 2 + }, + { + "epoch": 0.0019255455712451862, + "grad_norm": 0.30978211760520935, + "learning_rate": 6.3829787234042555e-06, + "loss": 9.125, + "step": 3 + }, + { + "epoch": 0.0025673940949935813, + "grad_norm": 0.3134618401527405, + "learning_rate": 8.510638297872341e-06, + "loss": 9.125, + "step": 4 + }, + { + "epoch": 0.003209242618741977, + "grad_norm": 0.3147919178009033, + "learning_rate": 1.0638297872340426e-05, + "loss": 9.125, + "step": 5 + }, + { + "epoch": 0.0038510911424903724, + "grad_norm": 0.3050297796726227, + "learning_rate": 1.2765957446808511e-05, + "loss": 9.125, + "step": 6 + }, + { + "epoch": 0.004492939666238768, + "grad_norm": 0.29189687967300415, + "learning_rate": 1.4893617021276596e-05, + "loss": 9.125, + "step": 7 + }, + { + "epoch": 0.005134788189987163, + "grad_norm": 0.2791145145893097, + "learning_rate": 1.7021276595744682e-05, + "loss": 9.0, + "step": 8 + }, + { + "epoch": 0.005776636713735558, + "grad_norm": 0.24908146262168884, + "learning_rate": 1.9148936170212766e-05, + "loss": 8.875, + "step": 9 + }, + { + "epoch": 0.006418485237483954, + "grad_norm": 0.2112804502248764, + "learning_rate": 2.1276595744680852e-05, + "loss": 8.8125, + "step": 10 + }, + { + "epoch": 0.007060333761232349, + "grad_norm": 0.15402039885520935, + "learning_rate": 2.340425531914894e-05, + "loss": 8.625, + "step": 11 + }, + { + "epoch": 0.007702182284980745, + "grad_norm": 0.10005302727222443, + "learning_rate": 2.5531914893617022e-05, + "loss": 8.5, + "step": 12 + }, + { + "epoch": 0.00834403080872914, + "grad_norm": 0.061644140630960464, + "learning_rate": 2.765957446808511e-05, + "loss": 8.4375, + "step": 13 + }, + { + "epoch": 0.008985879332477536, + "grad_norm": 0.04229239746928215, + "learning_rate": 2.9787234042553192e-05, + "loss": 8.375, + "step": 14 + }, + { + "epoch": 0.009627727856225931, + "grad_norm": 0.04680441692471504, + "learning_rate": 3.191489361702128e-05, + "loss": 8.375, + "step": 15 + }, + { + "epoch": 0.010269576379974325, + "grad_norm": 0.0876237079501152, + "learning_rate": 3.4042553191489365e-05, + "loss": 8.3125, + "step": 16 + }, + { + "epoch": 0.010911424903722721, + "grad_norm": 0.12335914373397827, + "learning_rate": 3.617021276595745e-05, + "loss": 8.375, + "step": 17 + }, + { + "epoch": 0.011553273427471117, + "grad_norm": 0.0966993197798729, + "learning_rate": 3.829787234042553e-05, + "loss": 8.3125, + "step": 18 + }, + { + "epoch": 0.012195121951219513, + "grad_norm": 0.047657210379838943, + "learning_rate": 4.0425531914893614e-05, + "loss": 8.25, + "step": 19 + }, + { + "epoch": 0.012836970474967908, + "grad_norm": 0.024272233247756958, + "learning_rate": 4.2553191489361704e-05, + "loss": 8.25, + "step": 20 + }, + { + "epoch": 0.013478818998716302, + "grad_norm": 0.02427881769835949, + "learning_rate": 4.468085106382979e-05, + "loss": 8.25, + "step": 21 + }, + { + "epoch": 0.014120667522464698, + "grad_norm": 0.026496585458517075, + "learning_rate": 4.680851063829788e-05, + "loss": 8.25, + "step": 22 + }, + { + "epoch": 0.014762516046213094, + "grad_norm": 0.03327532112598419, + "learning_rate": 4.893617021276596e-05, + "loss": 8.25, + "step": 23 + }, + { + "epoch": 0.01540436456996149, + "grad_norm": 0.03587386757135391, + "learning_rate": 5.1063829787234044e-05, + "loss": 8.25, + "step": 24 + }, + { + "epoch": 0.016046213093709884, + "grad_norm": 0.03461160138249397, + "learning_rate": 5.319148936170213e-05, + "loss": 8.25, + "step": 25 + }, + { + "epoch": 0.01668806161745828, + "grad_norm": 0.029515139758586884, + "learning_rate": 5.531914893617022e-05, + "loss": 8.25, + "step": 26 + }, + { + "epoch": 0.017329910141206675, + "grad_norm": 0.027110258117318153, + "learning_rate": 5.744680851063831e-05, + "loss": 8.25, + "step": 27 + }, + { + "epoch": 0.01797175866495507, + "grad_norm": 0.02379121631383896, + "learning_rate": 5.9574468085106384e-05, + "loss": 8.1875, + "step": 28 + }, + { + "epoch": 0.018613607188703467, + "grad_norm": 0.02544436603784561, + "learning_rate": 6.170212765957447e-05, + "loss": 8.1875, + "step": 29 + }, + { + "epoch": 0.019255455712451863, + "grad_norm": 0.030174219980835915, + "learning_rate": 6.382978723404256e-05, + "loss": 8.1875, + "step": 30 + }, + { + "epoch": 0.01989730423620026, + "grad_norm": 0.03604472056031227, + "learning_rate": 6.595744680851063e-05, + "loss": 8.1875, + "step": 31 + }, + { + "epoch": 0.02053915275994865, + "grad_norm": 0.037900377064943314, + "learning_rate": 6.808510638297873e-05, + "loss": 8.125, + "step": 32 + }, + { + "epoch": 0.021181001283697046, + "grad_norm": 0.03888288140296936, + "learning_rate": 7.021276595744681e-05, + "loss": 8.125, + "step": 33 + }, + { + "epoch": 0.021822849807445442, + "grad_norm": 0.04872816056013107, + "learning_rate": 7.23404255319149e-05, + "loss": 8.0625, + "step": 34 + }, + { + "epoch": 0.022464698331193838, + "grad_norm": 0.07964374125003815, + "learning_rate": 7.446808510638298e-05, + "loss": 7.9375, + "step": 35 + }, + { + "epoch": 0.023106546854942234, + "grad_norm": 0.08704540133476257, + "learning_rate": 7.659574468085106e-05, + "loss": 7.875, + "step": 36 + }, + { + "epoch": 0.02374839537869063, + "grad_norm": 0.14245420694351196, + "learning_rate": 7.872340425531916e-05, + "loss": 7.625, + "step": 37 + }, + { + "epoch": 0.024390243902439025, + "grad_norm": 0.20091165602207184, + "learning_rate": 8.085106382978723e-05, + "loss": 7.25, + "step": 38 + }, + { + "epoch": 0.02503209242618742, + "grad_norm": 0.3081730902194977, + "learning_rate": 8.297872340425533e-05, + "loss": 6.4688, + "step": 39 + }, + { + "epoch": 0.025673940949935817, + "grad_norm": 0.3360954523086548, + "learning_rate": 8.510638297872341e-05, + "loss": 5.25, + "step": 40 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 0.28741300106048584, + "learning_rate": 8.723404255319149e-05, + "loss": 4.0625, + "step": 41 + }, + { + "epoch": 0.026957637997432605, + "grad_norm": 0.2542773187160492, + "learning_rate": 8.936170212765958e-05, + "loss": 3.1094, + "step": 42 + }, + { + "epoch": 0.027599486521181, + "grad_norm": 0.16312512755393982, + "learning_rate": 9.148936170212766e-05, + "loss": 2.5, + "step": 43 + }, + { + "epoch": 0.028241335044929396, + "grad_norm": 0.10257931798696518, + "learning_rate": 9.361702127659576e-05, + "loss": 2.1562, + "step": 44 + }, + { + "epoch": 0.028883183568677792, + "grad_norm": 0.07631522417068481, + "learning_rate": 9.574468085106384e-05, + "loss": 2.0, + "step": 45 + }, + { + "epoch": 0.029525032092426188, + "grad_norm": 0.06334506720304489, + "learning_rate": 9.787234042553192e-05, + "loss": 1.8281, + "step": 46 + }, + { + "epoch": 0.030166880616174584, + "grad_norm": 0.05079680308699608, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 47 + }, + { + "epoch": 0.03080872913992298, + "grad_norm": 0.050113581120967865, + "learning_rate": 9.999989192862701e-05, + "loss": 1.6719, + "step": 48 + }, + { + "epoch": 0.031450577663671375, + "grad_norm": 0.041200995445251465, + "learning_rate": 9.999956771497528e-05, + "loss": 1.6094, + "step": 49 + }, + { + "epoch": 0.03209242618741977, + "grad_norm": 0.03537944331765175, + "learning_rate": 9.999902736044627e-05, + "loss": 1.5312, + "step": 50 + }, + { + "epoch": 0.03273427471116817, + "grad_norm": 0.03154836595058441, + "learning_rate": 9.999827086737589e-05, + "loss": 1.5, + "step": 51 + }, + { + "epoch": 0.03337612323491656, + "grad_norm": 0.027803990989923477, + "learning_rate": 9.999729823903436e-05, + "loss": 1.4766, + "step": 52 + }, + { + "epoch": 0.03401797175866496, + "grad_norm": 0.023708201944828033, + "learning_rate": 9.99961094796262e-05, + "loss": 1.375, + "step": 53 + }, + { + "epoch": 0.03465982028241335, + "grad_norm": 0.021473145112395287, + "learning_rate": 9.999470459429021e-05, + "loss": 1.3281, + "step": 54 + }, + { + "epoch": 0.03530166880616174, + "grad_norm": 0.019184721633791924, + "learning_rate": 9.999308358909955e-05, + "loss": 1.2891, + "step": 55 + }, + { + "epoch": 0.03594351732991014, + "grad_norm": 0.01847364939749241, + "learning_rate": 9.999124647106159e-05, + "loss": 1.2656, + "step": 56 + }, + { + "epoch": 0.036585365853658534, + "grad_norm": 0.016471078619360924, + "learning_rate": 9.99891932481179e-05, + "loss": 1.2344, + "step": 57 + }, + { + "epoch": 0.037227214377406934, + "grad_norm": 0.016381070017814636, + "learning_rate": 9.99869239291443e-05, + "loss": 1.2812, + "step": 58 + }, + { + "epoch": 0.037869062901155326, + "grad_norm": 0.01620948687195778, + "learning_rate": 9.998443852395067e-05, + "loss": 1.2188, + "step": 59 + }, + { + "epoch": 0.038510911424903725, + "grad_norm": 0.01424532663077116, + "learning_rate": 9.998173704328112e-05, + "loss": 1.1797, + "step": 60 + }, + { + "epoch": 0.03915275994865212, + "grad_norm": 0.014445069245994091, + "learning_rate": 9.997881949881371e-05, + "loss": 1.1953, + "step": 61 + }, + { + "epoch": 0.03979460847240052, + "grad_norm": 0.013568165712058544, + "learning_rate": 9.99756859031606e-05, + "loss": 1.1562, + "step": 62 + }, + { + "epoch": 0.04043645699614891, + "grad_norm": 0.01342830155044794, + "learning_rate": 9.997233626986781e-05, + "loss": 1.1719, + "step": 63 + }, + { + "epoch": 0.0410783055198973, + "grad_norm": 0.013186159543693066, + "learning_rate": 9.99687706134154e-05, + "loss": 1.0938, + "step": 64 + }, + { + "epoch": 0.0417201540436457, + "grad_norm": 0.013615522533655167, + "learning_rate": 9.996498894921713e-05, + "loss": 1.1172, + "step": 65 + }, + { + "epoch": 0.04236200256739409, + "grad_norm": 0.011763419024646282, + "learning_rate": 9.996099129362059e-05, + "loss": 1.0781, + "step": 66 + }, + { + "epoch": 0.04300385109114249, + "grad_norm": 0.009965788573026657, + "learning_rate": 9.995677766390707e-05, + "loss": 1.0938, + "step": 67 + }, + { + "epoch": 0.043645699614890884, + "grad_norm": 0.011237729340791702, + "learning_rate": 9.99523480782915e-05, + "loss": 1.0938, + "step": 68 + }, + { + "epoch": 0.044287548138639284, + "grad_norm": 0.010530981235206127, + "learning_rate": 9.994770255592233e-05, + "loss": 1.0938, + "step": 69 + }, + { + "epoch": 0.044929396662387676, + "grad_norm": 0.010274014435708523, + "learning_rate": 9.994284111688145e-05, + "loss": 1.0234, + "step": 70 + }, + { + "epoch": 0.045571245186136075, + "grad_norm": 0.009948200546205044, + "learning_rate": 9.993776378218418e-05, + "loss": 1.0469, + "step": 71 + }, + { + "epoch": 0.04621309370988447, + "grad_norm": 0.010145426727831364, + "learning_rate": 9.99324705737791e-05, + "loss": 1.0625, + "step": 72 + }, + { + "epoch": 0.04685494223363286, + "grad_norm": 0.01015029288828373, + "learning_rate": 9.992696151454799e-05, + "loss": 1.0703, + "step": 73 + }, + { + "epoch": 0.04749679075738126, + "grad_norm": 0.011197876185178757, + "learning_rate": 9.992123662830568e-05, + "loss": 1.0469, + "step": 74 + }, + { + "epoch": 0.04813863928112965, + "grad_norm": 0.010476046241819859, + "learning_rate": 9.991529593980006e-05, + "loss": 1.0234, + "step": 75 + }, + { + "epoch": 0.04878048780487805, + "grad_norm": 0.009835699573159218, + "learning_rate": 9.990913947471184e-05, + "loss": 0.9766, + "step": 76 + }, + { + "epoch": 0.04942233632862644, + "grad_norm": 0.009792305529117584, + "learning_rate": 9.990276725965455e-05, + "loss": 0.9805, + "step": 77 + }, + { + "epoch": 0.05006418485237484, + "grad_norm": 0.00878097303211689, + "learning_rate": 9.989617932217432e-05, + "loss": 0.918, + "step": 78 + }, + { + "epoch": 0.050706033376123234, + "grad_norm": 0.009141056798398495, + "learning_rate": 9.988937569074988e-05, + "loss": 0.9297, + "step": 79 + }, + { + "epoch": 0.051347881899871634, + "grad_norm": 0.008934931829571724, + "learning_rate": 9.98823563947923e-05, + "loss": 0.9727, + "step": 80 + }, + { + "epoch": 0.051989730423620026, + "grad_norm": 0.008960246108472347, + "learning_rate": 9.987512146464504e-05, + "loss": 0.9922, + "step": 81 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.00925202202051878, + "learning_rate": 9.986767093158359e-05, + "loss": 0.9453, + "step": 82 + }, + { + "epoch": 0.05327342747111682, + "grad_norm": 0.008465278893709183, + "learning_rate": 9.986000482781557e-05, + "loss": 0.9766, + "step": 83 + }, + { + "epoch": 0.05391527599486521, + "grad_norm": 0.008937939070165157, + "learning_rate": 9.985212318648039e-05, + "loss": 0.8906, + "step": 84 + }, + { + "epoch": 0.05455712451861361, + "grad_norm": 0.00869137141853571, + "learning_rate": 9.984402604164928e-05, + "loss": 0.8828, + "step": 85 + }, + { + "epoch": 0.055198973042362, + "grad_norm": 0.00921856239438057, + "learning_rate": 9.983571342832501e-05, + "loss": 0.875, + "step": 86 + }, + { + "epoch": 0.0558408215661104, + "grad_norm": 0.008353888988494873, + "learning_rate": 9.982718538244181e-05, + "loss": 0.9219, + "step": 87 + }, + { + "epoch": 0.05648267008985879, + "grad_norm": 0.008289925754070282, + "learning_rate": 9.981844194086517e-05, + "loss": 0.875, + "step": 88 + }, + { + "epoch": 0.05712451861360719, + "grad_norm": 0.009131287224590778, + "learning_rate": 9.980948314139173e-05, + "loss": 0.9375, + "step": 89 + }, + { + "epoch": 0.057766367137355584, + "grad_norm": 0.008783861063420773, + "learning_rate": 9.980030902274907e-05, + "loss": 0.9141, + "step": 90 + }, + { + "epoch": 0.05840821566110398, + "grad_norm": 0.009424981661140919, + "learning_rate": 9.979091962459557e-05, + "loss": 0.8672, + "step": 91 + }, + { + "epoch": 0.059050064184852376, + "grad_norm": 0.008605397306382656, + "learning_rate": 9.978131498752026e-05, + "loss": 0.8672, + "step": 92 + }, + { + "epoch": 0.05969191270860077, + "grad_norm": 0.007921144366264343, + "learning_rate": 9.977149515304257e-05, + "loss": 0.9102, + "step": 93 + }, + { + "epoch": 0.06033376123234917, + "grad_norm": 0.008213751949369907, + "learning_rate": 9.976146016361223e-05, + "loss": 0.8555, + "step": 94 + }, + { + "epoch": 0.06097560975609756, + "grad_norm": 0.0076583558693528175, + "learning_rate": 9.975121006260905e-05, + "loss": 0.8359, + "step": 95 + }, + { + "epoch": 0.06161745827984596, + "grad_norm": 0.00856377650052309, + "learning_rate": 9.97407448943427e-05, + "loss": 0.8203, + "step": 96 + }, + { + "epoch": 0.06225930680359435, + "grad_norm": 0.008036739192903042, + "learning_rate": 9.973006470405264e-05, + "loss": 0.8516, + "step": 97 + }, + { + "epoch": 0.06290115532734275, + "grad_norm": 0.008434290066361427, + "learning_rate": 9.971916953790772e-05, + "loss": 0.8086, + "step": 98 + }, + { + "epoch": 0.06354300385109114, + "grad_norm": 0.008587395772337914, + "learning_rate": 9.970805944300621e-05, + "loss": 0.8203, + "step": 99 + }, + { + "epoch": 0.06418485237483953, + "grad_norm": 0.0088276332244277, + "learning_rate": 9.96967344673754e-05, + "loss": 0.8086, + "step": 100 + }, + { + "epoch": 0.06482670089858793, + "grad_norm": 0.009358148090541363, + "learning_rate": 9.968519465997155e-05, + "loss": 0.8047, + "step": 101 + }, + { + "epoch": 0.06546854942233633, + "grad_norm": 0.008407055400311947, + "learning_rate": 9.967344007067954e-05, + "loss": 0.7617, + "step": 102 + }, + { + "epoch": 0.06611039794608473, + "grad_norm": 0.00826259609311819, + "learning_rate": 9.96614707503128e-05, + "loss": 0.8359, + "step": 103 + }, + { + "epoch": 0.06675224646983312, + "grad_norm": 0.009418654255568981, + "learning_rate": 9.964928675061291e-05, + "loss": 0.7812, + "step": 104 + }, + { + "epoch": 0.06739409499358151, + "grad_norm": 0.012697753496468067, + "learning_rate": 9.963688812424958e-05, + "loss": 0.7969, + "step": 105 + }, + { + "epoch": 0.06803594351732992, + "grad_norm": 0.008924663066864014, + "learning_rate": 9.962427492482026e-05, + "loss": 0.8359, + "step": 106 + }, + { + "epoch": 0.06867779204107831, + "grad_norm": 0.009173823520541191, + "learning_rate": 9.961144720684997e-05, + "loss": 0.7656, + "step": 107 + }, + { + "epoch": 0.0693196405648267, + "grad_norm": 0.009439458139240742, + "learning_rate": 9.959840502579108e-05, + "loss": 0.7578, + "step": 108 + }, + { + "epoch": 0.0699614890885751, + "grad_norm": 0.01025363802909851, + "learning_rate": 9.958514843802305e-05, + "loss": 0.8203, + "step": 109 + }, + { + "epoch": 0.07060333761232349, + "grad_norm": 0.01139143854379654, + "learning_rate": 9.957167750085217e-05, + "loss": 0.7891, + "step": 110 + }, + { + "epoch": 0.07124518613607189, + "grad_norm": 0.012185357511043549, + "learning_rate": 9.955799227251137e-05, + "loss": 0.75, + "step": 111 + }, + { + "epoch": 0.07188703465982028, + "grad_norm": 0.011034708470106125, + "learning_rate": 9.954409281215989e-05, + "loss": 0.8047, + "step": 112 + }, + { + "epoch": 0.07252888318356868, + "grad_norm": 0.009615590795874596, + "learning_rate": 9.952997917988308e-05, + "loss": 0.75, + "step": 113 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 0.010590028017759323, + "learning_rate": 9.951565143669213e-05, + "loss": 0.7422, + "step": 114 + }, + { + "epoch": 0.07381258023106547, + "grad_norm": 0.010860998183488846, + "learning_rate": 9.950110964452382e-05, + "loss": 0.7188, + "step": 115 + }, + { + "epoch": 0.07445442875481387, + "grad_norm": 0.010860625654459, + "learning_rate": 9.948635386624016e-05, + "loss": 0.7305, + "step": 116 + }, + { + "epoch": 0.07509627727856226, + "grad_norm": 0.011553982272744179, + "learning_rate": 9.947138416562826e-05, + "loss": 0.7617, + "step": 117 + }, + { + "epoch": 0.07573812580231065, + "grad_norm": 0.012888479046523571, + "learning_rate": 9.945620060739999e-05, + "loss": 0.6953, + "step": 118 + }, + { + "epoch": 0.07637997432605904, + "grad_norm": 0.01304571982473135, + "learning_rate": 9.944080325719163e-05, + "loss": 0.7461, + "step": 119 + }, + { + "epoch": 0.07702182284980745, + "grad_norm": 0.01109766960144043, + "learning_rate": 9.94251921815637e-05, + "loss": 0.7344, + "step": 120 + }, + { + "epoch": 0.07766367137355584, + "grad_norm": 0.010833417065441608, + "learning_rate": 9.940936744800064e-05, + "loss": 0.7148, + "step": 121 + }, + { + "epoch": 0.07830551989730423, + "grad_norm": 0.012143843807280064, + "learning_rate": 9.939332912491043e-05, + "loss": 0.7578, + "step": 122 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 0.01161271333694458, + "learning_rate": 9.937707728162448e-05, + "loss": 0.7617, + "step": 123 + }, + { + "epoch": 0.07958921694480103, + "grad_norm": 0.011860181577503681, + "learning_rate": 9.93606119883971e-05, + "loss": 0.7422, + "step": 124 + }, + { + "epoch": 0.08023106546854943, + "grad_norm": 0.009615721181035042, + "learning_rate": 9.934393331640536e-05, + "loss": 0.668, + "step": 125 + }, + { + "epoch": 0.08087291399229782, + "grad_norm": 0.01393971312791109, + "learning_rate": 9.932704133774877e-05, + "loss": 0.6953, + "step": 126 + }, + { + "epoch": 0.08151476251604621, + "grad_norm": 0.012269717641174793, + "learning_rate": 9.93099361254489e-05, + "loss": 0.7266, + "step": 127 + }, + { + "epoch": 0.0821566110397946, + "grad_norm": 0.014152932912111282, + "learning_rate": 9.929261775344909e-05, + "loss": 0.6562, + "step": 128 + }, + { + "epoch": 0.08279845956354301, + "grad_norm": 0.011507104150950909, + "learning_rate": 9.927508629661413e-05, + "loss": 0.7266, + "step": 129 + }, + { + "epoch": 0.0834403080872914, + "grad_norm": 0.014678554609417915, + "learning_rate": 9.925734183073001e-05, + "loss": 0.7109, + "step": 130 + }, + { + "epoch": 0.0840821566110398, + "grad_norm": 0.01359044574201107, + "learning_rate": 9.923938443250345e-05, + "loss": 0.6875, + "step": 131 + }, + { + "epoch": 0.08472400513478819, + "grad_norm": 0.012736028991639614, + "learning_rate": 9.922121417956168e-05, + "loss": 0.6641, + "step": 132 + }, + { + "epoch": 0.08536585365853659, + "grad_norm": 0.01216636598110199, + "learning_rate": 9.920283115045206e-05, + "loss": 0.7031, + "step": 133 + }, + { + "epoch": 0.08600770218228498, + "grad_norm": 0.011741497553884983, + "learning_rate": 9.918423542464177e-05, + "loss": 0.6719, + "step": 134 + }, + { + "epoch": 0.08664955070603338, + "grad_norm": 0.010003621689975262, + "learning_rate": 9.916542708251745e-05, + "loss": 0.6719, + "step": 135 + }, + { + "epoch": 0.08729139922978177, + "grad_norm": 0.012072731740772724, + "learning_rate": 9.91464062053848e-05, + "loss": 0.7188, + "step": 136 + }, + { + "epoch": 0.08793324775353016, + "grad_norm": 0.010896646417677402, + "learning_rate": 9.912717287546835e-05, + "loss": 0.7031, + "step": 137 + }, + { + "epoch": 0.08857509627727857, + "grad_norm": 0.011950834654271603, + "learning_rate": 9.910772717591097e-05, + "loss": 0.6836, + "step": 138 + }, + { + "epoch": 0.08921694480102696, + "grad_norm": 0.01246932614594698, + "learning_rate": 9.90880691907736e-05, + "loss": 0.6797, + "step": 139 + }, + { + "epoch": 0.08985879332477535, + "grad_norm": 0.013044403865933418, + "learning_rate": 9.906819900503486e-05, + "loss": 0.7109, + "step": 140 + }, + { + "epoch": 0.09050064184852374, + "grad_norm": 0.012423249892890453, + "learning_rate": 9.904811670459067e-05, + "loss": 0.6562, + "step": 141 + }, + { + "epoch": 0.09114249037227215, + "grad_norm": 0.010445546358823776, + "learning_rate": 9.902782237625393e-05, + "loss": 0.6211, + "step": 142 + }, + { + "epoch": 0.09178433889602054, + "grad_norm": 0.012769351713359356, + "learning_rate": 9.900731610775405e-05, + "loss": 0.6602, + "step": 143 + }, + { + "epoch": 0.09242618741976893, + "grad_norm": 0.01104013156145811, + "learning_rate": 9.898659798773667e-05, + "loss": 0.6875, + "step": 144 + }, + { + "epoch": 0.09306803594351733, + "grad_norm": 0.009402669034898281, + "learning_rate": 9.89656681057632e-05, + "loss": 0.668, + "step": 145 + }, + { + "epoch": 0.09370988446726572, + "grad_norm": 0.008899720385670662, + "learning_rate": 9.894452655231051e-05, + "loss": 0.6406, + "step": 146 + }, + { + "epoch": 0.09435173299101413, + "grad_norm": 0.009158738888800144, + "learning_rate": 9.892317341877045e-05, + "loss": 0.6484, + "step": 147 + }, + { + "epoch": 0.09499358151476252, + "grad_norm": 0.010559624060988426, + "learning_rate": 9.890160879744951e-05, + "loss": 0.6484, + "step": 148 + }, + { + "epoch": 0.09563543003851091, + "grad_norm": 0.010568162426352501, + "learning_rate": 9.887983278156842e-05, + "loss": 0.6406, + "step": 149 + }, + { + "epoch": 0.0962772785622593, + "grad_norm": 0.00897563062608242, + "learning_rate": 9.885784546526177e-05, + "loss": 0.6328, + "step": 150 + }, + { + "epoch": 0.09691912708600771, + "grad_norm": 0.009579532779753208, + "learning_rate": 9.88356469435775e-05, + "loss": 0.6367, + "step": 151 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 0.010554454289376736, + "learning_rate": 9.881323731247663e-05, + "loss": 0.6289, + "step": 152 + }, + { + "epoch": 0.0982028241335045, + "grad_norm": 0.009940484538674355, + "learning_rate": 9.879061666883272e-05, + "loss": 0.6016, + "step": 153 + }, + { + "epoch": 0.09884467265725289, + "grad_norm": 0.010707387700676918, + "learning_rate": 9.876778511043153e-05, + "loss": 0.6641, + "step": 154 + }, + { + "epoch": 0.09948652118100128, + "grad_norm": 0.009775303304195404, + "learning_rate": 9.874474273597059e-05, + "loss": 0.6406, + "step": 155 + }, + { + "epoch": 0.10012836970474968, + "grad_norm": 0.011378953233361244, + "learning_rate": 9.872148964505872e-05, + "loss": 0.6719, + "step": 156 + }, + { + "epoch": 0.10077021822849808, + "grad_norm": 0.009989460930228233, + "learning_rate": 9.869802593821568e-05, + "loss": 0.6641, + "step": 157 + }, + { + "epoch": 0.10141206675224647, + "grad_norm": 0.011383592151105404, + "learning_rate": 9.867435171687168e-05, + "loss": 0.625, + "step": 158 + }, + { + "epoch": 0.10205391527599486, + "grad_norm": 0.011871813796460629, + "learning_rate": 9.865046708336691e-05, + "loss": 0.6602, + "step": 159 + }, + { + "epoch": 0.10269576379974327, + "grad_norm": 0.009828433394432068, + "learning_rate": 9.862637214095121e-05, + "loss": 0.625, + "step": 160 + }, + { + "epoch": 0.10333761232349166, + "grad_norm": 0.010281259194016457, + "learning_rate": 9.860206699378349e-05, + "loss": 0.6328, + "step": 161 + }, + { + "epoch": 0.10397946084724005, + "grad_norm": 0.013299995101988316, + "learning_rate": 9.85775517469314e-05, + "loss": 0.625, + "step": 162 + }, + { + "epoch": 0.10462130937098844, + "grad_norm": 0.009967153891921043, + "learning_rate": 9.855282650637079e-05, + "loss": 0.5977, + "step": 163 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.011748317629098892, + "learning_rate": 9.852789137898526e-05, + "loss": 0.6133, + "step": 164 + }, + { + "epoch": 0.10590500641848524, + "grad_norm": 0.00852824654430151, + "learning_rate": 9.85027464725658e-05, + "loss": 0.6094, + "step": 165 + }, + { + "epoch": 0.10654685494223363, + "grad_norm": 0.011019607074558735, + "learning_rate": 9.847739189581013e-05, + "loss": 0.6445, + "step": 166 + }, + { + "epoch": 0.10718870346598203, + "grad_norm": 0.011637663468718529, + "learning_rate": 9.845182775832244e-05, + "loss": 0.625, + "step": 167 + }, + { + "epoch": 0.10783055198973042, + "grad_norm": 0.011865214444696903, + "learning_rate": 9.842605417061282e-05, + "loss": 0.6484, + "step": 168 + }, + { + "epoch": 0.10847240051347883, + "grad_norm": 0.011490019969642162, + "learning_rate": 9.840007124409669e-05, + "loss": 0.625, + "step": 169 + }, + { + "epoch": 0.10911424903722722, + "grad_norm": 0.010043886490166187, + "learning_rate": 9.837387909109452e-05, + "loss": 0.6094, + "step": 170 + }, + { + "epoch": 0.10975609756097561, + "grad_norm": 0.01057278923690319, + "learning_rate": 9.834747782483113e-05, + "loss": 0.6328, + "step": 171 + }, + { + "epoch": 0.110397946084724, + "grad_norm": 0.013821721076965332, + "learning_rate": 9.832086755943543e-05, + "loss": 0.625, + "step": 172 + }, + { + "epoch": 0.1110397946084724, + "grad_norm": 0.011414537206292152, + "learning_rate": 9.82940484099397e-05, + "loss": 0.5938, + "step": 173 + }, + { + "epoch": 0.1116816431322208, + "grad_norm": 0.011716284789144993, + "learning_rate": 9.826702049227925e-05, + "loss": 0.6484, + "step": 174 + }, + { + "epoch": 0.1123234916559692, + "grad_norm": 0.009901394136250019, + "learning_rate": 9.823978392329183e-05, + "loss": 0.6016, + "step": 175 + }, + { + "epoch": 0.11296534017971759, + "grad_norm": 0.011477384716272354, + "learning_rate": 9.821233882071717e-05, + "loss": 0.625, + "step": 176 + }, + { + "epoch": 0.11360718870346598, + "grad_norm": 0.011600608006119728, + "learning_rate": 9.81846853031965e-05, + "loss": 0.6094, + "step": 177 + }, + { + "epoch": 0.11424903722721438, + "grad_norm": 0.011693180538713932, + "learning_rate": 9.815682349027193e-05, + "loss": 0.6328, + "step": 178 + }, + { + "epoch": 0.11489088575096278, + "grad_norm": 0.013446972705423832, + "learning_rate": 9.812875350238604e-05, + "loss": 0.582, + "step": 179 + }, + { + "epoch": 0.11553273427471117, + "grad_norm": 0.012557251378893852, + "learning_rate": 9.810047546088133e-05, + "loss": 0.6016, + "step": 180 + }, + { + "epoch": 0.11617458279845956, + "grad_norm": 0.010684821754693985, + "learning_rate": 9.807198948799968e-05, + "loss": 0.6133, + "step": 181 + }, + { + "epoch": 0.11681643132220795, + "grad_norm": 0.011835362762212753, + "learning_rate": 9.804329570688177e-05, + "loss": 0.6133, + "step": 182 + }, + { + "epoch": 0.11745827984595636, + "grad_norm": 0.016150448471307755, + "learning_rate": 9.801439424156672e-05, + "loss": 0.6602, + "step": 183 + }, + { + "epoch": 0.11810012836970475, + "grad_norm": 0.00978788174688816, + "learning_rate": 9.798528521699132e-05, + "loss": 0.582, + "step": 184 + }, + { + "epoch": 0.11874197689345314, + "grad_norm": 0.012773280963301659, + "learning_rate": 9.795596875898967e-05, + "loss": 0.6094, + "step": 185 + }, + { + "epoch": 0.11938382541720154, + "grad_norm": 0.013691268861293793, + "learning_rate": 9.792644499429259e-05, + "loss": 0.6016, + "step": 186 + }, + { + "epoch": 0.12002567394094994, + "grad_norm": 0.012030140496790409, + "learning_rate": 9.789671405052701e-05, + "loss": 0.5703, + "step": 187 + }, + { + "epoch": 0.12066752246469833, + "grad_norm": 0.01144426316022873, + "learning_rate": 9.786677605621547e-05, + "loss": 0.5469, + "step": 188 + }, + { + "epoch": 0.12130937098844673, + "grad_norm": 0.010707143694162369, + "learning_rate": 9.783663114077562e-05, + "loss": 0.6172, + "step": 189 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 0.011395948939025402, + "learning_rate": 9.78062794345195e-05, + "loss": 0.543, + "step": 190 + }, + { + "epoch": 0.12259306803594351, + "grad_norm": 0.010497979819774628, + "learning_rate": 9.777572106865318e-05, + "loss": 0.5859, + "step": 191 + }, + { + "epoch": 0.12323491655969192, + "grad_norm": 0.011176045052707195, + "learning_rate": 9.774495617527603e-05, + "loss": 0.5781, + "step": 192 + }, + { + "epoch": 0.12387676508344031, + "grad_norm": 0.009919974952936172, + "learning_rate": 9.771398488738022e-05, + "loss": 0.5938, + "step": 193 + }, + { + "epoch": 0.1245186136071887, + "grad_norm": 0.009419148787856102, + "learning_rate": 9.768280733885014e-05, + "loss": 0.5508, + "step": 194 + }, + { + "epoch": 0.1251604621309371, + "grad_norm": 0.011592664755880833, + "learning_rate": 9.765142366446178e-05, + "loss": 0.5664, + "step": 195 + }, + { + "epoch": 0.1258023106546855, + "grad_norm": 0.01011605840176344, + "learning_rate": 9.761983399988223e-05, + "loss": 0.582, + "step": 196 + }, + { + "epoch": 0.12644415917843388, + "grad_norm": 0.011448321864008904, + "learning_rate": 9.758803848166904e-05, + "loss": 0.5625, + "step": 197 + }, + { + "epoch": 0.12708600770218229, + "grad_norm": 0.011436822824180126, + "learning_rate": 9.75560372472696e-05, + "loss": 0.5781, + "step": 198 + }, + { + "epoch": 0.1277278562259307, + "grad_norm": 0.01132872048765421, + "learning_rate": 9.752383043502063e-05, + "loss": 0.5938, + "step": 199 + }, + { + "epoch": 0.12836970474967907, + "grad_norm": 0.010940883308649063, + "learning_rate": 9.749141818414749e-05, + "loss": 0.543, + "step": 200 + }, + { + "epoch": 0.12901155327342748, + "grad_norm": 0.011073155328631401, + "learning_rate": 9.745880063476362e-05, + "loss": 0.5469, + "step": 201 + }, + { + "epoch": 0.12965340179717585, + "grad_norm": 0.01133064553141594, + "learning_rate": 9.742597792786999e-05, + "loss": 0.5859, + "step": 202 + }, + { + "epoch": 0.13029525032092426, + "grad_norm": 0.011374837718904018, + "learning_rate": 9.739295020535437e-05, + "loss": 0.5508, + "step": 203 + }, + { + "epoch": 0.13093709884467267, + "grad_norm": 0.013687084428966045, + "learning_rate": 9.735971760999083e-05, + "loss": 0.5859, + "step": 204 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.010709963738918304, + "learning_rate": 9.732628028543906e-05, + "loss": 0.5312, + "step": 205 + }, + { + "epoch": 0.13222079589216945, + "grad_norm": 0.012992851436138153, + "learning_rate": 9.729263837624376e-05, + "loss": 0.5938, + "step": 206 + }, + { + "epoch": 0.13286264441591783, + "grad_norm": 0.011751007288694382, + "learning_rate": 9.725879202783401e-05, + "loss": 0.5859, + "step": 207 + }, + { + "epoch": 0.13350449293966624, + "grad_norm": 0.01132382545620203, + "learning_rate": 9.722474138652267e-05, + "loss": 0.582, + "step": 208 + }, + { + "epoch": 0.13414634146341464, + "grad_norm": 0.011845381930470467, + "learning_rate": 9.719048659950573e-05, + "loss": 0.5625, + "step": 209 + }, + { + "epoch": 0.13478818998716302, + "grad_norm": 0.011076336726546288, + "learning_rate": 9.715602781486166e-05, + "loss": 0.5703, + "step": 210 + }, + { + "epoch": 0.13543003851091143, + "grad_norm": 0.01224234327673912, + "learning_rate": 9.712136518155079e-05, + "loss": 0.5625, + "step": 211 + }, + { + "epoch": 0.13607188703465983, + "grad_norm": 0.012131668627262115, + "learning_rate": 9.708649884941466e-05, + "loss": 0.5547, + "step": 212 + }, + { + "epoch": 0.1367137355584082, + "grad_norm": 0.014434052631258965, + "learning_rate": 9.705142896917534e-05, + "loss": 0.5625, + "step": 213 + }, + { + "epoch": 0.13735558408215662, + "grad_norm": 0.013702210038900375, + "learning_rate": 9.701615569243485e-05, + "loss": 0.5781, + "step": 214 + }, + { + "epoch": 0.137997432605905, + "grad_norm": 0.01172378659248352, + "learning_rate": 9.698067917167446e-05, + "loss": 0.5703, + "step": 215 + }, + { + "epoch": 0.1386392811296534, + "grad_norm": 0.01423399057239294, + "learning_rate": 9.6944999560254e-05, + "loss": 0.5352, + "step": 216 + }, + { + "epoch": 0.1392811296534018, + "grad_norm": 0.013110863976180553, + "learning_rate": 9.690911701241128e-05, + "loss": 0.5625, + "step": 217 + }, + { + "epoch": 0.1399229781771502, + "grad_norm": 0.013581229373812675, + "learning_rate": 9.687303168326133e-05, + "loss": 0.5312, + "step": 218 + }, + { + "epoch": 0.1405648267008986, + "grad_norm": 0.010977387428283691, + "learning_rate": 9.683674372879579e-05, + "loss": 0.5391, + "step": 219 + }, + { + "epoch": 0.14120667522464697, + "grad_norm": 0.010781140998005867, + "learning_rate": 9.680025330588223e-05, + "loss": 0.5664, + "step": 220 + }, + { + "epoch": 0.14184852374839538, + "grad_norm": 0.010498926043510437, + "learning_rate": 9.676356057226346e-05, + "loss": 0.5234, + "step": 221 + }, + { + "epoch": 0.14249037227214378, + "grad_norm": 0.017316415905952454, + "learning_rate": 9.672666568655684e-05, + "loss": 0.5859, + "step": 222 + }, + { + "epoch": 0.14313222079589216, + "grad_norm": 0.010606053285300732, + "learning_rate": 9.66895688082536e-05, + "loss": 0.5859, + "step": 223 + }, + { + "epoch": 0.14377406931964057, + "grad_norm": 0.010790948756039143, + "learning_rate": 9.665227009771815e-05, + "loss": 0.5469, + "step": 224 + }, + { + "epoch": 0.14441591784338895, + "grad_norm": 0.016792552545666695, + "learning_rate": 9.661476971618744e-05, + "loss": 0.5625, + "step": 225 + }, + { + "epoch": 0.14505776636713735, + "grad_norm": 0.011027633212506771, + "learning_rate": 9.657706782577017e-05, + "loss": 0.5703, + "step": 226 + }, + { + "epoch": 0.14569961489088576, + "grad_norm": 0.012505598366260529, + "learning_rate": 9.653916458944612e-05, + "loss": 0.5664, + "step": 227 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 0.010994997806847095, + "learning_rate": 9.650106017106548e-05, + "loss": 0.5312, + "step": 228 + }, + { + "epoch": 0.14698331193838254, + "grad_norm": 0.019628558307886124, + "learning_rate": 9.646275473534817e-05, + "loss": 0.5312, + "step": 229 + }, + { + "epoch": 0.14762516046213095, + "grad_norm": 0.012861509807407856, + "learning_rate": 9.642424844788298e-05, + "loss": 0.5508, + "step": 230 + }, + { + "epoch": 0.14826700898587933, + "grad_norm": 0.014055141247808933, + "learning_rate": 9.638554147512702e-05, + "loss": 0.5625, + "step": 231 + }, + { + "epoch": 0.14890885750962773, + "grad_norm": 0.015493751503527164, + "learning_rate": 9.634663398440493e-05, + "loss": 0.543, + "step": 232 + }, + { + "epoch": 0.1495507060333761, + "grad_norm": 0.012860543094575405, + "learning_rate": 9.630752614390813e-05, + "loss": 0.4922, + "step": 233 + }, + { + "epoch": 0.15019255455712452, + "grad_norm": 0.01053924672305584, + "learning_rate": 9.626821812269415e-05, + "loss": 0.543, + "step": 234 + }, + { + "epoch": 0.15083440308087293, + "grad_norm": 0.011286425404250622, + "learning_rate": 9.622871009068588e-05, + "loss": 0.5039, + "step": 235 + }, + { + "epoch": 0.1514762516046213, + "grad_norm": 0.012574701569974422, + "learning_rate": 9.618900221867077e-05, + "loss": 0.5391, + "step": 236 + }, + { + "epoch": 0.1521181001283697, + "grad_norm": 0.012315727770328522, + "learning_rate": 9.614909467830022e-05, + "loss": 0.5273, + "step": 237 + }, + { + "epoch": 0.1527599486521181, + "grad_norm": 0.010991916060447693, + "learning_rate": 9.610898764208873e-05, + "loss": 0.5469, + "step": 238 + }, + { + "epoch": 0.1534017971758665, + "grad_norm": 0.011746605858206749, + "learning_rate": 9.60686812834132e-05, + "loss": 0.5547, + "step": 239 + }, + { + "epoch": 0.1540436456996149, + "grad_norm": 0.011498944833874702, + "learning_rate": 9.602817577651217e-05, + "loss": 0.5586, + "step": 240 + }, + { + "epoch": 0.15468549422336328, + "grad_norm": 0.011620343662798405, + "learning_rate": 9.598747129648505e-05, + "loss": 0.5312, + "step": 241 + }, + { + "epoch": 0.15532734274711169, + "grad_norm": 0.010271632112562656, + "learning_rate": 9.594656801929145e-05, + "loss": 0.5586, + "step": 242 + }, + { + "epoch": 0.15596919127086006, + "grad_norm": 0.013585187494754791, + "learning_rate": 9.590546612175024e-05, + "loss": 0.5625, + "step": 243 + }, + { + "epoch": 0.15661103979460847, + "grad_norm": 0.012161128222942352, + "learning_rate": 9.586416578153903e-05, + "loss": 0.5508, + "step": 244 + }, + { + "epoch": 0.15725288831835688, + "grad_norm": 0.011398044414818287, + "learning_rate": 9.582266717719314e-05, + "loss": 0.5703, + "step": 245 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.012302602641284466, + "learning_rate": 9.578097048810503e-05, + "loss": 0.5469, + "step": 246 + }, + { + "epoch": 0.15853658536585366, + "grad_norm": 0.012995855882763863, + "learning_rate": 9.573907589452347e-05, + "loss": 0.5195, + "step": 247 + }, + { + "epoch": 0.15917843388960207, + "grad_norm": 0.011635944247245789, + "learning_rate": 9.569698357755267e-05, + "loss": 0.5273, + "step": 248 + }, + { + "epoch": 0.15982028241335045, + "grad_norm": 0.011981082148849964, + "learning_rate": 9.565469371915164e-05, + "loss": 0.5234, + "step": 249 + }, + { + "epoch": 0.16046213093709885, + "grad_norm": 0.013345803134143353, + "learning_rate": 9.561220650213326e-05, + "loss": 0.5547, + "step": 250 + }, + { + "epoch": 0.16110397946084723, + "grad_norm": 0.010194964706897736, + "learning_rate": 9.556952211016366e-05, + "loss": 0.4805, + "step": 251 + }, + { + "epoch": 0.16174582798459564, + "grad_norm": 0.010166475549340248, + "learning_rate": 9.552664072776123e-05, + "loss": 0.4844, + "step": 252 + }, + { + "epoch": 0.16238767650834404, + "grad_norm": 0.011069612577557564, + "learning_rate": 9.548356254029598e-05, + "loss": 0.5, + "step": 253 + }, + { + "epoch": 0.16302952503209242, + "grad_norm": 0.014476699754595757, + "learning_rate": 9.544028773398867e-05, + "loss": 0.5352, + "step": 254 + }, + { + "epoch": 0.16367137355584083, + "grad_norm": 0.013018032535910606, + "learning_rate": 9.539681649591002e-05, + "loss": 0.5234, + "step": 255 + }, + { + "epoch": 0.1643132220795892, + "grad_norm": 0.009963922202587128, + "learning_rate": 9.535314901397985e-05, + "loss": 0.5273, + "step": 256 + }, + { + "epoch": 0.1649550706033376, + "grad_norm": 0.011673172004520893, + "learning_rate": 9.530928547696639e-05, + "loss": 0.5586, + "step": 257 + }, + { + "epoch": 0.16559691912708602, + "grad_norm": 0.011493822559714317, + "learning_rate": 9.52652260744853e-05, + "loss": 0.5391, + "step": 258 + }, + { + "epoch": 0.1662387676508344, + "grad_norm": 0.015019495040178299, + "learning_rate": 9.522097099699903e-05, + "loss": 0.5234, + "step": 259 + }, + { + "epoch": 0.1668806161745828, + "grad_norm": 0.012797566130757332, + "learning_rate": 9.517652043581583e-05, + "loss": 0.5, + "step": 260 + }, + { + "epoch": 0.16752246469833118, + "grad_norm": 0.012790082022547722, + "learning_rate": 9.513187458308904e-05, + "loss": 0.5, + "step": 261 + }, + { + "epoch": 0.1681643132220796, + "grad_norm": 0.013730279169976711, + "learning_rate": 9.508703363181621e-05, + "loss": 0.5469, + "step": 262 + }, + { + "epoch": 0.168806161745828, + "grad_norm": 0.011931111104786396, + "learning_rate": 9.504199777583824e-05, + "loss": 0.5625, + "step": 263 + }, + { + "epoch": 0.16944801026957637, + "grad_norm": 0.012304815463721752, + "learning_rate": 9.499676720983864e-05, + "loss": 0.5234, + "step": 264 + }, + { + "epoch": 0.17008985879332478, + "grad_norm": 0.012368976138532162, + "learning_rate": 9.495134212934256e-05, + "loss": 0.5273, + "step": 265 + }, + { + "epoch": 0.17073170731707318, + "grad_norm": 0.01270821038633585, + "learning_rate": 9.490572273071603e-05, + "loss": 0.5234, + "step": 266 + }, + { + "epoch": 0.17137355584082156, + "grad_norm": 0.01076431479305029, + "learning_rate": 9.48599092111651e-05, + "loss": 0.5156, + "step": 267 + }, + { + "epoch": 0.17201540436456997, + "grad_norm": 0.013268671929836273, + "learning_rate": 9.481390176873498e-05, + "loss": 0.4961, + "step": 268 + }, + { + "epoch": 0.17265725288831835, + "grad_norm": 0.012573538348078728, + "learning_rate": 9.476770060230915e-05, + "loss": 0.4883, + "step": 269 + }, + { + "epoch": 0.17329910141206675, + "grad_norm": 0.011922149918973446, + "learning_rate": 9.472130591160855e-05, + "loss": 0.5625, + "step": 270 + }, + { + "epoch": 0.17394094993581516, + "grad_norm": 0.012727407738566399, + "learning_rate": 9.467471789719072e-05, + "loss": 0.5586, + "step": 271 + }, + { + "epoch": 0.17458279845956354, + "grad_norm": 0.01120324619114399, + "learning_rate": 9.462793676044885e-05, + "loss": 0.4902, + "step": 272 + }, + { + "epoch": 0.17522464698331194, + "grad_norm": 0.013182485476136208, + "learning_rate": 9.458096270361104e-05, + "loss": 0.5391, + "step": 273 + }, + { + "epoch": 0.17586649550706032, + "grad_norm": 0.010877820663154125, + "learning_rate": 9.453379592973928e-05, + "loss": 0.5234, + "step": 274 + }, + { + "epoch": 0.17650834403080873, + "grad_norm": 0.010631302371621132, + "learning_rate": 9.448643664272876e-05, + "loss": 0.5117, + "step": 275 + }, + { + "epoch": 0.17715019255455713, + "grad_norm": 0.011218084953725338, + "learning_rate": 9.443888504730673e-05, + "loss": 0.4707, + "step": 276 + }, + { + "epoch": 0.1777920410783055, + "grad_norm": 0.013051497749984264, + "learning_rate": 9.439114134903191e-05, + "loss": 0.4668, + "step": 277 + }, + { + "epoch": 0.17843388960205392, + "grad_norm": 0.014543252997100353, + "learning_rate": 9.434320575429334e-05, + "loss": 0.5469, + "step": 278 + }, + { + "epoch": 0.1790757381258023, + "grad_norm": 0.010615730658173561, + "learning_rate": 9.429507847030963e-05, + "loss": 0.5352, + "step": 279 + }, + { + "epoch": 0.1797175866495507, + "grad_norm": 0.014637456275522709, + "learning_rate": 9.424675970512808e-05, + "loss": 0.5039, + "step": 280 + }, + { + "epoch": 0.1803594351732991, + "grad_norm": 0.01204176526516676, + "learning_rate": 9.419824966762367e-05, + "loss": 0.5039, + "step": 281 + }, + { + "epoch": 0.1810012836970475, + "grad_norm": 0.013622627593576908, + "learning_rate": 9.414954856749828e-05, + "loss": 0.4883, + "step": 282 + }, + { + "epoch": 0.1816431322207959, + "grad_norm": 0.012876857072114944, + "learning_rate": 9.41006566152797e-05, + "loss": 0.5039, + "step": 283 + }, + { + "epoch": 0.1822849807445443, + "grad_norm": 0.011097314767539501, + "learning_rate": 9.405157402232072e-05, + "loss": 0.5234, + "step": 284 + }, + { + "epoch": 0.18292682926829268, + "grad_norm": 0.012606339529156685, + "learning_rate": 9.400230100079829e-05, + "loss": 0.5078, + "step": 285 + }, + { + "epoch": 0.18356867779204109, + "grad_norm": 0.013987897895276546, + "learning_rate": 9.395283776371253e-05, + "loss": 0.5156, + "step": 286 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.010777951218187809, + "learning_rate": 9.390318452488583e-05, + "loss": 0.5195, + "step": 287 + }, + { + "epoch": 0.18485237483953787, + "grad_norm": 0.012862320989370346, + "learning_rate": 9.385334149896195e-05, + "loss": 0.543, + "step": 288 + }, + { + "epoch": 0.18549422336328628, + "grad_norm": 0.013322705402970314, + "learning_rate": 9.380330890140505e-05, + "loss": 0.5156, + "step": 289 + }, + { + "epoch": 0.18613607188703465, + "grad_norm": 0.013143341057002544, + "learning_rate": 9.37530869484988e-05, + "loss": 0.5117, + "step": 290 + }, + { + "epoch": 0.18677792041078306, + "grad_norm": 0.012022369541227818, + "learning_rate": 9.37026758573454e-05, + "loss": 0.4707, + "step": 291 + }, + { + "epoch": 0.18741976893453144, + "grad_norm": 0.012463578954339027, + "learning_rate": 9.365207584586471e-05, + "loss": 0.5, + "step": 292 + }, + { + "epoch": 0.18806161745827984, + "grad_norm": 0.01562550477683544, + "learning_rate": 9.360128713279321e-05, + "loss": 0.5391, + "step": 293 + }, + { + "epoch": 0.18870346598202825, + "grad_norm": 0.01224803552031517, + "learning_rate": 9.355030993768314e-05, + "loss": 0.4922, + "step": 294 + }, + { + "epoch": 0.18934531450577663, + "grad_norm": 0.012806670740246773, + "learning_rate": 9.349914448090156e-05, + "loss": 0.5078, + "step": 295 + }, + { + "epoch": 0.18998716302952504, + "grad_norm": 0.014918792992830276, + "learning_rate": 9.344779098362926e-05, + "loss": 0.4902, + "step": 296 + }, + { + "epoch": 0.19062901155327341, + "grad_norm": 0.015880340710282326, + "learning_rate": 9.339624966785999e-05, + "loss": 0.4961, + "step": 297 + }, + { + "epoch": 0.19127086007702182, + "grad_norm": 0.016379036009311676, + "learning_rate": 9.334452075639938e-05, + "loss": 0.5117, + "step": 298 + }, + { + "epoch": 0.19191270860077023, + "grad_norm": 0.013551746495068073, + "learning_rate": 9.329260447286399e-05, + "loss": 0.5391, + "step": 299 + }, + { + "epoch": 0.1925545571245186, + "grad_norm": 0.012869910337030888, + "learning_rate": 9.32405010416804e-05, + "loss": 0.4961, + "step": 300 + }, + { + "epoch": 0.193196405648267, + "grad_norm": 0.01776876486837864, + "learning_rate": 9.318821068808418e-05, + "loss": 0.4805, + "step": 301 + }, + { + "epoch": 0.19383825417201542, + "grad_norm": 0.01472089160233736, + "learning_rate": 9.313573363811895e-05, + "loss": 0.5078, + "step": 302 + }, + { + "epoch": 0.1944801026957638, + "grad_norm": 0.012855904176831245, + "learning_rate": 9.308307011863537e-05, + "loss": 0.5, + "step": 303 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 0.015342450700700283, + "learning_rate": 9.303022035729019e-05, + "loss": 0.5156, + "step": 304 + }, + { + "epoch": 0.19576379974326058, + "grad_norm": 0.014027988538146019, + "learning_rate": 9.297718458254528e-05, + "loss": 0.4922, + "step": 305 + }, + { + "epoch": 0.196405648267009, + "grad_norm": 0.014229445718228817, + "learning_rate": 9.29239630236666e-05, + "loss": 0.4922, + "step": 306 + }, + { + "epoch": 0.1970474967907574, + "grad_norm": 0.011061770841479301, + "learning_rate": 9.287055591072319e-05, + "loss": 0.457, + "step": 307 + }, + { + "epoch": 0.19768934531450577, + "grad_norm": 0.012393632903695107, + "learning_rate": 9.28169634745863e-05, + "loss": 0.5078, + "step": 308 + }, + { + "epoch": 0.19833119383825418, + "grad_norm": 0.014692421071231365, + "learning_rate": 9.276318594692822e-05, + "loss": 0.4707, + "step": 309 + }, + { + "epoch": 0.19897304236200256, + "grad_norm": 0.013025031425058842, + "learning_rate": 9.270922356022142e-05, + "loss": 0.4883, + "step": 310 + }, + { + "epoch": 0.19961489088575096, + "grad_norm": 0.011297275312244892, + "learning_rate": 9.265507654773747e-05, + "loss": 0.5, + "step": 311 + }, + { + "epoch": 0.20025673940949937, + "grad_norm": 0.012788046151399612, + "learning_rate": 9.260074514354603e-05, + "loss": 0.5, + "step": 312 + }, + { + "epoch": 0.20089858793324775, + "grad_norm": 0.012072072364389896, + "learning_rate": 9.254622958251389e-05, + "loss": 0.4902, + "step": 313 + }, + { + "epoch": 0.20154043645699615, + "grad_norm": 0.011217731982469559, + "learning_rate": 9.249153010030391e-05, + "loss": 0.5, + "step": 314 + }, + { + "epoch": 0.20218228498074453, + "grad_norm": 0.015587392263114452, + "learning_rate": 9.243664693337404e-05, + "loss": 0.4766, + "step": 315 + }, + { + "epoch": 0.20282413350449294, + "grad_norm": 0.014844152145087719, + "learning_rate": 9.23815803189762e-05, + "loss": 0.4668, + "step": 316 + }, + { + "epoch": 0.20346598202824134, + "grad_norm": 0.012265946716070175, + "learning_rate": 9.232633049515541e-05, + "loss": 0.5078, + "step": 317 + }, + { + "epoch": 0.20410783055198972, + "grad_norm": 0.016097310930490494, + "learning_rate": 9.227089770074864e-05, + "loss": 0.4746, + "step": 318 + }, + { + "epoch": 0.20474967907573813, + "grad_norm": 0.013965766876935959, + "learning_rate": 9.22152821753838e-05, + "loss": 0.4766, + "step": 319 + }, + { + "epoch": 0.20539152759948653, + "grad_norm": 0.012059083208441734, + "learning_rate": 9.215948415947875e-05, + "loss": 0.5078, + "step": 320 + }, + { + "epoch": 0.2060333761232349, + "grad_norm": 0.013087877072393894, + "learning_rate": 9.210350389424021e-05, + "loss": 0.5273, + "step": 321 + }, + { + "epoch": 0.20667522464698332, + "grad_norm": 0.016379237174987793, + "learning_rate": 9.204734162166276e-05, + "loss": 0.5039, + "step": 322 + }, + { + "epoch": 0.2073170731707317, + "grad_norm": 0.014362157322466373, + "learning_rate": 9.199099758452774e-05, + "loss": 0.4883, + "step": 323 + }, + { + "epoch": 0.2079589216944801, + "grad_norm": 0.017146103084087372, + "learning_rate": 9.193447202640226e-05, + "loss": 0.5, + "step": 324 + }, + { + "epoch": 0.2086007702182285, + "grad_norm": 0.011025834828615189, + "learning_rate": 9.187776519163811e-05, + "loss": 0.4648, + "step": 325 + }, + { + "epoch": 0.2092426187419769, + "grad_norm": 0.013537570834159851, + "learning_rate": 9.182087732537068e-05, + "loss": 0.4922, + "step": 326 + }, + { + "epoch": 0.2098844672657253, + "grad_norm": 0.012621462345123291, + "learning_rate": 9.176380867351801e-05, + "loss": 0.5078, + "step": 327 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.013818283565342426, + "learning_rate": 9.170655948277956e-05, + "loss": 0.4922, + "step": 328 + }, + { + "epoch": 0.21116816431322208, + "grad_norm": 0.011845164000988007, + "learning_rate": 9.164913000063529e-05, + "loss": 0.5195, + "step": 329 + }, + { + "epoch": 0.21181001283697048, + "grad_norm": 0.014658152125775814, + "learning_rate": 9.159152047534454e-05, + "loss": 0.4883, + "step": 330 + }, + { + "epoch": 0.21245186136071886, + "grad_norm": 0.013154874555766582, + "learning_rate": 9.153373115594489e-05, + "loss": 0.5312, + "step": 331 + }, + { + "epoch": 0.21309370988446727, + "grad_norm": 0.013952475972473621, + "learning_rate": 9.147576229225121e-05, + "loss": 0.4727, + "step": 332 + }, + { + "epoch": 0.21373555840821565, + "grad_norm": 0.012911539524793625, + "learning_rate": 9.14176141348545e-05, + "loss": 0.5156, + "step": 333 + }, + { + "epoch": 0.21437740693196405, + "grad_norm": 0.012119744904339314, + "learning_rate": 9.135928693512077e-05, + "loss": 0.4922, + "step": 334 + }, + { + "epoch": 0.21501925545571246, + "grad_norm": 0.013211081735789776, + "learning_rate": 9.130078094519008e-05, + "loss": 0.5078, + "step": 335 + }, + { + "epoch": 0.21566110397946084, + "grad_norm": 0.010815898887813091, + "learning_rate": 9.12420964179753e-05, + "loss": 0.4688, + "step": 336 + }, + { + "epoch": 0.21630295250320924, + "grad_norm": 0.011093908920884132, + "learning_rate": 9.118323360716117e-05, + "loss": 0.4883, + "step": 337 + }, + { + "epoch": 0.21694480102695765, + "grad_norm": 0.011569825932383537, + "learning_rate": 9.112419276720303e-05, + "loss": 0.4727, + "step": 338 + }, + { + "epoch": 0.21758664955070603, + "grad_norm": 0.01549893245100975, + "learning_rate": 9.106497415332591e-05, + "loss": 0.4688, + "step": 339 + }, + { + "epoch": 0.21822849807445444, + "grad_norm": 0.012803218327462673, + "learning_rate": 9.100557802152328e-05, + "loss": 0.5078, + "step": 340 + }, + { + "epoch": 0.21887034659820281, + "grad_norm": 0.015469119884073734, + "learning_rate": 9.094600462855598e-05, + "loss": 0.5156, + "step": 341 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 0.013518411666154861, + "learning_rate": 9.088625423195115e-05, + "loss": 0.4766, + "step": 342 + }, + { + "epoch": 0.22015404364569963, + "grad_norm": 0.01218617707490921, + "learning_rate": 9.08263270900011e-05, + "loss": 0.4824, + "step": 343 + }, + { + "epoch": 0.220795892169448, + "grad_norm": 0.011522636748850346, + "learning_rate": 9.076622346176216e-05, + "loss": 0.4395, + "step": 344 + }, + { + "epoch": 0.2214377406931964, + "grad_norm": 0.01178770512342453, + "learning_rate": 9.070594360705358e-05, + "loss": 0.4531, + "step": 345 + }, + { + "epoch": 0.2220795892169448, + "grad_norm": 0.012270083650946617, + "learning_rate": 9.064548778645646e-05, + "loss": 0.4648, + "step": 346 + }, + { + "epoch": 0.2227214377406932, + "grad_norm": 0.012514191679656506, + "learning_rate": 9.058485626131252e-05, + "loss": 0.4766, + "step": 347 + }, + { + "epoch": 0.2233632862644416, + "grad_norm": 0.01187925785779953, + "learning_rate": 9.052404929372305e-05, + "loss": 0.4609, + "step": 348 + }, + { + "epoch": 0.22400513478818998, + "grad_norm": 0.013160323724150658, + "learning_rate": 9.046306714654775e-05, + "loss": 0.4609, + "step": 349 + }, + { + "epoch": 0.2246469833119384, + "grad_norm": 0.01244362536817789, + "learning_rate": 9.04019100834036e-05, + "loss": 0.4609, + "step": 350 + }, + { + "epoch": 0.22528883183568676, + "grad_norm": 0.01528861839324236, + "learning_rate": 9.03405783686637e-05, + "loss": 0.4844, + "step": 351 + }, + { + "epoch": 0.22593068035943517, + "grad_norm": 0.010898303240537643, + "learning_rate": 9.027907226745616e-05, + "loss": 0.4785, + "step": 352 + }, + { + "epoch": 0.22657252888318358, + "grad_norm": 0.013835811987519264, + "learning_rate": 9.021739204566295e-05, + "loss": 0.4805, + "step": 353 + }, + { + "epoch": 0.22721437740693196, + "grad_norm": 0.012497610412538052, + "learning_rate": 9.015553796991869e-05, + "loss": 0.4922, + "step": 354 + }, + { + "epoch": 0.22785622593068036, + "grad_norm": 0.011201409623026848, + "learning_rate": 9.009351030760958e-05, + "loss": 0.4648, + "step": 355 + }, + { + "epoch": 0.22849807445442877, + "grad_norm": 0.015274932608008385, + "learning_rate": 9.003130932687223e-05, + "loss": 0.4883, + "step": 356 + }, + { + "epoch": 0.22913992297817715, + "grad_norm": 0.013798701576888561, + "learning_rate": 8.996893529659243e-05, + "loss": 0.4766, + "step": 357 + }, + { + "epoch": 0.22978177150192555, + "grad_norm": 0.013277494348585606, + "learning_rate": 8.990638848640407e-05, + "loss": 0.457, + "step": 358 + }, + { + "epoch": 0.23042362002567393, + "grad_norm": 0.013865608721971512, + "learning_rate": 8.984366916668795e-05, + "loss": 0.4824, + "step": 359 + }, + { + "epoch": 0.23106546854942234, + "grad_norm": 0.01533426996320486, + "learning_rate": 8.978077760857058e-05, + "loss": 0.4609, + "step": 360 + }, + { + "epoch": 0.23170731707317074, + "grad_norm": 0.013055570423603058, + "learning_rate": 8.971771408392302e-05, + "loss": 0.4629, + "step": 361 + }, + { + "epoch": 0.23234916559691912, + "grad_norm": 0.011497069150209427, + "learning_rate": 8.965447886535978e-05, + "loss": 0.4727, + "step": 362 + }, + { + "epoch": 0.23299101412066753, + "grad_norm": 0.01564948260784149, + "learning_rate": 8.95910722262375e-05, + "loss": 0.4863, + "step": 363 + }, + { + "epoch": 0.2336328626444159, + "grad_norm": 0.010947753675282001, + "learning_rate": 8.95274944406539e-05, + "loss": 0.4512, + "step": 364 + }, + { + "epoch": 0.2342747111681643, + "grad_norm": 0.011532683856785297, + "learning_rate": 8.946374578344653e-05, + "loss": 0.4551, + "step": 365 + }, + { + "epoch": 0.23491655969191272, + "grad_norm": 0.012891596183180809, + "learning_rate": 8.939982653019157e-05, + "loss": 0.4883, + "step": 366 + }, + { + "epoch": 0.2355584082156611, + "grad_norm": 0.014495361596345901, + "learning_rate": 8.933573695720267e-05, + "loss": 0.4727, + "step": 367 + }, + { + "epoch": 0.2362002567394095, + "grad_norm": 0.01764221116900444, + "learning_rate": 8.927147734152978e-05, + "loss": 0.4766, + "step": 368 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.014389213174581528, + "learning_rate": 8.920704796095788e-05, + "loss": 0.4922, + "step": 369 + }, + { + "epoch": 0.2374839537869063, + "grad_norm": 0.01599840074777603, + "learning_rate": 8.914244909400585e-05, + "loss": 0.4551, + "step": 370 + }, + { + "epoch": 0.2381258023106547, + "grad_norm": 0.011929795145988464, + "learning_rate": 8.90776810199252e-05, + "loss": 0.4805, + "step": 371 + }, + { + "epoch": 0.23876765083440307, + "grad_norm": 0.011343288235366344, + "learning_rate": 8.901274401869893e-05, + "loss": 0.4805, + "step": 372 + }, + { + "epoch": 0.23940949935815148, + "grad_norm": 0.014292542822659016, + "learning_rate": 8.894763837104026e-05, + "loss": 0.4883, + "step": 373 + }, + { + "epoch": 0.24005134788189988, + "grad_norm": 0.013415781781077385, + "learning_rate": 8.888236435839147e-05, + "loss": 0.4766, + "step": 374 + }, + { + "epoch": 0.24069319640564826, + "grad_norm": 0.013569642789661884, + "learning_rate": 8.881692226292269e-05, + "loss": 0.5, + "step": 375 + }, + { + "epoch": 0.24133504492939667, + "grad_norm": 0.011331136338412762, + "learning_rate": 8.875131236753051e-05, + "loss": 0.4609, + "step": 376 + }, + { + "epoch": 0.24197689345314505, + "grad_norm": 0.013599894009530544, + "learning_rate": 8.868553495583707e-05, + "loss": 0.4766, + "step": 377 + }, + { + "epoch": 0.24261874197689345, + "grad_norm": 0.011140760965645313, + "learning_rate": 8.861959031218855e-05, + "loss": 0.4746, + "step": 378 + }, + { + "epoch": 0.24326059050064186, + "grad_norm": 0.01980205811560154, + "learning_rate": 8.855347872165406e-05, + "loss": 0.4727, + "step": 379 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.013902513310313225, + "learning_rate": 8.848720047002446e-05, + "loss": 0.4727, + "step": 380 + }, + { + "epoch": 0.24454428754813864, + "grad_norm": 0.01137533038854599, + "learning_rate": 8.842075584381096e-05, + "loss": 0.4648, + "step": 381 + }, + { + "epoch": 0.24518613607188702, + "grad_norm": 0.019017167389392853, + "learning_rate": 8.835414513024409e-05, + "loss": 0.4922, + "step": 382 + }, + { + "epoch": 0.24582798459563543, + "grad_norm": 0.014492962509393692, + "learning_rate": 8.828736861727225e-05, + "loss": 0.4844, + "step": 383 + }, + { + "epoch": 0.24646983311938384, + "grad_norm": 0.018102675676345825, + "learning_rate": 8.822042659356065e-05, + "loss": 0.4746, + "step": 384 + }, + { + "epoch": 0.2471116816431322, + "grad_norm": 0.012978147715330124, + "learning_rate": 8.815331934848996e-05, + "loss": 0.459, + "step": 385 + }, + { + "epoch": 0.24775353016688062, + "grad_norm": 0.011396810412406921, + "learning_rate": 8.808604717215503e-05, + "loss": 0.4492, + "step": 386 + }, + { + "epoch": 0.248395378690629, + "grad_norm": 0.013627592474222183, + "learning_rate": 8.801861035536374e-05, + "loss": 0.4805, + "step": 387 + }, + { + "epoch": 0.2490372272143774, + "grad_norm": 0.013477048836648464, + "learning_rate": 8.795100918963566e-05, + "loss": 0.4785, + "step": 388 + }, + { + "epoch": 0.2496790757381258, + "grad_norm": 0.012124343775212765, + "learning_rate": 8.78832439672008e-05, + "loss": 0.4727, + "step": 389 + }, + { + "epoch": 0.2503209242618742, + "grad_norm": 0.011770759709179401, + "learning_rate": 8.781531498099844e-05, + "loss": 0.4629, + "step": 390 + }, + { + "epoch": 0.25096277278562257, + "grad_norm": 0.011974097229540348, + "learning_rate": 8.774722252467566e-05, + "loss": 0.4766, + "step": 391 + }, + { + "epoch": 0.251604621309371, + "grad_norm": 0.012057865038514137, + "learning_rate": 8.767896689258635e-05, + "loss": 0.4531, + "step": 392 + }, + { + "epoch": 0.2522464698331194, + "grad_norm": 0.012654995545744896, + "learning_rate": 8.761054837978964e-05, + "loss": 0.4961, + "step": 393 + }, + { + "epoch": 0.25288831835686776, + "grad_norm": 0.01383557915687561, + "learning_rate": 8.754196728204886e-05, + "loss": 0.4727, + "step": 394 + }, + { + "epoch": 0.2535301668806162, + "grad_norm": 0.011721700429916382, + "learning_rate": 8.747322389583013e-05, + "loss": 0.4766, + "step": 395 + }, + { + "epoch": 0.25417201540436457, + "grad_norm": 0.012775659561157227, + "learning_rate": 8.740431851830118e-05, + "loss": 0.4707, + "step": 396 + }, + { + "epoch": 0.25481386392811295, + "grad_norm": 0.013818454928696156, + "learning_rate": 8.733525144732991e-05, + "loss": 0.4453, + "step": 397 + }, + { + "epoch": 0.2554557124518614, + "grad_norm": 0.014101814478635788, + "learning_rate": 8.726602298148325e-05, + "loss": 0.457, + "step": 398 + }, + { + "epoch": 0.25609756097560976, + "grad_norm": 0.013880562037229538, + "learning_rate": 8.719663342002585e-05, + "loss": 0.4727, + "step": 399 + }, + { + "epoch": 0.25673940949935814, + "grad_norm": 0.015757685527205467, + "learning_rate": 8.71270830629187e-05, + "loss": 0.4941, + "step": 400 + }, + { + "epoch": 0.2573812580231066, + "grad_norm": 0.01359548605978489, + "learning_rate": 8.705737221081789e-05, + "loss": 0.4883, + "step": 401 + }, + { + "epoch": 0.25802310654685495, + "grad_norm": 0.0140383904799819, + "learning_rate": 8.698750116507333e-05, + "loss": 0.4219, + "step": 402 + }, + { + "epoch": 0.25866495507060333, + "grad_norm": 0.016375742852687836, + "learning_rate": 8.691747022772742e-05, + "loss": 0.4883, + "step": 403 + }, + { + "epoch": 0.2593068035943517, + "grad_norm": 0.0133128697052598, + "learning_rate": 8.684727970151374e-05, + "loss": 0.4805, + "step": 404 + }, + { + "epoch": 0.25994865211810014, + "grad_norm": 0.013061772100627422, + "learning_rate": 8.677692988985575e-05, + "loss": 0.4766, + "step": 405 + }, + { + "epoch": 0.2605905006418485, + "grad_norm": 0.013719551265239716, + "learning_rate": 8.670642109686546e-05, + "loss": 0.457, + "step": 406 + }, + { + "epoch": 0.2612323491655969, + "grad_norm": 0.013394895009696484, + "learning_rate": 8.663575362734219e-05, + "loss": 0.4844, + "step": 407 + }, + { + "epoch": 0.26187419768934533, + "grad_norm": 0.014179762452840805, + "learning_rate": 8.656492778677112e-05, + "loss": 0.4512, + "step": 408 + }, + { + "epoch": 0.2625160462130937, + "grad_norm": 0.011698760092258453, + "learning_rate": 8.649394388132211e-05, + "loss": 0.4609, + "step": 409 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.012449960224330425, + "learning_rate": 8.642280221784828e-05, + "loss": 0.4922, + "step": 410 + }, + { + "epoch": 0.2637997432605905, + "grad_norm": 0.013107222504913807, + "learning_rate": 8.635150310388471e-05, + "loss": 0.4531, + "step": 411 + }, + { + "epoch": 0.2644415917843389, + "grad_norm": 0.012861665338277817, + "learning_rate": 8.628004684764715e-05, + "loss": 0.4258, + "step": 412 + }, + { + "epoch": 0.2650834403080873, + "grad_norm": 0.013430675491690636, + "learning_rate": 8.620843375803058e-05, + "loss": 0.4805, + "step": 413 + }, + { + "epoch": 0.26572528883183566, + "grad_norm": 0.01297419611364603, + "learning_rate": 8.613666414460806e-05, + "loss": 0.457, + "step": 414 + }, + { + "epoch": 0.2663671373555841, + "grad_norm": 0.01470587681978941, + "learning_rate": 8.606473831762916e-05, + "loss": 0.4531, + "step": 415 + }, + { + "epoch": 0.26700898587933247, + "grad_norm": 0.012723923660814762, + "learning_rate": 8.599265658801883e-05, + "loss": 0.4531, + "step": 416 + }, + { + "epoch": 0.26765083440308085, + "grad_norm": 0.010943865403532982, + "learning_rate": 8.592041926737591e-05, + "loss": 0.4688, + "step": 417 + }, + { + "epoch": 0.2682926829268293, + "grad_norm": 0.015221865847706795, + "learning_rate": 8.584802666797188e-05, + "loss": 0.4648, + "step": 418 + }, + { + "epoch": 0.26893453145057766, + "grad_norm": 0.016463516280055046, + "learning_rate": 8.577547910274941e-05, + "loss": 0.4824, + "step": 419 + }, + { + "epoch": 0.26957637997432604, + "grad_norm": 0.01197727955877781, + "learning_rate": 8.570277688532112e-05, + "loss": 0.4648, + "step": 420 + }, + { + "epoch": 0.2702182284980745, + "grad_norm": 0.016626371070742607, + "learning_rate": 8.562992032996815e-05, + "loss": 0.4824, + "step": 421 + }, + { + "epoch": 0.27086007702182285, + "grad_norm": 0.012758683413267136, + "learning_rate": 8.555690975163883e-05, + "loss": 0.4688, + "step": 422 + }, + { + "epoch": 0.27150192554557123, + "grad_norm": 0.01449665054678917, + "learning_rate": 8.548374546594727e-05, + "loss": 0.4648, + "step": 423 + }, + { + "epoch": 0.27214377406931967, + "grad_norm": 0.012013441883027554, + "learning_rate": 8.541042778917207e-05, + "loss": 0.4805, + "step": 424 + }, + { + "epoch": 0.27278562259306804, + "grad_norm": 0.01310022547841072, + "learning_rate": 8.533695703825493e-05, + "loss": 0.4551, + "step": 425 + }, + { + "epoch": 0.2734274711168164, + "grad_norm": 0.01266622543334961, + "learning_rate": 8.526333353079922e-05, + "loss": 0.416, + "step": 426 + }, + { + "epoch": 0.2740693196405648, + "grad_norm": 0.013401447795331478, + "learning_rate": 8.518955758506871e-05, + "loss": 0.4531, + "step": 427 + }, + { + "epoch": 0.27471116816431324, + "grad_norm": 0.013581424951553345, + "learning_rate": 8.511562951998607e-05, + "loss": 0.4688, + "step": 428 + }, + { + "epoch": 0.2753530166880616, + "grad_norm": 0.013305437751114368, + "learning_rate": 8.504154965513163e-05, + "loss": 0.5, + "step": 429 + }, + { + "epoch": 0.27599486521181, + "grad_norm": 0.012767552398145199, + "learning_rate": 8.496731831074189e-05, + "loss": 0.4805, + "step": 430 + }, + { + "epoch": 0.2766367137355584, + "grad_norm": 0.014782493934035301, + "learning_rate": 8.48929358077082e-05, + "loss": 0.4414, + "step": 431 + }, + { + "epoch": 0.2772785622593068, + "grad_norm": 0.013910866342484951, + "learning_rate": 8.481840246757531e-05, + "loss": 0.4414, + "step": 432 + }, + { + "epoch": 0.2779204107830552, + "grad_norm": 0.012829374521970749, + "learning_rate": 8.474371861254002e-05, + "loss": 0.4746, + "step": 433 + }, + { + "epoch": 0.2785622593068036, + "grad_norm": 0.011781183071434498, + "learning_rate": 8.466888456544983e-05, + "loss": 0.457, + "step": 434 + }, + { + "epoch": 0.279204107830552, + "grad_norm": 0.011600065045058727, + "learning_rate": 8.459390064980146e-05, + "loss": 0.4531, + "step": 435 + }, + { + "epoch": 0.2798459563543004, + "grad_norm": 0.014579742215573788, + "learning_rate": 8.45187671897395e-05, + "loss": 0.4922, + "step": 436 + }, + { + "epoch": 0.2804878048780488, + "grad_norm": 0.01168655976653099, + "learning_rate": 8.444348451005499e-05, + "loss": 0.4316, + "step": 437 + }, + { + "epoch": 0.2811296534017972, + "grad_norm": 0.013155391439795494, + "learning_rate": 8.436805293618404e-05, + "loss": 0.4688, + "step": 438 + }, + { + "epoch": 0.28177150192554556, + "grad_norm": 0.012940961867570877, + "learning_rate": 8.429247279420637e-05, + "loss": 0.4492, + "step": 439 + }, + { + "epoch": 0.28241335044929394, + "grad_norm": 0.01248475257307291, + "learning_rate": 8.421674441084404e-05, + "loss": 0.457, + "step": 440 + }, + { + "epoch": 0.2830551989730424, + "grad_norm": 0.014048158191144466, + "learning_rate": 8.414086811345978e-05, + "loss": 0.4863, + "step": 441 + }, + { + "epoch": 0.28369704749679076, + "grad_norm": 0.012956113554537296, + "learning_rate": 8.406484423005587e-05, + "loss": 0.4688, + "step": 442 + }, + { + "epoch": 0.28433889602053913, + "grad_norm": 0.01437971368432045, + "learning_rate": 8.398867308927252e-05, + "loss": 0.4609, + "step": 443 + }, + { + "epoch": 0.28498074454428757, + "grad_norm": 0.01513160765171051, + "learning_rate": 8.391235502038651e-05, + "loss": 0.4727, + "step": 444 + }, + { + "epoch": 0.28562259306803595, + "grad_norm": 0.012992394156754017, + "learning_rate": 8.383589035330977e-05, + "loss": 0.4766, + "step": 445 + }, + { + "epoch": 0.2862644415917843, + "grad_norm": 0.012563800439238548, + "learning_rate": 8.375927941858797e-05, + "loss": 0.4492, + "step": 446 + }, + { + "epoch": 0.28690629011553276, + "grad_norm": 0.014958107843995094, + "learning_rate": 8.368252254739908e-05, + "loss": 0.4688, + "step": 447 + }, + { + "epoch": 0.28754813863928114, + "grad_norm": 0.01127879973500967, + "learning_rate": 8.360562007155192e-05, + "loss": 0.418, + "step": 448 + }, + { + "epoch": 0.2881899871630295, + "grad_norm": 0.012075050733983517, + "learning_rate": 8.352857232348472e-05, + "loss": 0.4297, + "step": 449 + }, + { + "epoch": 0.2888318356867779, + "grad_norm": 0.01276320219039917, + "learning_rate": 8.345137963626372e-05, + "loss": 0.457, + "step": 450 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.012094990350306034, + "learning_rate": 8.337404234358172e-05, + "loss": 0.4609, + "step": 451 + }, + { + "epoch": 0.2901155327342747, + "grad_norm": 0.012510258704423904, + "learning_rate": 8.329656077975658e-05, + "loss": 0.4531, + "step": 452 + }, + { + "epoch": 0.2907573812580231, + "grad_norm": 0.013942587189376354, + "learning_rate": 8.32189352797299e-05, + "loss": 0.4688, + "step": 453 + }, + { + "epoch": 0.2913992297817715, + "grad_norm": 0.011335965245962143, + "learning_rate": 8.314116617906544e-05, + "loss": 0.4355, + "step": 454 + }, + { + "epoch": 0.2920410783055199, + "grad_norm": 0.013128997758030891, + "learning_rate": 8.306325381394774e-05, + "loss": 0.4473, + "step": 455 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 0.013161028735339642, + "learning_rate": 8.298519852118065e-05, + "loss": 0.4609, + "step": 456 + }, + { + "epoch": 0.2933247753530167, + "grad_norm": 0.012427788227796555, + "learning_rate": 8.290700063818589e-05, + "loss": 0.4434, + "step": 457 + }, + { + "epoch": 0.2939666238767651, + "grad_norm": 0.014244086109101772, + "learning_rate": 8.282866050300152e-05, + "loss": 0.4766, + "step": 458 + }, + { + "epoch": 0.29460847240051347, + "grad_norm": 0.012252756394445896, + "learning_rate": 8.275017845428063e-05, + "loss": 0.4766, + "step": 459 + }, + { + "epoch": 0.2952503209242619, + "grad_norm": 0.012408980168402195, + "learning_rate": 8.26715548312897e-05, + "loss": 0.4746, + "step": 460 + }, + { + "epoch": 0.2958921694480103, + "grad_norm": 0.014483646489679813, + "learning_rate": 8.259278997390725e-05, + "loss": 0.4316, + "step": 461 + }, + { + "epoch": 0.29653401797175866, + "grad_norm": 0.011408063583076, + "learning_rate": 8.251388422262234e-05, + "loss": 0.4453, + "step": 462 + }, + { + "epoch": 0.29717586649550704, + "grad_norm": 0.016647979617118835, + "learning_rate": 8.243483791853308e-05, + "loss": 0.4688, + "step": 463 + }, + { + "epoch": 0.29781771501925547, + "grad_norm": 0.012057177722454071, + "learning_rate": 8.235565140334518e-05, + "loss": 0.4512, + "step": 464 + }, + { + "epoch": 0.29845956354300385, + "grad_norm": 0.01220938190817833, + "learning_rate": 8.227632501937045e-05, + "loss": 0.4473, + "step": 465 + }, + { + "epoch": 0.2991014120667522, + "grad_norm": 0.012876022607088089, + "learning_rate": 8.219685910952532e-05, + "loss": 0.5078, + "step": 466 + }, + { + "epoch": 0.29974326059050066, + "grad_norm": 0.011478842236101627, + "learning_rate": 8.211725401732944e-05, + "loss": 0.4531, + "step": 467 + }, + { + "epoch": 0.30038510911424904, + "grad_norm": 0.012432406656444073, + "learning_rate": 8.203751008690403e-05, + "loss": 0.4297, + "step": 468 + }, + { + "epoch": 0.3010269576379974, + "grad_norm": 0.011501291766762733, + "learning_rate": 8.195762766297055e-05, + "loss": 0.4297, + "step": 469 + }, + { + "epoch": 0.30166880616174585, + "grad_norm": 0.013943162746727467, + "learning_rate": 8.187760709084911e-05, + "loss": 0.4707, + "step": 470 + }, + { + "epoch": 0.30231065468549423, + "grad_norm": 0.011519819498062134, + "learning_rate": 8.179744871645707e-05, + "loss": 0.4453, + "step": 471 + }, + { + "epoch": 0.3029525032092426, + "grad_norm": 0.011311845853924751, + "learning_rate": 8.171715288630742e-05, + "loss": 0.4414, + "step": 472 + }, + { + "epoch": 0.30359435173299104, + "grad_norm": 0.01158730685710907, + "learning_rate": 8.16367199475074e-05, + "loss": 0.4609, + "step": 473 + }, + { + "epoch": 0.3042362002567394, + "grad_norm": 0.013310915790498257, + "learning_rate": 8.155615024775693e-05, + "loss": 0.4473, + "step": 474 + }, + { + "epoch": 0.3048780487804878, + "grad_norm": 0.014084615744650364, + "learning_rate": 8.147544413534714e-05, + "loss": 0.4375, + "step": 475 + }, + { + "epoch": 0.3055198973042362, + "grad_norm": 0.011800910346210003, + "learning_rate": 8.139460195915883e-05, + "loss": 0.4453, + "step": 476 + }, + { + "epoch": 0.3061617458279846, + "grad_norm": 0.012746773660182953, + "learning_rate": 8.1313624068661e-05, + "loss": 0.4473, + "step": 477 + }, + { + "epoch": 0.306803594351733, + "grad_norm": 0.0162535198032856, + "learning_rate": 8.123251081390937e-05, + "loss": 0.4805, + "step": 478 + }, + { + "epoch": 0.30744544287548137, + "grad_norm": 0.011776765808463097, + "learning_rate": 8.115126254554471e-05, + "loss": 0.4336, + "step": 479 + }, + { + "epoch": 0.3080872913992298, + "grad_norm": 0.015100241638720036, + "learning_rate": 8.10698796147915e-05, + "loss": 0.4453, + "step": 480 + }, + { + "epoch": 0.3087291399229782, + "grad_norm": 0.014546082355082035, + "learning_rate": 8.09883623734564e-05, + "loss": 0.418, + "step": 481 + }, + { + "epoch": 0.30937098844672656, + "grad_norm": 0.01146666705608368, + "learning_rate": 8.090671117392655e-05, + "loss": 0.4336, + "step": 482 + }, + { + "epoch": 0.310012836970475, + "grad_norm": 0.012188490480184555, + "learning_rate": 8.082492636916828e-05, + "loss": 0.4551, + "step": 483 + }, + { + "epoch": 0.31065468549422337, + "grad_norm": 0.012891625985503197, + "learning_rate": 8.074300831272542e-05, + "loss": 0.4316, + "step": 484 + }, + { + "epoch": 0.31129653401797175, + "grad_norm": 0.012159287929534912, + "learning_rate": 8.066095735871786e-05, + "loss": 0.4414, + "step": 485 + }, + { + "epoch": 0.3119383825417201, + "grad_norm": 0.014158441685140133, + "learning_rate": 8.057877386183995e-05, + "loss": 0.457, + "step": 486 + }, + { + "epoch": 0.31258023106546856, + "grad_norm": 0.014469098299741745, + "learning_rate": 8.049645817735903e-05, + "loss": 0.4492, + "step": 487 + }, + { + "epoch": 0.31322207958921694, + "grad_norm": 0.012358460575342178, + "learning_rate": 8.041401066111387e-05, + "loss": 0.4316, + "step": 488 + }, + { + "epoch": 0.3138639281129653, + "grad_norm": 0.014756308868527412, + "learning_rate": 8.033143166951311e-05, + "loss": 0.4297, + "step": 489 + }, + { + "epoch": 0.31450577663671375, + "grad_norm": 0.012180181220173836, + "learning_rate": 8.024872155953376e-05, + "loss": 0.4375, + "step": 490 + }, + { + "epoch": 0.31514762516046213, + "grad_norm": 0.014177980832755566, + "learning_rate": 8.016588068871961e-05, + "loss": 0.4492, + "step": 491 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.014273082837462425, + "learning_rate": 8.008290941517976e-05, + "loss": 0.4336, + "step": 492 + }, + { + "epoch": 0.31643132220795894, + "grad_norm": 0.012356355786323547, + "learning_rate": 7.999980809758694e-05, + "loss": 0.4375, + "step": 493 + }, + { + "epoch": 0.3170731707317073, + "grad_norm": 0.013944054022431374, + "learning_rate": 7.991657709517613e-05, + "loss": 0.4375, + "step": 494 + }, + { + "epoch": 0.3177150192554557, + "grad_norm": 0.015265170484781265, + "learning_rate": 7.983321676774285e-05, + "loss": 0.4609, + "step": 495 + }, + { + "epoch": 0.31835686777920413, + "grad_norm": 0.011430501006543636, + "learning_rate": 7.974972747564174e-05, + "loss": 0.4355, + "step": 496 + }, + { + "epoch": 0.3189987163029525, + "grad_norm": 0.015532695688307285, + "learning_rate": 7.966610957978483e-05, + "loss": 0.4922, + "step": 497 + }, + { + "epoch": 0.3196405648267009, + "grad_norm": 0.014049970544874668, + "learning_rate": 7.958236344164021e-05, + "loss": 0.4473, + "step": 498 + }, + { + "epoch": 0.32028241335044927, + "grad_norm": 0.011538274586200714, + "learning_rate": 7.949848942323029e-05, + "loss": 0.4375, + "step": 499 + }, + { + "epoch": 0.3209242618741977, + "grad_norm": 0.01410175021737814, + "learning_rate": 7.941448788713024e-05, + "loss": 0.4512, + "step": 500 + }, + { + "epoch": 0.3215661103979461, + "grad_norm": 0.017135094851255417, + "learning_rate": 7.933035919646654e-05, + "loss": 0.4453, + "step": 501 + }, + { + "epoch": 0.32220795892169446, + "grad_norm": 0.013886646367609501, + "learning_rate": 7.92461037149153e-05, + "loss": 0.457, + "step": 502 + }, + { + "epoch": 0.3228498074454429, + "grad_norm": 0.02123275399208069, + "learning_rate": 7.916172180670076e-05, + "loss": 0.4805, + "step": 503 + }, + { + "epoch": 0.32349165596919127, + "grad_norm": 0.012984886765480042, + "learning_rate": 7.907721383659368e-05, + "loss": 0.4453, + "step": 504 + }, + { + "epoch": 0.32413350449293965, + "grad_norm": 0.014154748991131783, + "learning_rate": 7.899258016990969e-05, + "loss": 0.4453, + "step": 505 + }, + { + "epoch": 0.3247753530166881, + "grad_norm": 0.01399321760982275, + "learning_rate": 7.890782117250793e-05, + "loss": 0.4492, + "step": 506 + }, + { + "epoch": 0.32541720154043646, + "grad_norm": 0.012078939937055111, + "learning_rate": 7.882293721078921e-05, + "loss": 0.4238, + "step": 507 + }, + { + "epoch": 0.32605905006418484, + "grad_norm": 0.013892695307731628, + "learning_rate": 7.873792865169458e-05, + "loss": 0.4766, + "step": 508 + }, + { + "epoch": 0.3267008985879333, + "grad_norm": 0.012964858673512936, + "learning_rate": 7.865279586270371e-05, + "loss": 0.4629, + "step": 509 + }, + { + "epoch": 0.32734274711168165, + "grad_norm": 0.012767773121595383, + "learning_rate": 7.856753921183331e-05, + "loss": 0.4727, + "step": 510 + }, + { + "epoch": 0.32798459563543003, + "grad_norm": 0.015065948478877544, + "learning_rate": 7.84821590676355e-05, + "loss": 0.4336, + "step": 511 + }, + { + "epoch": 0.3286264441591784, + "grad_norm": 0.01637822948396206, + "learning_rate": 7.839665579919626e-05, + "loss": 0.4277, + "step": 512 + }, + { + "epoch": 0.32926829268292684, + "grad_norm": 0.015638669952750206, + "learning_rate": 7.83110297761338e-05, + "loss": 0.4199, + "step": 513 + }, + { + "epoch": 0.3299101412066752, + "grad_norm": 0.014630009420216084, + "learning_rate": 7.822528136859702e-05, + "loss": 0.418, + "step": 514 + }, + { + "epoch": 0.3305519897304236, + "grad_norm": 0.017467154189944267, + "learning_rate": 7.813941094726384e-05, + "loss": 0.4707, + "step": 515 + }, + { + "epoch": 0.33119383825417203, + "grad_norm": 0.012475800700485706, + "learning_rate": 7.805341888333961e-05, + "loss": 0.4297, + "step": 516 + }, + { + "epoch": 0.3318356867779204, + "grad_norm": 0.011913339607417583, + "learning_rate": 7.796730554855558e-05, + "loss": 0.4414, + "step": 517 + }, + { + "epoch": 0.3324775353016688, + "grad_norm": 0.016746627166867256, + "learning_rate": 7.788107131516717e-05, + "loss": 0.4219, + "step": 518 + }, + { + "epoch": 0.3331193838254172, + "grad_norm": 0.015730712562799454, + "learning_rate": 7.779471655595248e-05, + "loss": 0.4414, + "step": 519 + }, + { + "epoch": 0.3337612323491656, + "grad_norm": 0.013719049282371998, + "learning_rate": 7.770824164421062e-05, + "loss": 0.4609, + "step": 520 + }, + { + "epoch": 0.334403080872914, + "grad_norm": 0.01619836688041687, + "learning_rate": 7.762164695376005e-05, + "loss": 0.459, + "step": 521 + }, + { + "epoch": 0.33504492939666236, + "grad_norm": 0.015834450721740723, + "learning_rate": 7.753493285893708e-05, + "loss": 0.4219, + "step": 522 + }, + { + "epoch": 0.3356867779204108, + "grad_norm": 0.012220130302011967, + "learning_rate": 7.744809973459415e-05, + "loss": 0.4238, + "step": 523 + }, + { + "epoch": 0.3363286264441592, + "grad_norm": 0.014210267923772335, + "learning_rate": 7.736114795609828e-05, + "loss": 0.4297, + "step": 524 + }, + { + "epoch": 0.33697047496790755, + "grad_norm": 0.012427953071892262, + "learning_rate": 7.727407789932935e-05, + "loss": 0.4531, + "step": 525 + }, + { + "epoch": 0.337612323491656, + "grad_norm": 0.014018227346241474, + "learning_rate": 7.718688994067863e-05, + "loss": 0.4258, + "step": 526 + }, + { + "epoch": 0.33825417201540436, + "grad_norm": 0.011482173576951027, + "learning_rate": 7.709958445704698e-05, + "loss": 0.4219, + "step": 527 + }, + { + "epoch": 0.33889602053915274, + "grad_norm": 0.01443139836192131, + "learning_rate": 7.701216182584336e-05, + "loss": 0.4375, + "step": 528 + }, + { + "epoch": 0.3395378690629012, + "grad_norm": 0.012723613530397415, + "learning_rate": 7.692462242498313e-05, + "loss": 0.4512, + "step": 529 + }, + { + "epoch": 0.34017971758664955, + "grad_norm": 0.013509729877114296, + "learning_rate": 7.68369666328864e-05, + "loss": 0.4492, + "step": 530 + }, + { + "epoch": 0.34082156611039793, + "grad_norm": 0.012013684958219528, + "learning_rate": 7.674919482847645e-05, + "loss": 0.4336, + "step": 531 + }, + { + "epoch": 0.34146341463414637, + "grad_norm": 0.012167776003479958, + "learning_rate": 7.666130739117805e-05, + "loss": 0.418, + "step": 532 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.013328084722161293, + "learning_rate": 7.657330470091584e-05, + "loss": 0.4258, + "step": 533 + }, + { + "epoch": 0.3427471116816431, + "grad_norm": 0.011257752776145935, + "learning_rate": 7.648518713811267e-05, + "loss": 0.4062, + "step": 534 + }, + { + "epoch": 0.3433889602053915, + "grad_norm": 0.013994047418236732, + "learning_rate": 7.639695508368803e-05, + "loss": 0.4258, + "step": 535 + }, + { + "epoch": 0.34403080872913994, + "grad_norm": 0.014302888885140419, + "learning_rate": 7.630860891905625e-05, + "loss": 0.4492, + "step": 536 + }, + { + "epoch": 0.3446726572528883, + "grad_norm": 0.012811757624149323, + "learning_rate": 7.6220149026125e-05, + "loss": 0.4414, + "step": 537 + }, + { + "epoch": 0.3453145057766367, + "grad_norm": 0.011713909916579723, + "learning_rate": 7.613157578729353e-05, + "loss": 0.4219, + "step": 538 + }, + { + "epoch": 0.3459563543003851, + "grad_norm": 0.01381740439683199, + "learning_rate": 7.604288958545113e-05, + "loss": 0.4199, + "step": 539 + }, + { + "epoch": 0.3465982028241335, + "grad_norm": 0.011871659196913242, + "learning_rate": 7.59540908039754e-05, + "loss": 0.375, + "step": 540 + }, + { + "epoch": 0.3472400513478819, + "grad_norm": 0.012422285042703152, + "learning_rate": 7.586517982673054e-05, + "loss": 0.4453, + "step": 541 + }, + { + "epoch": 0.3478818998716303, + "grad_norm": 0.014537949115037918, + "learning_rate": 7.577615703806586e-05, + "loss": 0.4316, + "step": 542 + }, + { + "epoch": 0.3485237483953787, + "grad_norm": 0.013570103794336319, + "learning_rate": 7.568702282281392e-05, + "loss": 0.4336, + "step": 543 + }, + { + "epoch": 0.3491655969191271, + "grad_norm": 0.011618518270552158, + "learning_rate": 7.559777756628901e-05, + "loss": 0.4258, + "step": 544 + }, + { + "epoch": 0.3498074454428755, + "grad_norm": 0.014149384573101997, + "learning_rate": 7.550842165428543e-05, + "loss": 0.4297, + "step": 545 + }, + { + "epoch": 0.3504492939666239, + "grad_norm": 0.012584755197167397, + "learning_rate": 7.541895547307584e-05, + "loss": 0.4414, + "step": 546 + }, + { + "epoch": 0.35109114249037227, + "grad_norm": 0.01141996867954731, + "learning_rate": 7.532937940940953e-05, + "loss": 0.4102, + "step": 547 + }, + { + "epoch": 0.35173299101412064, + "grad_norm": 0.012579156085848808, + "learning_rate": 7.523969385051084e-05, + "loss": 0.4199, + "step": 548 + }, + { + "epoch": 0.3523748395378691, + "grad_norm": 0.015211129561066628, + "learning_rate": 7.514989918407744e-05, + "loss": 0.3906, + "step": 549 + }, + { + "epoch": 0.35301668806161746, + "grad_norm": 0.013005176559090614, + "learning_rate": 7.505999579827863e-05, + "loss": 0.4297, + "step": 550 + }, + { + "epoch": 0.35365853658536583, + "grad_norm": 0.015413876622915268, + "learning_rate": 7.496998408175373e-05, + "loss": 0.4531, + "step": 551 + }, + { + "epoch": 0.35430038510911427, + "grad_norm": 0.014693383127450943, + "learning_rate": 7.48798644236103e-05, + "loss": 0.4707, + "step": 552 + }, + { + "epoch": 0.35494223363286265, + "grad_norm": 0.012898659333586693, + "learning_rate": 7.478963721342256e-05, + "loss": 0.416, + "step": 553 + }, + { + "epoch": 0.355584082156611, + "grad_norm": 0.011905779130756855, + "learning_rate": 7.469930284122966e-05, + "loss": 0.4141, + "step": 554 + }, + { + "epoch": 0.35622593068035946, + "grad_norm": 0.01483968086540699, + "learning_rate": 7.460886169753397e-05, + "loss": 0.4141, + "step": 555 + }, + { + "epoch": 0.35686777920410784, + "grad_norm": 0.014148566871881485, + "learning_rate": 7.451831417329943e-05, + "loss": 0.4531, + "step": 556 + }, + { + "epoch": 0.3575096277278562, + "grad_norm": 0.013587851077318192, + "learning_rate": 7.442766065994985e-05, + "loss": 0.4316, + "step": 557 + }, + { + "epoch": 0.3581514762516046, + "grad_norm": 0.012541595846414566, + "learning_rate": 7.433690154936724e-05, + "loss": 0.4141, + "step": 558 + }, + { + "epoch": 0.35879332477535303, + "grad_norm": 0.012243524193763733, + "learning_rate": 7.424603723389006e-05, + "loss": 0.4277, + "step": 559 + }, + { + "epoch": 0.3594351732991014, + "grad_norm": 0.014602742157876492, + "learning_rate": 7.415506810631155e-05, + "loss": 0.4102, + "step": 560 + }, + { + "epoch": 0.3600770218228498, + "grad_norm": 0.015661180019378662, + "learning_rate": 7.406399455987804e-05, + "loss": 0.4336, + "step": 561 + }, + { + "epoch": 0.3607188703465982, + "grad_norm": 0.014495295472443104, + "learning_rate": 7.39728169882873e-05, + "loss": 0.4492, + "step": 562 + }, + { + "epoch": 0.3613607188703466, + "grad_norm": 0.013921634294092655, + "learning_rate": 7.388153578568671e-05, + "loss": 0.4531, + "step": 563 + }, + { + "epoch": 0.362002567394095, + "grad_norm": 0.014414452016353607, + "learning_rate": 7.379015134667167e-05, + "loss": 0.4297, + "step": 564 + }, + { + "epoch": 0.3626444159178434, + "grad_norm": 0.01415167935192585, + "learning_rate": 7.369866406628385e-05, + "loss": 0.4297, + "step": 565 + }, + { + "epoch": 0.3632862644415918, + "grad_norm": 0.017361560836434364, + "learning_rate": 7.36070743400095e-05, + "loss": 0.4062, + "step": 566 + }, + { + "epoch": 0.36392811296534017, + "grad_norm": 0.01368169579654932, + "learning_rate": 7.351538256377771e-05, + "loss": 0.4297, + "step": 567 + }, + { + "epoch": 0.3645699614890886, + "grad_norm": 0.013606247492134571, + "learning_rate": 7.342358913395874e-05, + "loss": 0.4258, + "step": 568 + }, + { + "epoch": 0.365211810012837, + "grad_norm": 0.014146490022540092, + "learning_rate": 7.333169444736225e-05, + "loss": 0.4531, + "step": 569 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 0.013685311190783978, + "learning_rate": 7.323969890123565e-05, + "loss": 0.4297, + "step": 570 + }, + { + "epoch": 0.36649550706033374, + "grad_norm": 0.012898226268589497, + "learning_rate": 7.314760289326236e-05, + "loss": 0.418, + "step": 571 + }, + { + "epoch": 0.36713735558408217, + "grad_norm": 0.014033549465239048, + "learning_rate": 7.305540682156e-05, + "loss": 0.4414, + "step": 572 + }, + { + "epoch": 0.36777920410783055, + "grad_norm": 0.012177499942481518, + "learning_rate": 7.296311108467888e-05, + "loss": 0.4102, + "step": 573 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.019322926178574562, + "learning_rate": 7.287071608160003e-05, + "loss": 0.4453, + "step": 574 + }, + { + "epoch": 0.36906290115532736, + "grad_norm": 0.014528055675327778, + "learning_rate": 7.277822221173367e-05, + "loss": 0.4434, + "step": 575 + }, + { + "epoch": 0.36970474967907574, + "grad_norm": 0.014417881146073341, + "learning_rate": 7.268562987491739e-05, + "loss": 0.4609, + "step": 576 + }, + { + "epoch": 0.3703465982028241, + "grad_norm": 0.013631274923682213, + "learning_rate": 7.259293947141441e-05, + "loss": 0.4375, + "step": 577 + }, + { + "epoch": 0.37098844672657255, + "grad_norm": 0.016857121139764786, + "learning_rate": 7.250015140191188e-05, + "loss": 0.4531, + "step": 578 + }, + { + "epoch": 0.37163029525032093, + "grad_norm": 0.014084297232329845, + "learning_rate": 7.24072660675192e-05, + "loss": 0.4219, + "step": 579 + }, + { + "epoch": 0.3722721437740693, + "grad_norm": 0.014484019950032234, + "learning_rate": 7.231428386976618e-05, + "loss": 0.4551, + "step": 580 + }, + { + "epoch": 0.37291399229781774, + "grad_norm": 0.016134057193994522, + "learning_rate": 7.222120521060134e-05, + "loss": 0.4531, + "step": 581 + }, + { + "epoch": 0.3735558408215661, + "grad_norm": 0.015156952664256096, + "learning_rate": 7.212803049239028e-05, + "loss": 0.4277, + "step": 582 + }, + { + "epoch": 0.3741976893453145, + "grad_norm": 0.014183555729687214, + "learning_rate": 7.203476011791373e-05, + "loss": 0.4102, + "step": 583 + }, + { + "epoch": 0.3748395378690629, + "grad_norm": 0.016075201332569122, + "learning_rate": 7.194139449036603e-05, + "loss": 0.4375, + "step": 584 + }, + { + "epoch": 0.3754813863928113, + "grad_norm": 0.01476068701595068, + "learning_rate": 7.184793401335322e-05, + "loss": 0.418, + "step": 585 + }, + { + "epoch": 0.3761232349165597, + "grad_norm": 0.013279945589601994, + "learning_rate": 7.175437909089139e-05, + "loss": 0.4395, + "step": 586 + }, + { + "epoch": 0.37676508344030807, + "grad_norm": 0.01367487758398056, + "learning_rate": 7.166073012740491e-05, + "loss": 0.4531, + "step": 587 + }, + { + "epoch": 0.3774069319640565, + "grad_norm": 0.012248165905475616, + "learning_rate": 7.156698752772463e-05, + "loss": 0.4102, + "step": 588 + }, + { + "epoch": 0.3780487804878049, + "grad_norm": 0.01692807860672474, + "learning_rate": 7.147315169708622e-05, + "loss": 0.4336, + "step": 589 + }, + { + "epoch": 0.37869062901155326, + "grad_norm": 0.011725704185664654, + "learning_rate": 7.137922304112838e-05, + "loss": 0.4062, + "step": 590 + }, + { + "epoch": 0.3793324775353017, + "grad_norm": 0.015160840004682541, + "learning_rate": 7.128520196589105e-05, + "loss": 0.4062, + "step": 591 + }, + { + "epoch": 0.37997432605905007, + "grad_norm": 0.01488816924393177, + "learning_rate": 7.119108887781371e-05, + "loss": 0.4375, + "step": 592 + }, + { + "epoch": 0.38061617458279845, + "grad_norm": 0.014125654473900795, + "learning_rate": 7.109688418373355e-05, + "loss": 0.4453, + "step": 593 + }, + { + "epoch": 0.38125802310654683, + "grad_norm": 0.015549080446362495, + "learning_rate": 7.100258829088384e-05, + "loss": 0.4492, + "step": 594 + }, + { + "epoch": 0.38189987163029526, + "grad_norm": 0.012530624866485596, + "learning_rate": 7.090820160689201e-05, + "loss": 0.418, + "step": 595 + }, + { + "epoch": 0.38254172015404364, + "grad_norm": 0.011856675148010254, + "learning_rate": 7.081372453977803e-05, + "loss": 0.4238, + "step": 596 + }, + { + "epoch": 0.383183568677792, + "grad_norm": 0.013924370519816875, + "learning_rate": 7.071915749795253e-05, + "loss": 0.4258, + "step": 597 + }, + { + "epoch": 0.38382541720154045, + "grad_norm": 0.012145726941525936, + "learning_rate": 7.062450089021511e-05, + "loss": 0.4258, + "step": 598 + }, + { + "epoch": 0.38446726572528883, + "grad_norm": 0.01391034759581089, + "learning_rate": 7.052975512575258e-05, + "loss": 0.4219, + "step": 599 + }, + { + "epoch": 0.3851091142490372, + "grad_norm": 0.012797384522855282, + "learning_rate": 7.04349206141371e-05, + "loss": 0.4355, + "step": 600 + }, + { + "epoch": 0.38575096277278564, + "grad_norm": 0.0131190475076437, + "learning_rate": 7.033999776532454e-05, + "loss": 0.4336, + "step": 601 + }, + { + "epoch": 0.386392811296534, + "grad_norm": 0.012565034441649914, + "learning_rate": 7.024498698965258e-05, + "loss": 0.4395, + "step": 602 + }, + { + "epoch": 0.3870346598202824, + "grad_norm": 0.014118792489171028, + "learning_rate": 7.014988869783901e-05, + "loss": 0.4473, + "step": 603 + }, + { + "epoch": 0.38767650834403083, + "grad_norm": 0.01384700182825327, + "learning_rate": 7.005470330098e-05, + "loss": 0.3984, + "step": 604 + }, + { + "epoch": 0.3883183568677792, + "grad_norm": 0.0131995165720582, + "learning_rate": 6.995943121054816e-05, + "loss": 0.4395, + "step": 605 + }, + { + "epoch": 0.3889602053915276, + "grad_norm": 0.013334594666957855, + "learning_rate": 6.986407283839091e-05, + "loss": 0.4336, + "step": 606 + }, + { + "epoch": 0.38960205391527597, + "grad_norm": 0.012548287399113178, + "learning_rate": 6.976862859672869e-05, + "loss": 0.4316, + "step": 607 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.01311532687395811, + "learning_rate": 6.96730988981531e-05, + "loss": 0.4316, + "step": 608 + }, + { + "epoch": 0.3908857509627728, + "grad_norm": 0.012342913076281548, + "learning_rate": 6.957748415562517e-05, + "loss": 0.3867, + "step": 609 + }, + { + "epoch": 0.39152759948652116, + "grad_norm": 0.012902142480015755, + "learning_rate": 6.948178478247355e-05, + "loss": 0.3906, + "step": 610 + }, + { + "epoch": 0.3921694480102696, + "grad_norm": 0.011220437474548817, + "learning_rate": 6.938600119239273e-05, + "loss": 0.416, + "step": 611 + }, + { + "epoch": 0.392811296534018, + "grad_norm": 0.012904628179967403, + "learning_rate": 6.929013379944132e-05, + "loss": 0.4277, + "step": 612 + }, + { + "epoch": 0.39345314505776635, + "grad_norm": 0.019070589914917946, + "learning_rate": 6.919418301804012e-05, + "loss": 0.4238, + "step": 613 + }, + { + "epoch": 0.3940949935815148, + "grad_norm": 0.013551855459809303, + "learning_rate": 6.909814926297044e-05, + "loss": 0.4199, + "step": 614 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.011821141466498375, + "learning_rate": 6.900203294937229e-05, + "loss": 0.3906, + "step": 615 + }, + { + "epoch": 0.39537869062901154, + "grad_norm": 0.01720435544848442, + "learning_rate": 6.89058344927425e-05, + "loss": 0.4375, + "step": 616 + }, + { + "epoch": 0.39602053915276, + "grad_norm": 0.013075001537799835, + "learning_rate": 6.88095543089331e-05, + "loss": 0.4473, + "step": 617 + }, + { + "epoch": 0.39666238767650835, + "grad_norm": 0.015591239556670189, + "learning_rate": 6.871319281414932e-05, + "loss": 0.4219, + "step": 618 + }, + { + "epoch": 0.39730423620025673, + "grad_norm": 0.014195159077644348, + "learning_rate": 6.861675042494794e-05, + "loss": 0.4414, + "step": 619 + }, + { + "epoch": 0.3979460847240051, + "grad_norm": 0.013499117456376553, + "learning_rate": 6.85202275582354e-05, + "loss": 0.3867, + "step": 620 + }, + { + "epoch": 0.39858793324775355, + "grad_norm": 0.014024997130036354, + "learning_rate": 6.842362463126604e-05, + "loss": 0.4199, + "step": 621 + }, + { + "epoch": 0.3992297817715019, + "grad_norm": 0.011665104888379574, + "learning_rate": 6.832694206164035e-05, + "loss": 0.3984, + "step": 622 + }, + { + "epoch": 0.3998716302952503, + "grad_norm": 0.01300954632461071, + "learning_rate": 6.8230180267303e-05, + "loss": 0.4062, + "step": 623 + }, + { + "epoch": 0.40051347881899874, + "grad_norm": 0.018235405907034874, + "learning_rate": 6.813333966654122e-05, + "loss": 0.457, + "step": 624 + }, + { + "epoch": 0.4011553273427471, + "grad_norm": 0.013178367167711258, + "learning_rate": 6.803642067798284e-05, + "loss": 0.4434, + "step": 625 + }, + { + "epoch": 0.4017971758664955, + "grad_norm": 0.014165622182190418, + "learning_rate": 6.793942372059462e-05, + "loss": 0.4102, + "step": 626 + }, + { + "epoch": 0.4024390243902439, + "grad_norm": 0.013849026523530483, + "learning_rate": 6.784234921368033e-05, + "loss": 0.4258, + "step": 627 + }, + { + "epoch": 0.4030808729139923, + "grad_norm": 0.014649467542767525, + "learning_rate": 6.774519757687897e-05, + "loss": 0.418, + "step": 628 + }, + { + "epoch": 0.4037227214377407, + "grad_norm": 0.01384900975972414, + "learning_rate": 6.764796923016298e-05, + "loss": 0.4199, + "step": 629 + }, + { + "epoch": 0.40436456996148906, + "grad_norm": 0.011851613409817219, + "learning_rate": 6.755066459383637e-05, + "loss": 0.4258, + "step": 630 + }, + { + "epoch": 0.4050064184852375, + "grad_norm": 0.01206874568015337, + "learning_rate": 6.7453284088533e-05, + "loss": 0.4121, + "step": 631 + }, + { + "epoch": 0.4056482670089859, + "grad_norm": 0.012382757849991322, + "learning_rate": 6.735582813521467e-05, + "loss": 0.4141, + "step": 632 + }, + { + "epoch": 0.40629011553273425, + "grad_norm": 0.01458633504807949, + "learning_rate": 6.72582971551693e-05, + "loss": 0.418, + "step": 633 + }, + { + "epoch": 0.4069319640564827, + "grad_norm": 0.012113570235669613, + "learning_rate": 6.716069157000917e-05, + "loss": 0.4219, + "step": 634 + }, + { + "epoch": 0.40757381258023107, + "grad_norm": 0.011811967939138412, + "learning_rate": 6.706301180166909e-05, + "loss": 0.4395, + "step": 635 + }, + { + "epoch": 0.40821566110397944, + "grad_norm": 0.012068016454577446, + "learning_rate": 6.69652582724045e-05, + "loss": 0.4062, + "step": 636 + }, + { + "epoch": 0.4088575096277279, + "grad_norm": 0.012213320471346378, + "learning_rate": 6.686743140478972e-05, + "loss": 0.4297, + "step": 637 + }, + { + "epoch": 0.40949935815147626, + "grad_norm": 0.012913153506815434, + "learning_rate": 6.676953162171613e-05, + "loss": 0.3926, + "step": 638 + }, + { + "epoch": 0.41014120667522463, + "grad_norm": 0.013523731380701065, + "learning_rate": 6.667155934639026e-05, + "loss": 0.4141, + "step": 639 + }, + { + "epoch": 0.41078305519897307, + "grad_norm": 0.01334290113300085, + "learning_rate": 6.65735150023321e-05, + "loss": 0.4258, + "step": 640 + }, + { + "epoch": 0.41142490372272145, + "grad_norm": 0.011239216662943363, + "learning_rate": 6.647539901337307e-05, + "loss": 0.4004, + "step": 641 + }, + { + "epoch": 0.4120667522464698, + "grad_norm": 0.013552368618547916, + "learning_rate": 6.637721180365437e-05, + "loss": 0.3984, + "step": 642 + }, + { + "epoch": 0.4127086007702182, + "grad_norm": 0.012600836344063282, + "learning_rate": 6.627895379762506e-05, + "loss": 0.4375, + "step": 643 + }, + { + "epoch": 0.41335044929396664, + "grad_norm": 0.012270119972527027, + "learning_rate": 6.618062542004024e-05, + "loss": 0.4453, + "step": 644 + }, + { + "epoch": 0.413992297817715, + "grad_norm": 0.013107932172715664, + "learning_rate": 6.608222709595925e-05, + "loss": 0.418, + "step": 645 + }, + { + "epoch": 0.4146341463414634, + "grad_norm": 0.011738761328160763, + "learning_rate": 6.598375925074373e-05, + "loss": 0.4297, + "step": 646 + }, + { + "epoch": 0.41527599486521183, + "grad_norm": 0.014674089848995209, + "learning_rate": 6.588522231005591e-05, + "loss": 0.4492, + "step": 647 + }, + { + "epoch": 0.4159178433889602, + "grad_norm": 0.014927718788385391, + "learning_rate": 6.578661669985669e-05, + "loss": 0.4395, + "step": 648 + }, + { + "epoch": 0.4165596919127086, + "grad_norm": 0.011675119400024414, + "learning_rate": 6.568794284640383e-05, + "loss": 0.4102, + "step": 649 + }, + { + "epoch": 0.417201540436457, + "grad_norm": 0.012475242838263512, + "learning_rate": 6.558920117625005e-05, + "loss": 0.4082, + "step": 650 + }, + { + "epoch": 0.4178433889602054, + "grad_norm": 0.014828507788479328, + "learning_rate": 6.549039211624129e-05, + "loss": 0.4062, + "step": 651 + }, + { + "epoch": 0.4184852374839538, + "grad_norm": 0.01544868666678667, + "learning_rate": 6.539151609351476e-05, + "loss": 0.4336, + "step": 652 + }, + { + "epoch": 0.4191270860077022, + "grad_norm": 0.011812376789748669, + "learning_rate": 6.529257353549717e-05, + "loss": 0.3789, + "step": 653 + }, + { + "epoch": 0.4197689345314506, + "grad_norm": 0.013554428704082966, + "learning_rate": 6.519356486990287e-05, + "loss": 0.4297, + "step": 654 + }, + { + "epoch": 0.42041078305519897, + "grad_norm": 0.013231674209237099, + "learning_rate": 6.509449052473193e-05, + "loss": 0.4102, + "step": 655 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.013440310023725033, + "learning_rate": 6.499535092826835e-05, + "loss": 0.3984, + "step": 656 + }, + { + "epoch": 0.4216944801026958, + "grad_norm": 0.01264109555631876, + "learning_rate": 6.489614650907825e-05, + "loss": 0.4277, + "step": 657 + }, + { + "epoch": 0.42233632862644416, + "grad_norm": 0.01602267101407051, + "learning_rate": 6.479687769600795e-05, + "loss": 0.4258, + "step": 658 + }, + { + "epoch": 0.42297817715019254, + "grad_norm": 0.013843649998307228, + "learning_rate": 6.469754491818212e-05, + "loss": 0.3945, + "step": 659 + }, + { + "epoch": 0.42362002567394097, + "grad_norm": 0.011772658675909042, + "learning_rate": 6.459814860500194e-05, + "loss": 0.4004, + "step": 660 + }, + { + "epoch": 0.42426187419768935, + "grad_norm": 0.011879803612828255, + "learning_rate": 6.449868918614325e-05, + "loss": 0.4023, + "step": 661 + }, + { + "epoch": 0.4249037227214377, + "grad_norm": 0.016992155462503433, + "learning_rate": 6.439916709155468e-05, + "loss": 0.416, + "step": 662 + }, + { + "epoch": 0.42554557124518616, + "grad_norm": 0.013572323136031628, + "learning_rate": 6.429958275145583e-05, + "loss": 0.4082, + "step": 663 + }, + { + "epoch": 0.42618741976893454, + "grad_norm": 0.012573149986565113, + "learning_rate": 6.419993659633535e-05, + "loss": 0.4336, + "step": 664 + }, + { + "epoch": 0.4268292682926829, + "grad_norm": 0.011085672304034233, + "learning_rate": 6.41002290569491e-05, + "loss": 0.4121, + "step": 665 + }, + { + "epoch": 0.4274711168164313, + "grad_norm": 0.01807100512087345, + "learning_rate": 6.400046056431829e-05, + "loss": 0.4277, + "step": 666 + }, + { + "epoch": 0.42811296534017973, + "grad_norm": 0.01302388682961464, + "learning_rate": 6.390063154972767e-05, + "loss": 0.4219, + "step": 667 + }, + { + "epoch": 0.4287548138639281, + "grad_norm": 0.01238784659653902, + "learning_rate": 6.38007424447236e-05, + "loss": 0.4023, + "step": 668 + }, + { + "epoch": 0.4293966623876765, + "grad_norm": 0.014977477490901947, + "learning_rate": 6.370079368111214e-05, + "loss": 0.4062, + "step": 669 + }, + { + "epoch": 0.4300385109114249, + "grad_norm": 0.014952096156775951, + "learning_rate": 6.360078569095734e-05, + "loss": 0.4219, + "step": 670 + }, + { + "epoch": 0.4306803594351733, + "grad_norm": 0.012696454301476479, + "learning_rate": 6.350071890657918e-05, + "loss": 0.3848, + "step": 671 + }, + { + "epoch": 0.4313222079589217, + "grad_norm": 0.013992464169859886, + "learning_rate": 6.340059376055193e-05, + "loss": 0.4336, + "step": 672 + }, + { + "epoch": 0.4319640564826701, + "grad_norm": 0.015323866158723831, + "learning_rate": 6.330041068570198e-05, + "loss": 0.4258, + "step": 673 + }, + { + "epoch": 0.4326059050064185, + "grad_norm": 0.013826565816998482, + "learning_rate": 6.32001701151063e-05, + "loss": 0.418, + "step": 674 + }, + { + "epoch": 0.43324775353016687, + "grad_norm": 0.015464494936168194, + "learning_rate": 6.309987248209029e-05, + "loss": 0.4023, + "step": 675 + }, + { + "epoch": 0.4338896020539153, + "grad_norm": 0.015331785194575787, + "learning_rate": 6.299951822022609e-05, + "loss": 0.4102, + "step": 676 + }, + { + "epoch": 0.4345314505776637, + "grad_norm": 0.012050331570208073, + "learning_rate": 6.289910776333062e-05, + "loss": 0.418, + "step": 677 + }, + { + "epoch": 0.43517329910141206, + "grad_norm": 0.016262125223875046, + "learning_rate": 6.279864154546366e-05, + "loss": 0.4102, + "step": 678 + }, + { + "epoch": 0.43581514762516044, + "grad_norm": 0.014397810213267803, + "learning_rate": 6.269812000092619e-05, + "loss": 0.4219, + "step": 679 + }, + { + "epoch": 0.43645699614890887, + "grad_norm": 0.012025640346109867, + "learning_rate": 6.259754356425818e-05, + "loss": 0.4199, + "step": 680 + }, + { + "epoch": 0.43709884467265725, + "grad_norm": 0.012223280966281891, + "learning_rate": 6.249691267023701e-05, + "loss": 0.4141, + "step": 681 + }, + { + "epoch": 0.43774069319640563, + "grad_norm": 0.013894766569137573, + "learning_rate": 6.239622775387543e-05, + "loss": 0.4141, + "step": 682 + }, + { + "epoch": 0.43838254172015406, + "grad_norm": 0.013338884338736534, + "learning_rate": 6.229548925041973e-05, + "loss": 0.4141, + "step": 683 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 0.012654834426939487, + "learning_rate": 6.219469759534784e-05, + "loss": 0.4102, + "step": 684 + }, + { + "epoch": 0.4396662387676508, + "grad_norm": 0.012363187968730927, + "learning_rate": 6.209385322436746e-05, + "loss": 0.3809, + "step": 685 + }, + { + "epoch": 0.44030808729139925, + "grad_norm": 0.012646438553929329, + "learning_rate": 6.199295657341419e-05, + "loss": 0.418, + "step": 686 + }, + { + "epoch": 0.44094993581514763, + "grad_norm": 0.013392324559390545, + "learning_rate": 6.189200807864959e-05, + "loss": 0.4023, + "step": 687 + }, + { + "epoch": 0.441591784338896, + "grad_norm": 0.013896627351641655, + "learning_rate": 6.179100817645938e-05, + "loss": 0.4082, + "step": 688 + }, + { + "epoch": 0.44223363286264444, + "grad_norm": 0.014535841532051563, + "learning_rate": 6.168995730345145e-05, + "loss": 0.418, + "step": 689 + }, + { + "epoch": 0.4428754813863928, + "grad_norm": 0.015824053436517715, + "learning_rate": 6.15888558964541e-05, + "loss": 0.3926, + "step": 690 + }, + { + "epoch": 0.4435173299101412, + "grad_norm": 0.012450765818357468, + "learning_rate": 6.148770439251405e-05, + "loss": 0.416, + "step": 691 + }, + { + "epoch": 0.4441591784338896, + "grad_norm": 0.012254000641405582, + "learning_rate": 6.138650322889453e-05, + "loss": 0.4219, + "step": 692 + }, + { + "epoch": 0.444801026957638, + "grad_norm": 0.018290603533387184, + "learning_rate": 6.128525284307354e-05, + "loss": 0.4219, + "step": 693 + }, + { + "epoch": 0.4454428754813864, + "grad_norm": 0.012934917584061623, + "learning_rate": 6.118395367274177e-05, + "loss": 0.4297, + "step": 694 + }, + { + "epoch": 0.44608472400513477, + "grad_norm": 0.01413138210773468, + "learning_rate": 6.108260615580086e-05, + "loss": 0.4336, + "step": 695 + }, + { + "epoch": 0.4467265725288832, + "grad_norm": 0.0147935152053833, + "learning_rate": 6.09812107303614e-05, + "loss": 0.4102, + "step": 696 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.013712236657738686, + "learning_rate": 6.087976783474114e-05, + "loss": 0.418, + "step": 697 + }, + { + "epoch": 0.44801026957637996, + "grad_norm": 0.012871808372437954, + "learning_rate": 6.0778277907462945e-05, + "loss": 0.4023, + "step": 698 + }, + { + "epoch": 0.4486521181001284, + "grad_norm": 0.013505810871720314, + "learning_rate": 6.06767413872531e-05, + "loss": 0.4121, + "step": 699 + }, + { + "epoch": 0.4492939666238768, + "grad_norm": 0.014261540025472641, + "learning_rate": 6.0575158713039234e-05, + "loss": 0.4102, + "step": 700 + }, + { + "epoch": 0.44993581514762515, + "grad_norm": 0.014325248077511787, + "learning_rate": 6.047353032394849e-05, + "loss": 0.4238, + "step": 701 + }, + { + "epoch": 0.45057766367137353, + "grad_norm": 0.014693713746964931, + "learning_rate": 6.037185665930567e-05, + "loss": 0.4297, + "step": 702 + }, + { + "epoch": 0.45121951219512196, + "grad_norm": 0.01406776811927557, + "learning_rate": 6.027013815863128e-05, + "loss": 0.3848, + "step": 703 + }, + { + "epoch": 0.45186136071887034, + "grad_norm": 0.011017663404345512, + "learning_rate": 6.016837526163962e-05, + "loss": 0.4023, + "step": 704 + }, + { + "epoch": 0.4525032092426187, + "grad_norm": 0.01230404619127512, + "learning_rate": 6.006656840823696e-05, + "loss": 0.418, + "step": 705 + }, + { + "epoch": 0.45314505776636715, + "grad_norm": 0.012102317065000534, + "learning_rate": 5.996471803851951e-05, + "loss": 0.3789, + "step": 706 + }, + { + "epoch": 0.45378690629011553, + "grad_norm": 0.01572810672223568, + "learning_rate": 5.986282459277168e-05, + "loss": 0.4258, + "step": 707 + }, + { + "epoch": 0.4544287548138639, + "grad_norm": 0.013043870218098164, + "learning_rate": 5.976088851146405e-05, + "loss": 0.3926, + "step": 708 + }, + { + "epoch": 0.45507060333761234, + "grad_norm": 0.013452388346195221, + "learning_rate": 5.9658910235251495e-05, + "loss": 0.4297, + "step": 709 + }, + { + "epoch": 0.4557124518613607, + "grad_norm": 0.011983458884060383, + "learning_rate": 5.9556890204971326e-05, + "loss": 0.4219, + "step": 710 + }, + { + "epoch": 0.4563543003851091, + "grad_norm": 0.012922801077365875, + "learning_rate": 5.945482886164132e-05, + "loss": 0.4336, + "step": 711 + }, + { + "epoch": 0.45699614890885754, + "grad_norm": 0.01430261880159378, + "learning_rate": 5.935272664645786e-05, + "loss": 0.4141, + "step": 712 + }, + { + "epoch": 0.4576379974326059, + "grad_norm": 0.013975469395518303, + "learning_rate": 5.9250584000794017e-05, + "loss": 0.4238, + "step": 713 + }, + { + "epoch": 0.4582798459563543, + "grad_norm": 0.013664472848176956, + "learning_rate": 5.914840136619761e-05, + "loss": 0.4453, + "step": 714 + }, + { + "epoch": 0.45892169448010267, + "grad_norm": 0.01542427483946085, + "learning_rate": 5.904617918438936e-05, + "loss": 0.3945, + "step": 715 + }, + { + "epoch": 0.4595635430038511, + "grad_norm": 0.012004110030829906, + "learning_rate": 5.894391789726093e-05, + "loss": 0.3945, + "step": 716 + }, + { + "epoch": 0.4602053915275995, + "grad_norm": 0.014152607880532742, + "learning_rate": 5.8841617946873015e-05, + "loss": 0.4023, + "step": 717 + }, + { + "epoch": 0.46084724005134786, + "grad_norm": 0.014494920149445534, + "learning_rate": 5.873927977545346e-05, + "loss": 0.4219, + "step": 718 + }, + { + "epoch": 0.4614890885750963, + "grad_norm": 0.01685180701315403, + "learning_rate": 5.863690382539535e-05, + "loss": 0.3945, + "step": 719 + }, + { + "epoch": 0.4621309370988447, + "grad_norm": 0.013664394617080688, + "learning_rate": 5.853449053925505e-05, + "loss": 0.4219, + "step": 720 + }, + { + "epoch": 0.46277278562259305, + "grad_norm": 0.014847121201455593, + "learning_rate": 5.843204035975033e-05, + "loss": 0.4258, + "step": 721 + }, + { + "epoch": 0.4634146341463415, + "grad_norm": 0.015490569174289703, + "learning_rate": 5.832955372975848e-05, + "loss": 0.418, + "step": 722 + }, + { + "epoch": 0.46405648267008986, + "grad_norm": 0.015505144372582436, + "learning_rate": 5.822703109231431e-05, + "loss": 0.4258, + "step": 723 + }, + { + "epoch": 0.46469833119383824, + "grad_norm": 0.012789933010935783, + "learning_rate": 5.812447289060832e-05, + "loss": 0.4023, + "step": 724 + }, + { + "epoch": 0.4653401797175867, + "grad_norm": 0.01355286780744791, + "learning_rate": 5.80218795679847e-05, + "loss": 0.3984, + "step": 725 + }, + { + "epoch": 0.46598202824133506, + "grad_norm": 0.013972820714116096, + "learning_rate": 5.791925156793956e-05, + "loss": 0.4258, + "step": 726 + }, + { + "epoch": 0.46662387676508343, + "grad_norm": 0.012723516672849655, + "learning_rate": 5.781658933411882e-05, + "loss": 0.3633, + "step": 727 + }, + { + "epoch": 0.4672657252888318, + "grad_norm": 0.013931083492934704, + "learning_rate": 5.7713893310316426e-05, + "loss": 0.4258, + "step": 728 + }, + { + "epoch": 0.46790757381258025, + "grad_norm": 0.014217558316886425, + "learning_rate": 5.761116394047238e-05, + "loss": 0.4434, + "step": 729 + }, + { + "epoch": 0.4685494223363286, + "grad_norm": 0.012869459576904774, + "learning_rate": 5.750840166867085e-05, + "loss": 0.4062, + "step": 730 + }, + { + "epoch": 0.469191270860077, + "grad_norm": 0.014253105036914349, + "learning_rate": 5.740560693913825e-05, + "loss": 0.4375, + "step": 731 + }, + { + "epoch": 0.46983311938382544, + "grad_norm": 0.012103088200092316, + "learning_rate": 5.7302780196241245e-05, + "loss": 0.4297, + "step": 732 + }, + { + "epoch": 0.4704749679075738, + "grad_norm": 0.012361271306872368, + "learning_rate": 5.7199921884484975e-05, + "loss": 0.4258, + "step": 733 + }, + { + "epoch": 0.4711168164313222, + "grad_norm": 0.015202773734927177, + "learning_rate": 5.7097032448510945e-05, + "loss": 0.4102, + "step": 734 + }, + { + "epoch": 0.47175866495507063, + "grad_norm": 0.012599785812199116, + "learning_rate": 5.699411233309528e-05, + "loss": 0.3984, + "step": 735 + }, + { + "epoch": 0.472400513478819, + "grad_norm": 0.011520575731992722, + "learning_rate": 5.689116198314673e-05, + "loss": 0.377, + "step": 736 + }, + { + "epoch": 0.4730423620025674, + "grad_norm": 0.01304789911955595, + "learning_rate": 5.678818184370469e-05, + "loss": 0.4102, + "step": 737 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.011445467360317707, + "learning_rate": 5.668517235993739e-05, + "loss": 0.3945, + "step": 738 + }, + { + "epoch": 0.4743260590500642, + "grad_norm": 0.012655424885451794, + "learning_rate": 5.658213397713985e-05, + "loss": 0.4023, + "step": 739 + }, + { + "epoch": 0.4749679075738126, + "grad_norm": 0.014334771782159805, + "learning_rate": 5.647906714073208e-05, + "loss": 0.4082, + "step": 740 + }, + { + "epoch": 0.47560975609756095, + "grad_norm": 0.011826477013528347, + "learning_rate": 5.637597229625705e-05, + "loss": 0.4375, + "step": 741 + }, + { + "epoch": 0.4762516046213094, + "grad_norm": 0.013866581954061985, + "learning_rate": 5.627284988937882e-05, + "loss": 0.4102, + "step": 742 + }, + { + "epoch": 0.47689345314505777, + "grad_norm": 0.011929787695407867, + "learning_rate": 5.616970036588058e-05, + "loss": 0.3945, + "step": 743 + }, + { + "epoch": 0.47753530166880614, + "grad_norm": 0.014417732134461403, + "learning_rate": 5.606652417166276e-05, + "loss": 0.418, + "step": 744 + }, + { + "epoch": 0.4781771501925546, + "grad_norm": 0.012242990545928478, + "learning_rate": 5.59633217527411e-05, + "loss": 0.4277, + "step": 745 + }, + { + "epoch": 0.47881899871630296, + "grad_norm": 0.012755023315548897, + "learning_rate": 5.586009355524465e-05, + "loss": 0.3945, + "step": 746 + }, + { + "epoch": 0.47946084724005134, + "grad_norm": 0.012235814705491066, + "learning_rate": 5.575684002541397e-05, + "loss": 0.3945, + "step": 747 + }, + { + "epoch": 0.48010269576379977, + "grad_norm": 0.013560086488723755, + "learning_rate": 5.5653561609599046e-05, + "loss": 0.4023, + "step": 748 + }, + { + "epoch": 0.48074454428754815, + "grad_norm": 0.013584466651082039, + "learning_rate": 5.555025875425751e-05, + "loss": 0.4102, + "step": 749 + }, + { + "epoch": 0.4813863928112965, + "grad_norm": 0.012287539429962635, + "learning_rate": 5.5446931905952624e-05, + "loss": 0.4023, + "step": 750 + }, + { + "epoch": 0.4820282413350449, + "grad_norm": 0.015574885532259941, + "learning_rate": 5.534358151135135e-05, + "loss": 0.4023, + "step": 751 + }, + { + "epoch": 0.48267008985879334, + "grad_norm": 0.0164936576038599, + "learning_rate": 5.524020801722246e-05, + "loss": 0.4102, + "step": 752 + }, + { + "epoch": 0.4833119383825417, + "grad_norm": 0.012463854625821114, + "learning_rate": 5.513681187043456e-05, + "loss": 0.3965, + "step": 753 + }, + { + "epoch": 0.4839537869062901, + "grad_norm": 0.013957749120891094, + "learning_rate": 5.503339351795419e-05, + "loss": 0.4062, + "step": 754 + }, + { + "epoch": 0.48459563543003853, + "grad_norm": 0.013575349003076553, + "learning_rate": 5.4929953406843906e-05, + "loss": 0.4375, + "step": 755 + }, + { + "epoch": 0.4852374839537869, + "grad_norm": 0.012251514010131359, + "learning_rate": 5.4826491984260284e-05, + "loss": 0.4043, + "step": 756 + }, + { + "epoch": 0.4858793324775353, + "grad_norm": 0.012463661842048168, + "learning_rate": 5.472300969745204e-05, + "loss": 0.418, + "step": 757 + }, + { + "epoch": 0.4865211810012837, + "grad_norm": 0.014550065621733665, + "learning_rate": 5.46195069937581e-05, + "loss": 0.4297, + "step": 758 + }, + { + "epoch": 0.4871630295250321, + "grad_norm": 0.012784469872713089, + "learning_rate": 5.451598432060563e-05, + "loss": 0.4473, + "step": 759 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.014856231398880482, + "learning_rate": 5.4412442125508113e-05, + "loss": 0.4023, + "step": 760 + }, + { + "epoch": 0.4884467265725289, + "grad_norm": 0.014101446606218815, + "learning_rate": 5.430888085606346e-05, + "loss": 0.4336, + "step": 761 + }, + { + "epoch": 0.4890885750962773, + "grad_norm": 0.013721944764256477, + "learning_rate": 5.4205300959951974e-05, + "loss": 0.3945, + "step": 762 + }, + { + "epoch": 0.48973042362002567, + "grad_norm": 0.013042719103395939, + "learning_rate": 5.410170288493458e-05, + "loss": 0.3867, + "step": 763 + }, + { + "epoch": 0.49037227214377405, + "grad_norm": 0.013128210790455341, + "learning_rate": 5.399808707885069e-05, + "loss": 0.3789, + "step": 764 + }, + { + "epoch": 0.4910141206675225, + "grad_norm": 0.016061117872595787, + "learning_rate": 5.389445398961639e-05, + "loss": 0.3965, + "step": 765 + }, + { + "epoch": 0.49165596919127086, + "grad_norm": 0.013035389594733715, + "learning_rate": 5.3790804065222524e-05, + "loss": 0.4023, + "step": 766 + }, + { + "epoch": 0.49229781771501924, + "grad_norm": 0.012277752161026001, + "learning_rate": 5.368713775373264e-05, + "loss": 0.375, + "step": 767 + }, + { + "epoch": 0.49293966623876767, + "grad_norm": 0.013250727206468582, + "learning_rate": 5.3583455503281176e-05, + "loss": 0.4219, + "step": 768 + }, + { + "epoch": 0.49358151476251605, + "grad_norm": 0.01238930132240057, + "learning_rate": 5.347975776207148e-05, + "loss": 0.416, + "step": 769 + }, + { + "epoch": 0.4942233632862644, + "grad_norm": 0.011913436464965343, + "learning_rate": 5.337604497837383e-05, + "loss": 0.4023, + "step": 770 + }, + { + "epoch": 0.49486521181001286, + "grad_norm": 0.01252955012023449, + "learning_rate": 5.3272317600523505e-05, + "loss": 0.3828, + "step": 771 + }, + { + "epoch": 0.49550706033376124, + "grad_norm": 0.013348378241062164, + "learning_rate": 5.3168576076918966e-05, + "loss": 0.4102, + "step": 772 + }, + { + "epoch": 0.4961489088575096, + "grad_norm": 0.014592018909752369, + "learning_rate": 5.3064820856019745e-05, + "loss": 0.3828, + "step": 773 + }, + { + "epoch": 0.496790757381258, + "grad_norm": 0.014033477753400803, + "learning_rate": 5.2961052386344615e-05, + "loss": 0.4121, + "step": 774 + }, + { + "epoch": 0.49743260590500643, + "grad_norm": 0.013774393126368523, + "learning_rate": 5.28572711164696e-05, + "loss": 0.4102, + "step": 775 + }, + { + "epoch": 0.4980744544287548, + "grad_norm": 0.013182034716010094, + "learning_rate": 5.2753477495026084e-05, + "loss": 0.4297, + "step": 776 + }, + { + "epoch": 0.4987163029525032, + "grad_norm": 0.01512659341096878, + "learning_rate": 5.264967197069884e-05, + "loss": 0.4004, + "step": 777 + }, + { + "epoch": 0.4993581514762516, + "grad_norm": 0.013438140042126179, + "learning_rate": 5.25458549922241e-05, + "loss": 0.3945, + "step": 778 + }, + { + "epoch": 0.5, + "grad_norm": 0.012077905237674713, + "learning_rate": 5.244202700838756e-05, + "loss": 0.3867, + "step": 779 + }, + { + "epoch": 0.5006418485237484, + "grad_norm": 0.015471718274056911, + "learning_rate": 5.233818846802255e-05, + "loss": 0.3887, + "step": 780 + }, + { + "epoch": 0.5012836970474968, + "grad_norm": 0.015281775034964085, + "learning_rate": 5.223433982000804e-05, + "loss": 0.3926, + "step": 781 + }, + { + "epoch": 0.5019255455712451, + "grad_norm": 0.014022503979504108, + "learning_rate": 5.213048151326664e-05, + "loss": 0.4043, + "step": 782 + }, + { + "epoch": 0.5025673940949936, + "grad_norm": 0.012185904197394848, + "learning_rate": 5.2026613996762754e-05, + "loss": 0.375, + "step": 783 + }, + { + "epoch": 0.503209242618742, + "grad_norm": 0.01691463030874729, + "learning_rate": 5.192273771950057e-05, + "loss": 0.4141, + "step": 784 + }, + { + "epoch": 0.5038510911424904, + "grad_norm": 0.013279242441058159, + "learning_rate": 5.1818853130522184e-05, + "loss": 0.4023, + "step": 785 + }, + { + "epoch": 0.5044929396662388, + "grad_norm": 0.014866221696138382, + "learning_rate": 5.17149606789056e-05, + "loss": 0.4023, + "step": 786 + }, + { + "epoch": 0.5051347881899871, + "grad_norm": 0.014251725748181343, + "learning_rate": 5.161106081376281e-05, + "loss": 0.4199, + "step": 787 + }, + { + "epoch": 0.5057766367137355, + "grad_norm": 0.01272731926292181, + "learning_rate": 5.1507153984237857e-05, + "loss": 0.4023, + "step": 788 + }, + { + "epoch": 0.506418485237484, + "grad_norm": 0.014095312915742397, + "learning_rate": 5.140324063950488e-05, + "loss": 0.4082, + "step": 789 + }, + { + "epoch": 0.5070603337612324, + "grad_norm": 0.015140281058847904, + "learning_rate": 5.1299321228766194e-05, + "loss": 0.3809, + "step": 790 + }, + { + "epoch": 0.5077021822849808, + "grad_norm": 0.013064335100352764, + "learning_rate": 5.119539620125037e-05, + "loss": 0.4297, + "step": 791 + }, + { + "epoch": 0.5083440308087291, + "grad_norm": 0.013767260126769543, + "learning_rate": 5.1091466006210185e-05, + "loss": 0.3965, + "step": 792 + }, + { + "epoch": 0.5089858793324775, + "grad_norm": 0.014455810189247131, + "learning_rate": 5.0987531092920806e-05, + "loss": 0.3945, + "step": 793 + }, + { + "epoch": 0.5096277278562259, + "grad_norm": 0.012534044682979584, + "learning_rate": 5.0883591910677774e-05, + "loss": 0.3945, + "step": 794 + }, + { + "epoch": 0.5102695763799743, + "grad_norm": 0.015598188154399395, + "learning_rate": 5.0779648908795116e-05, + "loss": 0.3945, + "step": 795 + }, + { + "epoch": 0.5109114249037228, + "grad_norm": 0.013126207515597343, + "learning_rate": 5.067570253660333e-05, + "loss": 0.4102, + "step": 796 + }, + { + "epoch": 0.5115532734274711, + "grad_norm": 0.013315839692950249, + "learning_rate": 5.0571753243447515e-05, + "loss": 0.4043, + "step": 797 + }, + { + "epoch": 0.5121951219512195, + "grad_norm": 0.016860026866197586, + "learning_rate": 5.046780147868537e-05, + "loss": 0.3848, + "step": 798 + }, + { + "epoch": 0.5128369704749679, + "grad_norm": 0.01219270471483469, + "learning_rate": 5.0363847691685305e-05, + "loss": 0.3945, + "step": 799 + }, + { + "epoch": 0.5134788189987163, + "grad_norm": 0.011769594624638557, + "learning_rate": 5.0259892331824474e-05, + "loss": 0.3867, + "step": 800 + }, + { + "epoch": 0.5141206675224647, + "grad_norm": 0.012503019534051418, + "learning_rate": 5.015593584848679e-05, + "loss": 0.416, + "step": 801 + }, + { + "epoch": 0.5147625160462131, + "grad_norm": 0.01306526642292738, + "learning_rate": 5.0051978691061054e-05, + "loss": 0.4141, + "step": 802 + }, + { + "epoch": 0.5154043645699615, + "grad_norm": 0.012884598225355148, + "learning_rate": 4.9948021308938965e-05, + "loss": 0.4043, + "step": 803 + }, + { + "epoch": 0.5160462130937099, + "grad_norm": 0.010572467930614948, + "learning_rate": 4.984406415151323e-05, + "loss": 0.3828, + "step": 804 + }, + { + "epoch": 0.5166880616174583, + "grad_norm": 0.014578374102711678, + "learning_rate": 4.974010766817555e-05, + "loss": 0.4141, + "step": 805 + }, + { + "epoch": 0.5173299101412067, + "grad_norm": 0.012406845577061176, + "learning_rate": 4.9636152308314687e-05, + "loss": 0.3926, + "step": 806 + }, + { + "epoch": 0.517971758664955, + "grad_norm": 0.012090642005205154, + "learning_rate": 4.9532198521314635e-05, + "loss": 0.4102, + "step": 807 + }, + { + "epoch": 0.5186136071887034, + "grad_norm": 0.013852028176188469, + "learning_rate": 4.9428246756552496e-05, + "loss": 0.3945, + "step": 808 + }, + { + "epoch": 0.5192554557124519, + "grad_norm": 0.01639687456190586, + "learning_rate": 4.9324297463396685e-05, + "loss": 0.4062, + "step": 809 + }, + { + "epoch": 0.5198973042362003, + "grad_norm": 0.01494478527456522, + "learning_rate": 4.922035109120491e-05, + "loss": 0.4277, + "step": 810 + }, + { + "epoch": 0.5205391527599487, + "grad_norm": 0.012924427166581154, + "learning_rate": 4.911640808932223e-05, + "loss": 0.4062, + "step": 811 + }, + { + "epoch": 0.521181001283697, + "grad_norm": 0.012701825238764286, + "learning_rate": 4.901246890707921e-05, + "loss": 0.3984, + "step": 812 + }, + { + "epoch": 0.5218228498074454, + "grad_norm": 0.013332185335457325, + "learning_rate": 4.8908533993789826e-05, + "loss": 0.4023, + "step": 813 + }, + { + "epoch": 0.5224646983311938, + "grad_norm": 0.015123789198696613, + "learning_rate": 4.880460379874965e-05, + "loss": 0.4102, + "step": 814 + }, + { + "epoch": 0.5231065468549422, + "grad_norm": 0.011819974519312382, + "learning_rate": 4.870067877123382e-05, + "loss": 0.3789, + "step": 815 + }, + { + "epoch": 0.5237483953786907, + "grad_norm": 0.016871098428964615, + "learning_rate": 4.859675936049514e-05, + "loss": 0.3945, + "step": 816 + }, + { + "epoch": 0.524390243902439, + "grad_norm": 0.013095321133732796, + "learning_rate": 4.849284601576215e-05, + "loss": 0.3809, + "step": 817 + }, + { + "epoch": 0.5250320924261874, + "grad_norm": 0.014770804904401302, + "learning_rate": 4.8388939186237195e-05, + "loss": 0.3711, + "step": 818 + }, + { + "epoch": 0.5256739409499358, + "grad_norm": 0.015598641708493233, + "learning_rate": 4.828503932109441e-05, + "loss": 0.4121, + "step": 819 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.012911478988826275, + "learning_rate": 4.818114686947783e-05, + "loss": 0.3887, + "step": 820 + }, + { + "epoch": 0.5269576379974326, + "grad_norm": 0.013151862658560276, + "learning_rate": 4.8077262280499444e-05, + "loss": 0.4062, + "step": 821 + }, + { + "epoch": 0.527599486521181, + "grad_norm": 0.012958858162164688, + "learning_rate": 4.797338600323727e-05, + "loss": 0.3984, + "step": 822 + }, + { + "epoch": 0.5282413350449294, + "grad_norm": 0.013672449626028538, + "learning_rate": 4.7869518486733364e-05, + "loss": 0.418, + "step": 823 + }, + { + "epoch": 0.5288831835686778, + "grad_norm": 0.01231690589338541, + "learning_rate": 4.776566017999197e-05, + "loss": 0.373, + "step": 824 + }, + { + "epoch": 0.5295250320924262, + "grad_norm": 0.012452663853764534, + "learning_rate": 4.766181153197746e-05, + "loss": 0.3711, + "step": 825 + }, + { + "epoch": 0.5301668806161746, + "grad_norm": 0.011798235587775707, + "learning_rate": 4.755797299161246e-05, + "loss": 0.3828, + "step": 826 + }, + { + "epoch": 0.5308087291399229, + "grad_norm": 0.013432770036160946, + "learning_rate": 4.7454145007775925e-05, + "loss": 0.375, + "step": 827 + }, + { + "epoch": 0.5314505776636713, + "grad_norm": 0.013223826885223389, + "learning_rate": 4.7350328029301156e-05, + "loss": 0.3867, + "step": 828 + }, + { + "epoch": 0.5320924261874198, + "grad_norm": 0.012126738205552101, + "learning_rate": 4.7246522504973914e-05, + "loss": 0.3926, + "step": 829 + }, + { + "epoch": 0.5327342747111682, + "grad_norm": 0.011325270868837833, + "learning_rate": 4.714272888353041e-05, + "loss": 0.373, + "step": 830 + }, + { + "epoch": 0.5333761232349166, + "grad_norm": 0.01423851028084755, + "learning_rate": 4.70389476136554e-05, + "loss": 0.418, + "step": 831 + }, + { + "epoch": 0.5340179717586649, + "grad_norm": 0.01402893103659153, + "learning_rate": 4.6935179143980266e-05, + "loss": 0.4258, + "step": 832 + }, + { + "epoch": 0.5346598202824133, + "grad_norm": 0.012596989050507545, + "learning_rate": 4.683142392308105e-05, + "loss": 0.375, + "step": 833 + }, + { + "epoch": 0.5353016688061617, + "grad_norm": 0.012625569477677345, + "learning_rate": 4.672768239947649e-05, + "loss": 0.4043, + "step": 834 + }, + { + "epoch": 0.5359435173299102, + "grad_norm": 0.012342373840510845, + "learning_rate": 4.6623955021626184e-05, + "loss": 0.4023, + "step": 835 + }, + { + "epoch": 0.5365853658536586, + "grad_norm": 0.014212844893336296, + "learning_rate": 4.652024223792853e-05, + "loss": 0.4258, + "step": 836 + }, + { + "epoch": 0.537227214377407, + "grad_norm": 0.01258305087685585, + "learning_rate": 4.6416544496718835e-05, + "loss": 0.3945, + "step": 837 + }, + { + "epoch": 0.5378690629011553, + "grad_norm": 0.013877546414732933, + "learning_rate": 4.631286224626739e-05, + "loss": 0.3906, + "step": 838 + }, + { + "epoch": 0.5385109114249037, + "grad_norm": 0.013766865245997906, + "learning_rate": 4.620919593477749e-05, + "loss": 0.4141, + "step": 839 + }, + { + "epoch": 0.5391527599486521, + "grad_norm": 0.011848713271319866, + "learning_rate": 4.610554601038361e-05, + "loss": 0.3594, + "step": 840 + }, + { + "epoch": 0.5397946084724005, + "grad_norm": 0.013195286504924297, + "learning_rate": 4.600191292114932e-05, + "loss": 0.4219, + "step": 841 + }, + { + "epoch": 0.540436456996149, + "grad_norm": 0.012025582604110241, + "learning_rate": 4.5898297115065434e-05, + "loss": 0.3828, + "step": 842 + }, + { + "epoch": 0.5410783055198973, + "grad_norm": 0.014689466916024685, + "learning_rate": 4.579469904004803e-05, + "loss": 0.418, + "step": 843 + }, + { + "epoch": 0.5417201540436457, + "grad_norm": 0.01123505737632513, + "learning_rate": 4.5691119143936565e-05, + "loss": 0.3867, + "step": 844 + }, + { + "epoch": 0.5423620025673941, + "grad_norm": 0.012202415615320206, + "learning_rate": 4.558755787449189e-05, + "loss": 0.3848, + "step": 845 + }, + { + "epoch": 0.5430038510911425, + "grad_norm": 0.011510019190609455, + "learning_rate": 4.548401567939439e-05, + "loss": 0.3691, + "step": 846 + }, + { + "epoch": 0.5436456996148908, + "grad_norm": 0.014318268746137619, + "learning_rate": 4.538049300624192e-05, + "loss": 0.4297, + "step": 847 + }, + { + "epoch": 0.5442875481386393, + "grad_norm": 0.01239953376352787, + "learning_rate": 4.5276990302547964e-05, + "loss": 0.4102, + "step": 848 + }, + { + "epoch": 0.5449293966623877, + "grad_norm": 0.012069017626345158, + "learning_rate": 4.5173508015739735e-05, + "loss": 0.3789, + "step": 849 + }, + { + "epoch": 0.5455712451861361, + "grad_norm": 0.011770493350923061, + "learning_rate": 4.507004659315611e-05, + "loss": 0.3711, + "step": 850 + }, + { + "epoch": 0.5462130937098845, + "grad_norm": 0.012663186527788639, + "learning_rate": 4.496660648204581e-05, + "loss": 0.3828, + "step": 851 + }, + { + "epoch": 0.5468549422336328, + "grad_norm": 0.012398538179695606, + "learning_rate": 4.4863188129565454e-05, + "loss": 0.3906, + "step": 852 + }, + { + "epoch": 0.5474967907573812, + "grad_norm": 0.013944141566753387, + "learning_rate": 4.4759791982777546e-05, + "loss": 0.375, + "step": 853 + }, + { + "epoch": 0.5481386392811296, + "grad_norm": 0.012811684980988503, + "learning_rate": 4.465641848864866e-05, + "loss": 0.4199, + "step": 854 + }, + { + "epoch": 0.5487804878048781, + "grad_norm": 0.013038134202361107, + "learning_rate": 4.4553068094047394e-05, + "loss": 0.3965, + "step": 855 + }, + { + "epoch": 0.5494223363286265, + "grad_norm": 0.010878165252506733, + "learning_rate": 4.4449741245742494e-05, + "loss": 0.3828, + "step": 856 + }, + { + "epoch": 0.5500641848523748, + "grad_norm": 0.013144944794476032, + "learning_rate": 4.4346438390400965e-05, + "loss": 0.375, + "step": 857 + }, + { + "epoch": 0.5507060333761232, + "grad_norm": 0.013227356597781181, + "learning_rate": 4.4243159974586044e-05, + "loss": 0.3867, + "step": 858 + }, + { + "epoch": 0.5513478818998716, + "grad_norm": 0.013552071526646614, + "learning_rate": 4.413990644475536e-05, + "loss": 0.4102, + "step": 859 + }, + { + "epoch": 0.55198973042362, + "grad_norm": 0.016889218240976334, + "learning_rate": 4.4036678247258924e-05, + "loss": 0.4062, + "step": 860 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.01147947646677494, + "learning_rate": 4.393347582833724e-05, + "loss": 0.3867, + "step": 861 + }, + { + "epoch": 0.5532734274711169, + "grad_norm": 0.011723281815648079, + "learning_rate": 4.383029963411943e-05, + "loss": 0.3867, + "step": 862 + }, + { + "epoch": 0.5539152759948652, + "grad_norm": 0.012720635160803795, + "learning_rate": 4.372715011062118e-05, + "loss": 0.4102, + "step": 863 + }, + { + "epoch": 0.5545571245186136, + "grad_norm": 0.016574840992689133, + "learning_rate": 4.3624027703742953e-05, + "loss": 0.3945, + "step": 864 + }, + { + "epoch": 0.555198973042362, + "grad_norm": 0.01387743279337883, + "learning_rate": 4.352093285926793e-05, + "loss": 0.4004, + "step": 865 + }, + { + "epoch": 0.5558408215661104, + "grad_norm": 0.014131239615380764, + "learning_rate": 4.3417866022860156e-05, + "loss": 0.4297, + "step": 866 + }, + { + "epoch": 0.5564826700898587, + "grad_norm": 0.013099314644932747, + "learning_rate": 4.331482764006262e-05, + "loss": 0.4141, + "step": 867 + }, + { + "epoch": 0.5571245186136072, + "grad_norm": 0.012790219858288765, + "learning_rate": 4.3211818156295307e-05, + "loss": 0.3945, + "step": 868 + }, + { + "epoch": 0.5577663671373556, + "grad_norm": 0.017325740307569504, + "learning_rate": 4.310883801685328e-05, + "loss": 0.3906, + "step": 869 + }, + { + "epoch": 0.558408215661104, + "grad_norm": 0.012533944100141525, + "learning_rate": 4.300588766690473e-05, + "loss": 0.3711, + "step": 870 + }, + { + "epoch": 0.5590500641848524, + "grad_norm": 0.013145022094249725, + "learning_rate": 4.290296755148907e-05, + "loss": 0.3984, + "step": 871 + }, + { + "epoch": 0.5596919127086007, + "grad_norm": 0.014711151830852032, + "learning_rate": 4.280007811551505e-05, + "loss": 0.3945, + "step": 872 + }, + { + "epoch": 0.5603337612323491, + "grad_norm": 0.01361773069947958, + "learning_rate": 4.2697219803758746e-05, + "loss": 0.4043, + "step": 873 + }, + { + "epoch": 0.5609756097560976, + "grad_norm": 0.014319811947643757, + "learning_rate": 4.259439306086176e-05, + "loss": 0.3809, + "step": 874 + }, + { + "epoch": 0.561617458279846, + "grad_norm": 0.01249017659574747, + "learning_rate": 4.2491598331329154e-05, + "loss": 0.3906, + "step": 875 + }, + { + "epoch": 0.5622593068035944, + "grad_norm": 0.013194131664931774, + "learning_rate": 4.2388836059527634e-05, + "loss": 0.3906, + "step": 876 + }, + { + "epoch": 0.5629011553273428, + "grad_norm": 0.016752328723669052, + "learning_rate": 4.2286106689683605e-05, + "loss": 0.4082, + "step": 877 + }, + { + "epoch": 0.5635430038510911, + "grad_norm": 0.014421268366277218, + "learning_rate": 4.218341066588121e-05, + "loss": 0.3984, + "step": 878 + }, + { + "epoch": 0.5641848523748395, + "grad_norm": 0.012712839990854263, + "learning_rate": 4.2080748432060444e-05, + "loss": 0.3984, + "step": 879 + }, + { + "epoch": 0.5648267008985879, + "grad_norm": 0.016034195199608803, + "learning_rate": 4.19781204320153e-05, + "loss": 0.3945, + "step": 880 + }, + { + "epoch": 0.5654685494223364, + "grad_norm": 0.014965656213462353, + "learning_rate": 4.18755271093917e-05, + "loss": 0.3828, + "step": 881 + }, + { + "epoch": 0.5661103979460848, + "grad_norm": 0.013681135140359402, + "learning_rate": 4.1772968907685704e-05, + "loss": 0.3828, + "step": 882 + }, + { + "epoch": 0.5667522464698331, + "grad_norm": 0.012078775092959404, + "learning_rate": 4.1670446270241545e-05, + "loss": 0.4062, + "step": 883 + }, + { + "epoch": 0.5673940949935815, + "grad_norm": 0.013224365189671516, + "learning_rate": 4.156795964024967e-05, + "loss": 0.4004, + "step": 884 + }, + { + "epoch": 0.5680359435173299, + "grad_norm": 0.012720691040158272, + "learning_rate": 4.1465509460744963e-05, + "loss": 0.3555, + "step": 885 + }, + { + "epoch": 0.5686777920410783, + "grad_norm": 0.015442232601344585, + "learning_rate": 4.1363096174604654e-05, + "loss": 0.4023, + "step": 886 + }, + { + "epoch": 0.5693196405648266, + "grad_norm": 0.013440394774079323, + "learning_rate": 4.126072022454655e-05, + "loss": 0.3789, + "step": 887 + }, + { + "epoch": 0.5699614890885751, + "grad_norm": 0.014425133354961872, + "learning_rate": 4.115838205312701e-05, + "loss": 0.4102, + "step": 888 + }, + { + "epoch": 0.5706033376123235, + "grad_norm": 0.012587243691086769, + "learning_rate": 4.105608210273909e-05, + "loss": 0.4004, + "step": 889 + }, + { + "epoch": 0.5712451861360719, + "grad_norm": 0.01208252739161253, + "learning_rate": 4.0953820815610636e-05, + "loss": 0.375, + "step": 890 + }, + { + "epoch": 0.5718870346598203, + "grad_norm": 0.012063434347510338, + "learning_rate": 4.085159863380239e-05, + "loss": 0.3789, + "step": 891 + }, + { + "epoch": 0.5725288831835686, + "grad_norm": 0.012063032016158104, + "learning_rate": 4.0749415999205995e-05, + "loss": 0.4141, + "step": 892 + }, + { + "epoch": 0.573170731707317, + "grad_norm": 0.012117046862840652, + "learning_rate": 4.064727335354215e-05, + "loss": 0.4121, + "step": 893 + }, + { + "epoch": 0.5738125802310655, + "grad_norm": 0.014085904695093632, + "learning_rate": 4.054517113835869e-05, + "loss": 0.3594, + "step": 894 + }, + { + "epoch": 0.5744544287548139, + "grad_norm": 0.014206532388925552, + "learning_rate": 4.0443109795028665e-05, + "loss": 0.375, + "step": 895 + }, + { + "epoch": 0.5750962772785623, + "grad_norm": 0.014762453734874725, + "learning_rate": 4.03410897647485e-05, + "loss": 0.3633, + "step": 896 + }, + { + "epoch": 0.5757381258023107, + "grad_norm": 0.014051433652639389, + "learning_rate": 4.023911148853596e-05, + "loss": 0.4141, + "step": 897 + }, + { + "epoch": 0.576379974326059, + "grad_norm": 0.013479063287377357, + "learning_rate": 4.013717540722833e-05, + "loss": 0.418, + "step": 898 + }, + { + "epoch": 0.5770218228498074, + "grad_norm": 0.016634047031402588, + "learning_rate": 4.0035281961480496e-05, + "loss": 0.4219, + "step": 899 + }, + { + "epoch": 0.5776636713735558, + "grad_norm": 0.015428063459694386, + "learning_rate": 3.993343159176307e-05, + "loss": 0.4004, + "step": 900 + }, + { + "epoch": 0.5783055198973043, + "grad_norm": 0.013261674903333187, + "learning_rate": 3.983162473836038e-05, + "loss": 0.3945, + "step": 901 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.015347424894571304, + "learning_rate": 3.972986184136873e-05, + "loss": 0.3867, + "step": 902 + }, + { + "epoch": 0.579589216944801, + "grad_norm": 0.014665666036307812, + "learning_rate": 3.962814334069434e-05, + "loss": 0.3828, + "step": 903 + }, + { + "epoch": 0.5802310654685494, + "grad_norm": 0.01409863866865635, + "learning_rate": 3.952646967605152e-05, + "loss": 0.3906, + "step": 904 + }, + { + "epoch": 0.5808729139922978, + "grad_norm": 0.012653130106627941, + "learning_rate": 3.9424841286960784e-05, + "loss": 0.4102, + "step": 905 + }, + { + "epoch": 0.5815147625160462, + "grad_norm": 0.015162872150540352, + "learning_rate": 3.9323258612746916e-05, + "loss": 0.3887, + "step": 906 + }, + { + "epoch": 0.5821566110397947, + "grad_norm": 0.012417521327733994, + "learning_rate": 3.922172209253705e-05, + "loss": 0.4121, + "step": 907 + }, + { + "epoch": 0.582798459563543, + "grad_norm": 0.012824651785194874, + "learning_rate": 3.912023216525887e-05, + "loss": 0.3906, + "step": 908 + }, + { + "epoch": 0.5834403080872914, + "grad_norm": 0.013151159510016441, + "learning_rate": 3.90187892696386e-05, + "loss": 0.4062, + "step": 909 + }, + { + "epoch": 0.5840821566110398, + "grad_norm": 0.012618138454854488, + "learning_rate": 3.8917393844199156e-05, + "loss": 0.4102, + "step": 910 + }, + { + "epoch": 0.5847240051347882, + "grad_norm": 0.013840214349329472, + "learning_rate": 3.881604632725825e-05, + "loss": 0.3809, + "step": 911 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 0.016128696501255035, + "learning_rate": 3.8714747156926466e-05, + "loss": 0.3984, + "step": 912 + }, + { + "epoch": 0.5860077021822849, + "grad_norm": 0.013604899868369102, + "learning_rate": 3.8613496771105464e-05, + "loss": 0.3867, + "step": 913 + }, + { + "epoch": 0.5866495507060334, + "grad_norm": 0.0137571319937706, + "learning_rate": 3.8512295607485965e-05, + "loss": 0.3906, + "step": 914 + }, + { + "epoch": 0.5872913992297818, + "grad_norm": 0.022923102602362633, + "learning_rate": 3.8411144103545904e-05, + "loss": 0.4082, + "step": 915 + }, + { + "epoch": 0.5879332477535302, + "grad_norm": 0.013792094774544239, + "learning_rate": 3.8310042696548565e-05, + "loss": 0.418, + "step": 916 + }, + { + "epoch": 0.5885750962772786, + "grad_norm": 0.013917778618633747, + "learning_rate": 3.8208991823540646e-05, + "loss": 0.3691, + "step": 917 + }, + { + "epoch": 0.5892169448010269, + "grad_norm": 0.012257051654160023, + "learning_rate": 3.8107991921350416e-05, + "loss": 0.3945, + "step": 918 + }, + { + "epoch": 0.5898587933247753, + "grad_norm": 0.015123442746698856, + "learning_rate": 3.8007043426585824e-05, + "loss": 0.3945, + "step": 919 + }, + { + "epoch": 0.5905006418485238, + "grad_norm": 0.011402972973883152, + "learning_rate": 3.7906146775632554e-05, + "loss": 0.4004, + "step": 920 + }, + { + "epoch": 0.5911424903722722, + "grad_norm": 0.014657670632004738, + "learning_rate": 3.780530240465217e-05, + "loss": 0.3945, + "step": 921 + }, + { + "epoch": 0.5917843388960206, + "grad_norm": 0.011596706695854664, + "learning_rate": 3.770451074958029e-05, + "loss": 0.3711, + "step": 922 + }, + { + "epoch": 0.5924261874197689, + "grad_norm": 0.012447874061763287, + "learning_rate": 3.760377224612457e-05, + "loss": 0.3945, + "step": 923 + }, + { + "epoch": 0.5930680359435173, + "grad_norm": 0.012190941721200943, + "learning_rate": 3.7503087329763e-05, + "loss": 0.3984, + "step": 924 + }, + { + "epoch": 0.5937098844672657, + "grad_norm": 0.012762784026563168, + "learning_rate": 3.740245643574184e-05, + "loss": 0.3789, + "step": 925 + }, + { + "epoch": 0.5943517329910141, + "grad_norm": 0.013606292195618153, + "learning_rate": 3.730187999907383e-05, + "loss": 0.3652, + "step": 926 + }, + { + "epoch": 0.5949935815147626, + "grad_norm": 0.013056452386081219, + "learning_rate": 3.720135845453634e-05, + "loss": 0.3828, + "step": 927 + }, + { + "epoch": 0.5956354300385109, + "grad_norm": 0.012558141723275185, + "learning_rate": 3.7100892236669415e-05, + "loss": 0.3867, + "step": 928 + }, + { + "epoch": 0.5962772785622593, + "grad_norm": 0.013830955140292645, + "learning_rate": 3.700048177977391e-05, + "loss": 0.4102, + "step": 929 + }, + { + "epoch": 0.5969191270860077, + "grad_norm": 0.013005700893700123, + "learning_rate": 3.690012751790972e-05, + "loss": 0.3789, + "step": 930 + }, + { + "epoch": 0.5975609756097561, + "grad_norm": 0.014384974725544453, + "learning_rate": 3.679982988489371e-05, + "loss": 0.3945, + "step": 931 + }, + { + "epoch": 0.5982028241335045, + "grad_norm": 0.011707819066941738, + "learning_rate": 3.6699589314298026e-05, + "loss": 0.3926, + "step": 932 + }, + { + "epoch": 0.5988446726572529, + "grad_norm": 0.015173131600022316, + "learning_rate": 3.65994062394481e-05, + "loss": 0.3945, + "step": 933 + }, + { + "epoch": 0.5994865211810013, + "grad_norm": 0.01235125120729208, + "learning_rate": 3.649928109342082e-05, + "loss": 0.3867, + "step": 934 + }, + { + "epoch": 0.6001283697047497, + "grad_norm": 0.016087165102362633, + "learning_rate": 3.639921430904268e-05, + "loss": 0.3984, + "step": 935 + }, + { + "epoch": 0.6007702182284981, + "grad_norm": 0.013020205311477184, + "learning_rate": 3.629920631888787e-05, + "loss": 0.4062, + "step": 936 + }, + { + "epoch": 0.6014120667522465, + "grad_norm": 0.01325344666838646, + "learning_rate": 3.619925755527642e-05, + "loss": 0.3926, + "step": 937 + }, + { + "epoch": 0.6020539152759948, + "grad_norm": 0.014157435856759548, + "learning_rate": 3.609936845027234e-05, + "loss": 0.3984, + "step": 938 + }, + { + "epoch": 0.6026957637997432, + "grad_norm": 0.01504555344581604, + "learning_rate": 3.5999539435681717e-05, + "loss": 0.4062, + "step": 939 + }, + { + "epoch": 0.6033376123234917, + "grad_norm": 0.012598301284015179, + "learning_rate": 3.5899770943050924e-05, + "loss": 0.3945, + "step": 940 + }, + { + "epoch": 0.6039794608472401, + "grad_norm": 0.012591421604156494, + "learning_rate": 3.5800063403664666e-05, + "loss": 0.3867, + "step": 941 + }, + { + "epoch": 0.6046213093709885, + "grad_norm": 0.013082444667816162, + "learning_rate": 3.5700417248544174e-05, + "loss": 0.3828, + "step": 942 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.013451972976326942, + "learning_rate": 3.560083290844534e-05, + "loss": 0.3945, + "step": 943 + }, + { + "epoch": 0.6059050064184852, + "grad_norm": 0.014088460244238377, + "learning_rate": 3.5501310813856766e-05, + "loss": 0.4023, + "step": 944 + }, + { + "epoch": 0.6065468549422336, + "grad_norm": 0.013572737574577332, + "learning_rate": 3.5401851394998084e-05, + "loss": 0.3809, + "step": 945 + }, + { + "epoch": 0.6071887034659821, + "grad_norm": 0.013333848677575588, + "learning_rate": 3.530245508181789e-05, + "loss": 0.3867, + "step": 946 + }, + { + "epoch": 0.6078305519897305, + "grad_norm": 0.015051721595227718, + "learning_rate": 3.5203122303992056e-05, + "loss": 0.3906, + "step": 947 + }, + { + "epoch": 0.6084724005134788, + "grad_norm": 0.01500693429261446, + "learning_rate": 3.5103853490921756e-05, + "loss": 0.3828, + "step": 948 + }, + { + "epoch": 0.6091142490372272, + "grad_norm": 0.012870650738477707, + "learning_rate": 3.5004649071731664e-05, + "loss": 0.3867, + "step": 949 + }, + { + "epoch": 0.6097560975609756, + "grad_norm": 0.012895474210381508, + "learning_rate": 3.4905509475268104e-05, + "loss": 0.3594, + "step": 950 + }, + { + "epoch": 0.610397946084724, + "grad_norm": 0.016247158870100975, + "learning_rate": 3.4806435130097134e-05, + "loss": 0.3926, + "step": 951 + }, + { + "epoch": 0.6110397946084724, + "grad_norm": 0.013967982493340969, + "learning_rate": 3.470742646450282e-05, + "loss": 0.3867, + "step": 952 + }, + { + "epoch": 0.6116816431322208, + "grad_norm": 0.01287417858839035, + "learning_rate": 3.4608483906485254e-05, + "loss": 0.3945, + "step": 953 + }, + { + "epoch": 0.6123234916559692, + "grad_norm": 0.013941469602286816, + "learning_rate": 3.450960788375872e-05, + "loss": 0.3691, + "step": 954 + }, + { + "epoch": 0.6129653401797176, + "grad_norm": 0.017454208806157112, + "learning_rate": 3.4410798823749964e-05, + "loss": 0.3867, + "step": 955 + }, + { + "epoch": 0.613607188703466, + "grad_norm": 0.013686290942132473, + "learning_rate": 3.4312057153596186e-05, + "loss": 0.3789, + "step": 956 + }, + { + "epoch": 0.6142490372272144, + "grad_norm": 0.013105151243507862, + "learning_rate": 3.42133833001433e-05, + "loss": 0.3672, + "step": 957 + }, + { + "epoch": 0.6148908857509627, + "grad_norm": 0.012791011482477188, + "learning_rate": 3.411477768994409e-05, + "loss": 0.373, + "step": 958 + }, + { + "epoch": 0.6155327342747111, + "grad_norm": 0.01345115527510643, + "learning_rate": 3.4016240749256266e-05, + "loss": 0.3555, + "step": 959 + }, + { + "epoch": 0.6161745827984596, + "grad_norm": 0.012273910455405712, + "learning_rate": 3.391777290404077e-05, + "loss": 0.4004, + "step": 960 + }, + { + "epoch": 0.616816431322208, + "grad_norm": 0.013736803084611893, + "learning_rate": 3.381937457995977e-05, + "loss": 0.4062, + "step": 961 + }, + { + "epoch": 0.6174582798459564, + "grad_norm": 0.014002666808664799, + "learning_rate": 3.372104620237495e-05, + "loss": 0.3789, + "step": 962 + }, + { + "epoch": 0.6181001283697047, + "grad_norm": 0.01376549992710352, + "learning_rate": 3.362278819634563e-05, + "loss": 0.4004, + "step": 963 + }, + { + "epoch": 0.6187419768934531, + "grad_norm": 0.014606526121497154, + "learning_rate": 3.352460098662694e-05, + "loss": 0.3926, + "step": 964 + }, + { + "epoch": 0.6193838254172015, + "grad_norm": 0.013210095465183258, + "learning_rate": 3.342648499766791e-05, + "loss": 0.3711, + "step": 965 + }, + { + "epoch": 0.62002567394095, + "grad_norm": 0.012232472188770771, + "learning_rate": 3.3328440653609735e-05, + "loss": 0.3809, + "step": 966 + }, + { + "epoch": 0.6206675224646984, + "grad_norm": 0.015114069916307926, + "learning_rate": 3.323046837828388e-05, + "loss": 0.4453, + "step": 967 + }, + { + "epoch": 0.6213093709884467, + "grad_norm": 0.014265330508351326, + "learning_rate": 3.313256859521028e-05, + "loss": 0.4141, + "step": 968 + }, + { + "epoch": 0.6219512195121951, + "grad_norm": 0.012651746161282063, + "learning_rate": 3.303474172759552e-05, + "loss": 0.3789, + "step": 969 + }, + { + "epoch": 0.6225930680359435, + "grad_norm": 0.01358629297465086, + "learning_rate": 3.293698819833093e-05, + "loss": 0.3984, + "step": 970 + }, + { + "epoch": 0.6232349165596919, + "grad_norm": 0.012160924263298512, + "learning_rate": 3.2839308429990846e-05, + "loss": 0.3555, + "step": 971 + }, + { + "epoch": 0.6238767650834403, + "grad_norm": 0.012310308404266834, + "learning_rate": 3.274170284483071e-05, + "loss": 0.3906, + "step": 972 + }, + { + "epoch": 0.6245186136071887, + "grad_norm": 0.015214286744594574, + "learning_rate": 3.264417186478535e-05, + "loss": 0.3945, + "step": 973 + }, + { + "epoch": 0.6251604621309371, + "grad_norm": 0.013628803193569183, + "learning_rate": 3.254671591146699e-05, + "loss": 0.3945, + "step": 974 + }, + { + "epoch": 0.6258023106546855, + "grad_norm": 0.012002642266452312, + "learning_rate": 3.244933540616363e-05, + "loss": 0.4121, + "step": 975 + }, + { + "epoch": 0.6264441591784339, + "grad_norm": 0.012065154500305653, + "learning_rate": 3.235203076983704e-05, + "loss": 0.3555, + "step": 976 + }, + { + "epoch": 0.6270860077021823, + "grad_norm": 0.012343095615506172, + "learning_rate": 3.2254802423121045e-05, + "loss": 0.3828, + "step": 977 + }, + { + "epoch": 0.6277278562259306, + "grad_norm": 0.012293039821088314, + "learning_rate": 3.2157650786319694e-05, + "loss": 0.3984, + "step": 978 + }, + { + "epoch": 0.6283697047496791, + "grad_norm": 0.013871513307094574, + "learning_rate": 3.206057627940539e-05, + "loss": 0.3711, + "step": 979 + }, + { + "epoch": 0.6290115532734275, + "grad_norm": 0.014145727269351482, + "learning_rate": 3.196357932201717e-05, + "loss": 0.3867, + "step": 980 + }, + { + "epoch": 0.6296534017971759, + "grad_norm": 0.015412476845085621, + "learning_rate": 3.18666603334588e-05, + "loss": 0.3691, + "step": 981 + }, + { + "epoch": 0.6302952503209243, + "grad_norm": 0.011903384700417519, + "learning_rate": 3.176981973269701e-05, + "loss": 0.3672, + "step": 982 + }, + { + "epoch": 0.6309370988446726, + "grad_norm": 0.012548285536468029, + "learning_rate": 3.167305793835967e-05, + "loss": 0.3945, + "step": 983 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.015092693269252777, + "learning_rate": 3.157637536873397e-05, + "loss": 0.4062, + "step": 984 + }, + { + "epoch": 0.6322207958921694, + "grad_norm": 0.012332094833254814, + "learning_rate": 3.147977244176461e-05, + "loss": 0.375, + "step": 985 + }, + { + "epoch": 0.6328626444159179, + "grad_norm": 0.012180821038782597, + "learning_rate": 3.138324957505207e-05, + "loss": 0.4004, + "step": 986 + }, + { + "epoch": 0.6335044929396663, + "grad_norm": 0.014315093867480755, + "learning_rate": 3.128680718585068e-05, + "loss": 0.4355, + "step": 987 + }, + { + "epoch": 0.6341463414634146, + "grad_norm": 0.012419315055012703, + "learning_rate": 3.1190445691066916e-05, + "loss": 0.375, + "step": 988 + }, + { + "epoch": 0.634788189987163, + "grad_norm": 0.012981019914150238, + "learning_rate": 3.1094165507257514e-05, + "loss": 0.3711, + "step": 989 + }, + { + "epoch": 0.6354300385109114, + "grad_norm": 0.014806430786848068, + "learning_rate": 3.099796705062773e-05, + "loss": 0.3789, + "step": 990 + }, + { + "epoch": 0.6360718870346598, + "grad_norm": 0.01672835648059845, + "learning_rate": 3.090185073702956e-05, + "loss": 0.4062, + "step": 991 + }, + { + "epoch": 0.6367137355584083, + "grad_norm": 0.013994491659104824, + "learning_rate": 3.080581698195989e-05, + "loss": 0.3867, + "step": 992 + }, + { + "epoch": 0.6373555840821566, + "grad_norm": 0.012993131764233112, + "learning_rate": 3.070986620055869e-05, + "loss": 0.375, + "step": 993 + }, + { + "epoch": 0.637997432605905, + "grad_norm": 0.016973624005913734, + "learning_rate": 3.0613998807607267e-05, + "loss": 0.3945, + "step": 994 + }, + { + "epoch": 0.6386392811296534, + "grad_norm": 0.013423938304185867, + "learning_rate": 3.051821521752647e-05, + "loss": 0.3945, + "step": 995 + }, + { + "epoch": 0.6392811296534018, + "grad_norm": 0.014576129615306854, + "learning_rate": 3.042251584437484e-05, + "loss": 0.373, + "step": 996 + }, + { + "epoch": 0.6399229781771502, + "grad_norm": 0.01368379034101963, + "learning_rate": 3.0326901101846905e-05, + "loss": 0.3984, + "step": 997 + }, + { + "epoch": 0.6405648267008985, + "grad_norm": 0.01375710591673851, + "learning_rate": 3.023137140327132e-05, + "loss": 0.375, + "step": 998 + }, + { + "epoch": 0.641206675224647, + "grad_norm": 0.013690740801393986, + "learning_rate": 3.0135927161609097e-05, + "loss": 0.3984, + "step": 999 + }, + { + "epoch": 0.6418485237483954, + "grad_norm": 0.01352259423583746, + "learning_rate": 3.0040568789451862e-05, + "loss": 0.3867, + "step": 1000 + }, + { + "epoch": 0.6424903722721438, + "grad_norm": 0.013356149196624756, + "learning_rate": 2.9945296699020027e-05, + "loss": 0.3438, + "step": 1001 + }, + { + "epoch": 0.6431322207958922, + "grad_norm": 0.013253252021968365, + "learning_rate": 2.9850111302160976e-05, + "loss": 0.4043, + "step": 1002 + }, + { + "epoch": 0.6437740693196405, + "grad_norm": 0.012618259526789188, + "learning_rate": 2.9755013010347434e-05, + "loss": 0.4023, + "step": 1003 + }, + { + "epoch": 0.6444159178433889, + "grad_norm": 0.013053328730165958, + "learning_rate": 2.9660002234675465e-05, + "loss": 0.3535, + "step": 1004 + }, + { + "epoch": 0.6450577663671374, + "grad_norm": 0.01481606811285019, + "learning_rate": 2.9565079385862903e-05, + "loss": 0.3594, + "step": 1005 + }, + { + "epoch": 0.6456996148908858, + "grad_norm": 0.012548556551337242, + "learning_rate": 2.9470244874247443e-05, + "loss": 0.3867, + "step": 1006 + }, + { + "epoch": 0.6463414634146342, + "grad_norm": 0.01402729656547308, + "learning_rate": 2.9375499109784886e-05, + "loss": 0.4023, + "step": 1007 + }, + { + "epoch": 0.6469833119383825, + "grad_norm": 0.014539182186126709, + "learning_rate": 2.928084250204749e-05, + "loss": 0.3984, + "step": 1008 + }, + { + "epoch": 0.6476251604621309, + "grad_norm": 0.015400463715195656, + "learning_rate": 2.918627546022199e-05, + "loss": 0.3828, + "step": 1009 + }, + { + "epoch": 0.6482670089858793, + "grad_norm": 0.016256695613265038, + "learning_rate": 2.9091798393107994e-05, + "loss": 0.377, + "step": 1010 + }, + { + "epoch": 0.6489088575096277, + "grad_norm": 0.012321251444518566, + "learning_rate": 2.8997411709116168e-05, + "loss": 0.3867, + "step": 1011 + }, + { + "epoch": 0.6495507060333762, + "grad_norm": 0.012926090508699417, + "learning_rate": 2.890311581626647e-05, + "loss": 0.3672, + "step": 1012 + }, + { + "epoch": 0.6501925545571245, + "grad_norm": 0.01560135930776596, + "learning_rate": 2.8808911122186293e-05, + "loss": 0.3906, + "step": 1013 + }, + { + "epoch": 0.6508344030808729, + "grad_norm": 0.01321703102439642, + "learning_rate": 2.871479803410896e-05, + "loss": 0.4219, + "step": 1014 + }, + { + "epoch": 0.6514762516046213, + "grad_norm": 0.013517164625227451, + "learning_rate": 2.8620776958871627e-05, + "loss": 0.375, + "step": 1015 + }, + { + "epoch": 0.6521181001283697, + "grad_norm": 0.013148191384971142, + "learning_rate": 2.852684830291378e-05, + "loss": 0.3633, + "step": 1016 + }, + { + "epoch": 0.6527599486521181, + "grad_norm": 0.01478677149862051, + "learning_rate": 2.84330124722754e-05, + "loss": 0.3848, + "step": 1017 + }, + { + "epoch": 0.6534017971758665, + "grad_norm": 0.013707499951124191, + "learning_rate": 2.8339269872595097e-05, + "loss": 0.3945, + "step": 1018 + }, + { + "epoch": 0.6540436456996149, + "grad_norm": 0.014768145978450775, + "learning_rate": 2.8245620909108618e-05, + "loss": 0.3906, + "step": 1019 + }, + { + "epoch": 0.6546854942233633, + "grad_norm": 0.01512923650443554, + "learning_rate": 2.8152065986646788e-05, + "loss": 0.3789, + "step": 1020 + }, + { + "epoch": 0.6553273427471117, + "grad_norm": 0.015050279907882214, + "learning_rate": 2.8058605509633974e-05, + "loss": 0.3984, + "step": 1021 + }, + { + "epoch": 0.6559691912708601, + "grad_norm": 0.012544546276330948, + "learning_rate": 2.7965239882086292e-05, + "loss": 0.3633, + "step": 1022 + }, + { + "epoch": 0.6566110397946084, + "grad_norm": 0.013999714516103268, + "learning_rate": 2.7871969507609747e-05, + "loss": 0.3887, + "step": 1023 + }, + { + "epoch": 0.6572528883183568, + "grad_norm": 0.012318492867052555, + "learning_rate": 2.7778794789398667e-05, + "loss": 0.3789, + "step": 1024 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.013553598895668983, + "learning_rate": 2.7685716130233842e-05, + "loss": 0.3828, + "step": 1025 + }, + { + "epoch": 0.6585365853658537, + "grad_norm": 0.01511886902153492, + "learning_rate": 2.759273393248081e-05, + "loss": 0.373, + "step": 1026 + }, + { + "epoch": 0.6591784338896021, + "grad_norm": 0.012835157103836536, + "learning_rate": 2.7499848598088136e-05, + "loss": 0.3633, + "step": 1027 + }, + { + "epoch": 0.6598202824133504, + "grad_norm": 0.012799869291484356, + "learning_rate": 2.7407060528585616e-05, + "loss": 0.3848, + "step": 1028 + }, + { + "epoch": 0.6604621309370988, + "grad_norm": 0.012743955478072166, + "learning_rate": 2.731437012508262e-05, + "loss": 0.3691, + "step": 1029 + }, + { + "epoch": 0.6611039794608472, + "grad_norm": 0.014323204755783081, + "learning_rate": 2.7221777788266324e-05, + "loss": 0.3828, + "step": 1030 + }, + { + "epoch": 0.6617458279845956, + "grad_norm": 0.015074282884597778, + "learning_rate": 2.712928391839996e-05, + "loss": 0.4043, + "step": 1031 + }, + { + "epoch": 0.6623876765083441, + "grad_norm": 0.01439194567501545, + "learning_rate": 2.7036888915321136e-05, + "loss": 0.4062, + "step": 1032 + }, + { + "epoch": 0.6630295250320924, + "grad_norm": 0.013209599070250988, + "learning_rate": 2.6944593178440003e-05, + "loss": 0.3828, + "step": 1033 + }, + { + "epoch": 0.6636713735558408, + "grad_norm": 0.013193206861615181, + "learning_rate": 2.685239710673766e-05, + "loss": 0.3555, + "step": 1034 + }, + { + "epoch": 0.6643132220795892, + "grad_norm": 0.012524771504104137, + "learning_rate": 2.676030109876434e-05, + "loss": 0.4062, + "step": 1035 + }, + { + "epoch": 0.6649550706033376, + "grad_norm": 0.012812083587050438, + "learning_rate": 2.666830555263774e-05, + "loss": 0.3945, + "step": 1036 + }, + { + "epoch": 0.665596919127086, + "grad_norm": 0.014166711829602718, + "learning_rate": 2.6576410866041274e-05, + "loss": 0.3789, + "step": 1037 + }, + { + "epoch": 0.6662387676508345, + "grad_norm": 0.013674836605787277, + "learning_rate": 2.6484617436222293e-05, + "loss": 0.3672, + "step": 1038 + }, + { + "epoch": 0.6668806161745828, + "grad_norm": 0.012029345147311687, + "learning_rate": 2.639292565999051e-05, + "loss": 0.3711, + "step": 1039 + }, + { + "epoch": 0.6675224646983312, + "grad_norm": 0.015155037865042686, + "learning_rate": 2.6301335933716176e-05, + "loss": 0.4102, + "step": 1040 + }, + { + "epoch": 0.6681643132220796, + "grad_norm": 0.011492251418530941, + "learning_rate": 2.620984865332834e-05, + "loss": 0.3594, + "step": 1041 + }, + { + "epoch": 0.668806161745828, + "grad_norm": 0.012895763851702213, + "learning_rate": 2.611846421431331e-05, + "loss": 0.3926, + "step": 1042 + }, + { + "epoch": 0.6694480102695763, + "grad_norm": 0.013804377056658268, + "learning_rate": 2.6027183011712708e-05, + "loss": 0.3867, + "step": 1043 + }, + { + "epoch": 0.6700898587933247, + "grad_norm": 0.015160754323005676, + "learning_rate": 2.593600544012196e-05, + "loss": 0.3906, + "step": 1044 + }, + { + "epoch": 0.6707317073170732, + "grad_norm": 0.013987107202410698, + "learning_rate": 2.5844931893688473e-05, + "loss": 0.3711, + "step": 1045 + }, + { + "epoch": 0.6713735558408216, + "grad_norm": 0.01572549343109131, + "learning_rate": 2.575396276610994e-05, + "loss": 0.3906, + "step": 1046 + }, + { + "epoch": 0.67201540436457, + "grad_norm": 0.013872017152607441, + "learning_rate": 2.5663098450632762e-05, + "loss": 0.3594, + "step": 1047 + }, + { + "epoch": 0.6726572528883183, + "grad_norm": 0.013739910908043385, + "learning_rate": 2.557233934005015e-05, + "loss": 0.3906, + "step": 1048 + }, + { + "epoch": 0.6732991014120667, + "grad_norm": 0.013132906518876553, + "learning_rate": 2.548168582670058e-05, + "loss": 0.3789, + "step": 1049 + }, + { + "epoch": 0.6739409499358151, + "grad_norm": 0.013742578215897083, + "learning_rate": 2.5391138302466062e-05, + "loss": 0.3984, + "step": 1050 + }, + { + "epoch": 0.6745827984595636, + "grad_norm": 0.013859684579074383, + "learning_rate": 2.5300697158770365e-05, + "loss": 0.3711, + "step": 1051 + }, + { + "epoch": 0.675224646983312, + "grad_norm": 0.012405592948198318, + "learning_rate": 2.5210362786577452e-05, + "loss": 0.4043, + "step": 1052 + }, + { + "epoch": 0.6758664955070603, + "grad_norm": 0.012248203158378601, + "learning_rate": 2.5120135576389715e-05, + "loss": 0.3867, + "step": 1053 + }, + { + "epoch": 0.6765083440308087, + "grad_norm": 0.01508716307580471, + "learning_rate": 2.503001591824628e-05, + "loss": 0.3789, + "step": 1054 + }, + { + "epoch": 0.6771501925545571, + "grad_norm": 0.013256301172077656, + "learning_rate": 2.4940004201721384e-05, + "loss": 0.4141, + "step": 1055 + }, + { + "epoch": 0.6777920410783055, + "grad_norm": 0.01401185430586338, + "learning_rate": 2.4850100815922577e-05, + "loss": 0.4004, + "step": 1056 + }, + { + "epoch": 0.6784338896020539, + "grad_norm": 0.012873097322881222, + "learning_rate": 2.476030614948917e-05, + "loss": 0.3809, + "step": 1057 + }, + { + "epoch": 0.6790757381258024, + "grad_norm": 0.01284098532050848, + "learning_rate": 2.4670620590590482e-05, + "loss": 0.3867, + "step": 1058 + }, + { + "epoch": 0.6797175866495507, + "grad_norm": 0.011478393338620663, + "learning_rate": 2.4581044526924175e-05, + "loss": 0.4102, + "step": 1059 + }, + { + "epoch": 0.6803594351732991, + "grad_norm": 0.0136007871478796, + "learning_rate": 2.4491578345714587e-05, + "loss": 0.3965, + "step": 1060 + }, + { + "epoch": 0.6810012836970475, + "grad_norm": 0.012089556083083153, + "learning_rate": 2.4402222433711008e-05, + "loss": 0.3809, + "step": 1061 + }, + { + "epoch": 0.6816431322207959, + "grad_norm": 0.013884073123335838, + "learning_rate": 2.4312977177186095e-05, + "loss": 0.3906, + "step": 1062 + }, + { + "epoch": 0.6822849807445442, + "grad_norm": 0.013468881137669086, + "learning_rate": 2.422384296193415e-05, + "loss": 0.4102, + "step": 1063 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 0.012956452555954456, + "learning_rate": 2.4134820173269456e-05, + "loss": 0.3652, + "step": 1064 + }, + { + "epoch": 0.6835686777920411, + "grad_norm": 0.012719823978841305, + "learning_rate": 2.4045909196024624e-05, + "loss": 0.3984, + "step": 1065 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.013100896961987019, + "learning_rate": 2.3957110414548874e-05, + "loss": 0.373, + "step": 1066 + }, + { + "epoch": 0.6848523748395379, + "grad_norm": 0.013353249989449978, + "learning_rate": 2.3868424212706476e-05, + "loss": 0.377, + "step": 1067 + }, + { + "epoch": 0.6854942233632862, + "grad_norm": 0.012005530297756195, + "learning_rate": 2.3779850973875036e-05, + "loss": 0.3789, + "step": 1068 + }, + { + "epoch": 0.6861360718870346, + "grad_norm": 0.012415427714586258, + "learning_rate": 2.369139108094375e-05, + "loss": 0.3438, + "step": 1069 + }, + { + "epoch": 0.686777920410783, + "grad_norm": 0.012416977435350418, + "learning_rate": 2.3603044916311963e-05, + "loss": 0.373, + "step": 1070 + }, + { + "epoch": 0.6874197689345315, + "grad_norm": 0.012292545288801193, + "learning_rate": 2.3514812861887327e-05, + "loss": 0.375, + "step": 1071 + }, + { + "epoch": 0.6880616174582799, + "grad_norm": 0.012595480307936668, + "learning_rate": 2.3426695299084173e-05, + "loss": 0.3828, + "step": 1072 + }, + { + "epoch": 0.6887034659820283, + "grad_norm": 0.012537018395960331, + "learning_rate": 2.3338692608821982e-05, + "loss": 0.3711, + "step": 1073 + }, + { + "epoch": 0.6893453145057766, + "grad_norm": 0.011396169662475586, + "learning_rate": 2.325080517152356e-05, + "loss": 0.3887, + "step": 1074 + }, + { + "epoch": 0.689987163029525, + "grad_norm": 0.012768317945301533, + "learning_rate": 2.3163033367113602e-05, + "loss": 0.3828, + "step": 1075 + }, + { + "epoch": 0.6906290115532734, + "grad_norm": 0.012074895203113556, + "learning_rate": 2.307537757501688e-05, + "loss": 0.3867, + "step": 1076 + }, + { + "epoch": 0.6912708600770219, + "grad_norm": 0.011272402480244637, + "learning_rate": 2.298783817415664e-05, + "loss": 0.3691, + "step": 1077 + }, + { + "epoch": 0.6919127086007703, + "grad_norm": 0.012021319009363651, + "learning_rate": 2.2900415542953035e-05, + "loss": 0.3574, + "step": 1078 + }, + { + "epoch": 0.6925545571245186, + "grad_norm": 0.013903750106692314, + "learning_rate": 2.281311005932139e-05, + "loss": 0.3887, + "step": 1079 + }, + { + "epoch": 0.693196405648267, + "grad_norm": 0.013219778425991535, + "learning_rate": 2.2725922100670644e-05, + "loss": 0.3867, + "step": 1080 + }, + { + "epoch": 0.6938382541720154, + "grad_norm": 0.013222387991845608, + "learning_rate": 2.263885204390174e-05, + "loss": 0.3633, + "step": 1081 + }, + { + "epoch": 0.6944801026957638, + "grad_norm": 0.015906382352113724, + "learning_rate": 2.255190026540585e-05, + "loss": 0.3867, + "step": 1082 + }, + { + "epoch": 0.6951219512195121, + "grad_norm": 0.013349148444831371, + "learning_rate": 2.246506714106294e-05, + "loss": 0.4141, + "step": 1083 + }, + { + "epoch": 0.6957637997432606, + "grad_norm": 0.016905279830098152, + "learning_rate": 2.2378353046239963e-05, + "loss": 0.3945, + "step": 1084 + }, + { + "epoch": 0.696405648267009, + "grad_norm": 0.011738389730453491, + "learning_rate": 2.22917583557894e-05, + "loss": 0.3887, + "step": 1085 + }, + { + "epoch": 0.6970474967907574, + "grad_norm": 0.013519138097763062, + "learning_rate": 2.220528344404752e-05, + "loss": 0.373, + "step": 1086 + }, + { + "epoch": 0.6976893453145058, + "grad_norm": 0.013963024131953716, + "learning_rate": 2.211892868483283e-05, + "loss": 0.3789, + "step": 1087 + }, + { + "epoch": 0.6983311938382541, + "grad_norm": 0.01420444156974554, + "learning_rate": 2.2032694451444424e-05, + "loss": 0.3633, + "step": 1088 + }, + { + "epoch": 0.6989730423620025, + "grad_norm": 0.012128225527703762, + "learning_rate": 2.1946581116660403e-05, + "loss": 0.3555, + "step": 1089 + }, + { + "epoch": 0.699614890885751, + "grad_norm": 0.012816647067666054, + "learning_rate": 2.186058905273618e-05, + "loss": 0.3789, + "step": 1090 + }, + { + "epoch": 0.7002567394094994, + "grad_norm": 0.013052361086010933, + "learning_rate": 2.1774718631402987e-05, + "loss": 0.3906, + "step": 1091 + }, + { + "epoch": 0.7008985879332478, + "grad_norm": 0.012910526245832443, + "learning_rate": 2.16889702238662e-05, + "loss": 0.3984, + "step": 1092 + }, + { + "epoch": 0.7015404364569962, + "grad_norm": 0.014237569645047188, + "learning_rate": 2.1603344200803743e-05, + "loss": 0.4062, + "step": 1093 + }, + { + "epoch": 0.7021822849807445, + "grad_norm": 0.012514214962720871, + "learning_rate": 2.151784093236452e-05, + "loss": 0.3711, + "step": 1094 + }, + { + "epoch": 0.7028241335044929, + "grad_norm": 0.011893239803612232, + "learning_rate": 2.1432460788166704e-05, + "loss": 0.3555, + "step": 1095 + }, + { + "epoch": 0.7034659820282413, + "grad_norm": 0.013944214209914207, + "learning_rate": 2.134720413729631e-05, + "loss": 0.3828, + "step": 1096 + }, + { + "epoch": 0.7041078305519898, + "grad_norm": 0.012904972769320011, + "learning_rate": 2.1262071348305425e-05, + "loss": 0.3828, + "step": 1097 + }, + { + "epoch": 0.7047496790757382, + "grad_norm": 0.014260875061154366, + "learning_rate": 2.1177062789210796e-05, + "loss": 0.3828, + "step": 1098 + }, + { + "epoch": 0.7053915275994865, + "grad_norm": 0.012384374625980854, + "learning_rate": 2.109217882749208e-05, + "loss": 0.3965, + "step": 1099 + }, + { + "epoch": 0.7060333761232349, + "grad_norm": 0.013365059159696102, + "learning_rate": 2.1007419830090306e-05, + "loss": 0.3594, + "step": 1100 + }, + { + "epoch": 0.7066752246469833, + "grad_norm": 0.015220331028103828, + "learning_rate": 2.0922786163406337e-05, + "loss": 0.3945, + "step": 1101 + }, + { + "epoch": 0.7073170731707317, + "grad_norm": 0.012258613482117653, + "learning_rate": 2.0838278193299237e-05, + "loss": 0.3535, + "step": 1102 + }, + { + "epoch": 0.70795892169448, + "grad_norm": 0.012427720241248608, + "learning_rate": 2.0753896285084694e-05, + "loss": 0.3848, + "step": 1103 + }, + { + "epoch": 0.7086007702182285, + "grad_norm": 0.011899521574378014, + "learning_rate": 2.0669640803533474e-05, + "loss": 0.3965, + "step": 1104 + }, + { + "epoch": 0.7092426187419769, + "grad_norm": 0.012814830057322979, + "learning_rate": 2.058551211286977e-05, + "loss": 0.3613, + "step": 1105 + }, + { + "epoch": 0.7098844672657253, + "grad_norm": 0.015035499818623066, + "learning_rate": 2.0501510576769722e-05, + "loss": 0.3535, + "step": 1106 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.016760557889938354, + "learning_rate": 2.04176365583598e-05, + "loss": 0.3984, + "step": 1107 + }, + { + "epoch": 0.711168164313222, + "grad_norm": 0.012370377779006958, + "learning_rate": 2.0333890420215164e-05, + "loss": 0.3691, + "step": 1108 + }, + { + "epoch": 0.7118100128369704, + "grad_norm": 0.013664168305695057, + "learning_rate": 2.025027252435829e-05, + "loss": 0.3984, + "step": 1109 + }, + { + "epoch": 0.7124518613607189, + "grad_norm": 0.01310388557612896, + "learning_rate": 2.0166783232257154e-05, + "loss": 0.3711, + "step": 1110 + }, + { + "epoch": 0.7130937098844673, + "grad_norm": 0.013425314798951149, + "learning_rate": 2.008342290482388e-05, + "loss": 0.3984, + "step": 1111 + }, + { + "epoch": 0.7137355584082157, + "grad_norm": 0.013783198781311512, + "learning_rate": 2.0000191902413073e-05, + "loss": 0.3945, + "step": 1112 + }, + { + "epoch": 0.714377406931964, + "grad_norm": 0.013114030472934246, + "learning_rate": 1.991709058482026e-05, + "loss": 0.3711, + "step": 1113 + }, + { + "epoch": 0.7150192554557124, + "grad_norm": 0.012237347662448883, + "learning_rate": 1.9834119311280392e-05, + "loss": 0.4043, + "step": 1114 + }, + { + "epoch": 0.7156611039794608, + "grad_norm": 0.01424240879714489, + "learning_rate": 1.9751278440466248e-05, + "loss": 0.3906, + "step": 1115 + }, + { + "epoch": 0.7163029525032092, + "grad_norm": 0.013711946085095406, + "learning_rate": 1.966856833048689e-05, + "loss": 0.3672, + "step": 1116 + }, + { + "epoch": 0.7169448010269577, + "grad_norm": 0.01330238301306963, + "learning_rate": 1.9585989338886147e-05, + "loss": 0.4102, + "step": 1117 + }, + { + "epoch": 0.7175866495507061, + "grad_norm": 0.013692202046513557, + "learning_rate": 1.950354182264098e-05, + "loss": 0.3965, + "step": 1118 + }, + { + "epoch": 0.7182284980744544, + "grad_norm": 0.013284241780638695, + "learning_rate": 1.942122613816006e-05, + "loss": 0.3613, + "step": 1119 + }, + { + "epoch": 0.7188703465982028, + "grad_norm": 0.012372693978250027, + "learning_rate": 1.9339042641282146e-05, + "loss": 0.3652, + "step": 1120 + }, + { + "epoch": 0.7195121951219512, + "grad_norm": 0.013219374231994152, + "learning_rate": 1.925699168727458e-05, + "loss": 0.3477, + "step": 1121 + }, + { + "epoch": 0.7201540436456996, + "grad_norm": 0.014225227758288383, + "learning_rate": 1.9175073630831736e-05, + "loss": 0.4023, + "step": 1122 + }, + { + "epoch": 0.7207958921694481, + "grad_norm": 0.01241610199213028, + "learning_rate": 1.9093288826073462e-05, + "loss": 0.3516, + "step": 1123 + }, + { + "epoch": 0.7214377406931964, + "grad_norm": 0.013540954329073429, + "learning_rate": 1.9011637626543617e-05, + "loss": 0.3906, + "step": 1124 + }, + { + "epoch": 0.7220795892169448, + "grad_norm": 0.015003510750830173, + "learning_rate": 1.8930120385208495e-05, + "loss": 0.3867, + "step": 1125 + }, + { + "epoch": 0.7227214377406932, + "grad_norm": 0.013304078951478004, + "learning_rate": 1.88487374544553e-05, + "loss": 0.3867, + "step": 1126 + }, + { + "epoch": 0.7233632862644416, + "grad_norm": 0.012056106701493263, + "learning_rate": 1.8767489186090654e-05, + "loss": 0.3887, + "step": 1127 + }, + { + "epoch": 0.72400513478819, + "grad_norm": 0.01176505722105503, + "learning_rate": 1.8686375931338997e-05, + "loss": 0.3945, + "step": 1128 + }, + { + "epoch": 0.7246469833119383, + "grad_norm": 0.013599003665149212, + "learning_rate": 1.8605398040841172e-05, + "loss": 0.3789, + "step": 1129 + }, + { + "epoch": 0.7252888318356868, + "grad_norm": 0.014635767787694931, + "learning_rate": 1.8524555864652865e-05, + "loss": 0.416, + "step": 1130 + }, + { + "epoch": 0.7259306803594352, + "grad_norm": 0.013412141241133213, + "learning_rate": 1.844384975224307e-05, + "loss": 0.4062, + "step": 1131 + }, + { + "epoch": 0.7265725288831836, + "grad_norm": 0.013095933943986893, + "learning_rate": 1.8363280052492617e-05, + "loss": 0.3711, + "step": 1132 + }, + { + "epoch": 0.727214377406932, + "grad_norm": 0.01511120330542326, + "learning_rate": 1.8282847113692593e-05, + "loss": 0.4023, + "step": 1133 + }, + { + "epoch": 0.7278562259306803, + "grad_norm": 0.012698481790721416, + "learning_rate": 1.820255128354294e-05, + "loss": 0.3828, + "step": 1134 + }, + { + "epoch": 0.7284980744544287, + "grad_norm": 0.013212742283940315, + "learning_rate": 1.8122392909150904e-05, + "loss": 0.375, + "step": 1135 + }, + { + "epoch": 0.7291399229781772, + "grad_norm": 0.012589886784553528, + "learning_rate": 1.8042372337029455e-05, + "loss": 0.3516, + "step": 1136 + }, + { + "epoch": 0.7297817715019256, + "grad_norm": 0.011949255131185055, + "learning_rate": 1.7962489913095987e-05, + "loss": 0.375, + "step": 1137 + }, + { + "epoch": 0.730423620025674, + "grad_norm": 0.012383786030113697, + "learning_rate": 1.7882745982670573e-05, + "loss": 0.3984, + "step": 1138 + }, + { + "epoch": 0.7310654685494223, + "grad_norm": 0.011663233861327171, + "learning_rate": 1.7803140890474674e-05, + "loss": 0.375, + "step": 1139 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.014100736938416958, + "learning_rate": 1.7723674980629572e-05, + "loss": 0.3711, + "step": 1140 + }, + { + "epoch": 0.7323491655969191, + "grad_norm": 0.012323318980634212, + "learning_rate": 1.7644348596654837e-05, + "loss": 0.3828, + "step": 1141 + }, + { + "epoch": 0.7329910141206675, + "grad_norm": 0.012874532490968704, + "learning_rate": 1.756516208146693e-05, + "loss": 0.3672, + "step": 1142 + }, + { + "epoch": 0.733632862644416, + "grad_norm": 0.013383268378674984, + "learning_rate": 1.7486115777377667e-05, + "loss": 0.3711, + "step": 1143 + }, + { + "epoch": 0.7342747111681643, + "grad_norm": 0.012120241299271584, + "learning_rate": 1.740721002609275e-05, + "loss": 0.3789, + "step": 1144 + }, + { + "epoch": 0.7349165596919127, + "grad_norm": 0.014149006456136703, + "learning_rate": 1.7328445168710323e-05, + "loss": 0.3926, + "step": 1145 + }, + { + "epoch": 0.7355584082156611, + "grad_norm": 0.012970617972314358, + "learning_rate": 1.7249821545719387e-05, + "loss": 0.4062, + "step": 1146 + }, + { + "epoch": 0.7362002567394095, + "grad_norm": 0.013903571292757988, + "learning_rate": 1.717133949699849e-05, + "loss": 0.3945, + "step": 1147 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.012635747902095318, + "learning_rate": 1.7092999361814126e-05, + "loss": 0.3867, + "step": 1148 + }, + { + "epoch": 0.7374839537869063, + "grad_norm": 0.012244436889886856, + "learning_rate": 1.7014801478819347e-05, + "loss": 0.3926, + "step": 1149 + }, + { + "epoch": 0.7381258023106547, + "grad_norm": 0.014593473635613918, + "learning_rate": 1.693674618605227e-05, + "loss": 0.4141, + "step": 1150 + }, + { + "epoch": 0.7387676508344031, + "grad_norm": 0.014035548083484173, + "learning_rate": 1.6858833820934567e-05, + "loss": 0.3848, + "step": 1151 + }, + { + "epoch": 0.7394094993581515, + "grad_norm": 0.013262953609228134, + "learning_rate": 1.6781064720270105e-05, + "loss": 0.3965, + "step": 1152 + }, + { + "epoch": 0.7400513478818999, + "grad_norm": 0.01288782898336649, + "learning_rate": 1.6703439220243423e-05, + "loss": 0.4102, + "step": 1153 + }, + { + "epoch": 0.7406931964056482, + "grad_norm": 0.011735158041119576, + "learning_rate": 1.6625957656418295e-05, + "loss": 0.373, + "step": 1154 + }, + { + "epoch": 0.7413350449293966, + "grad_norm": 0.012648727744817734, + "learning_rate": 1.6548620363736294e-05, + "loss": 0.3652, + "step": 1155 + }, + { + "epoch": 0.7419768934531451, + "grad_norm": 0.013695376925170422, + "learning_rate": 1.6471427676515288e-05, + "loss": 0.4004, + "step": 1156 + }, + { + "epoch": 0.7426187419768935, + "grad_norm": 0.015565620735287666, + "learning_rate": 1.6394379928448083e-05, + "loss": 0.3984, + "step": 1157 + }, + { + "epoch": 0.7432605905006419, + "grad_norm": 0.012965993955731392, + "learning_rate": 1.6317477452600917e-05, + "loss": 0.3809, + "step": 1158 + }, + { + "epoch": 0.7439024390243902, + "grad_norm": 0.013348723761737347, + "learning_rate": 1.624072058141202e-05, + "loss": 0.3828, + "step": 1159 + }, + { + "epoch": 0.7445442875481386, + "grad_norm": 0.013615473173558712, + "learning_rate": 1.616410964669025e-05, + "loss": 0.3516, + "step": 1160 + }, + { + "epoch": 0.745186136071887, + "grad_norm": 0.012235503643751144, + "learning_rate": 1.6087644979613514e-05, + "loss": 0.3711, + "step": 1161 + }, + { + "epoch": 0.7458279845956355, + "grad_norm": 0.01184153463691473, + "learning_rate": 1.6011326910727492e-05, + "loss": 0.3574, + "step": 1162 + }, + { + "epoch": 0.7464698331193839, + "grad_norm": 0.012317166663706303, + "learning_rate": 1.593515576994415e-05, + "loss": 0.3516, + "step": 1163 + }, + { + "epoch": 0.7471116816431322, + "grad_norm": 0.013108180835843086, + "learning_rate": 1.5859131886540214e-05, + "loss": 0.4004, + "step": 1164 + }, + { + "epoch": 0.7477535301668806, + "grad_norm": 0.013911323621869087, + "learning_rate": 1.578325558915598e-05, + "loss": 0.4062, + "step": 1165 + }, + { + "epoch": 0.748395378690629, + "grad_norm": 0.014530111104249954, + "learning_rate": 1.570752720579362e-05, + "loss": 0.3926, + "step": 1166 + }, + { + "epoch": 0.7490372272143774, + "grad_norm": 0.014330373145639896, + "learning_rate": 1.563194706381597e-05, + "loss": 0.3789, + "step": 1167 + }, + { + "epoch": 0.7496790757381258, + "grad_norm": 0.013744387775659561, + "learning_rate": 1.555651548994503e-05, + "loss": 0.3867, + "step": 1168 + }, + { + "epoch": 0.7503209242618742, + "grad_norm": 0.01154696848243475, + "learning_rate": 1.548123281026051e-05, + "loss": 0.3789, + "step": 1169 + }, + { + "epoch": 0.7509627727856226, + "grad_norm": 0.01333354040980339, + "learning_rate": 1.5406099350198544e-05, + "loss": 0.3789, + "step": 1170 + }, + { + "epoch": 0.751604621309371, + "grad_norm": 0.01610654406249523, + "learning_rate": 1.533111543455017e-05, + "loss": 0.3691, + "step": 1171 + }, + { + "epoch": 0.7522464698331194, + "grad_norm": 0.012678599916398525, + "learning_rate": 1.5256281387459975e-05, + "loss": 0.3438, + "step": 1172 + }, + { + "epoch": 0.7528883183568678, + "grad_norm": 0.013093999586999416, + "learning_rate": 1.5181597532424713e-05, + "loss": 0.3633, + "step": 1173 + }, + { + "epoch": 0.7535301668806161, + "grad_norm": 0.014720738865435123, + "learning_rate": 1.5107064192291808e-05, + "loss": 0.3906, + "step": 1174 + }, + { + "epoch": 0.7541720154043645, + "grad_norm": 0.011939354240894318, + "learning_rate": 1.5032681689258105e-05, + "loss": 0.3984, + "step": 1175 + }, + { + "epoch": 0.754813863928113, + "grad_norm": 0.014355121180415154, + "learning_rate": 1.495845034486837e-05, + "loss": 0.3633, + "step": 1176 + }, + { + "epoch": 0.7554557124518614, + "grad_norm": 0.013115710578858852, + "learning_rate": 1.4884370480013931e-05, + "loss": 0.418, + "step": 1177 + }, + { + "epoch": 0.7560975609756098, + "grad_norm": 0.012193549424409866, + "learning_rate": 1.481044241493132e-05, + "loss": 0.3906, + "step": 1178 + }, + { + "epoch": 0.7567394094993581, + "grad_norm": 0.013995474204421043, + "learning_rate": 1.4736666469200793e-05, + "loss": 0.3848, + "step": 1179 + }, + { + "epoch": 0.7573812580231065, + "grad_norm": 0.01327692624181509, + "learning_rate": 1.4663042961745083e-05, + "loss": 0.3457, + "step": 1180 + }, + { + "epoch": 0.7580231065468549, + "grad_norm": 0.013241326436400414, + "learning_rate": 1.4589572210827934e-05, + "loss": 0.3926, + "step": 1181 + }, + { + "epoch": 0.7586649550706034, + "grad_norm": 0.014227569103240967, + "learning_rate": 1.4516254534052736e-05, + "loss": 0.3945, + "step": 1182 + }, + { + "epoch": 0.7593068035943518, + "grad_norm": 0.012828252278268337, + "learning_rate": 1.444309024836119e-05, + "loss": 0.3594, + "step": 1183 + }, + { + "epoch": 0.7599486521181001, + "grad_norm": 0.01393461786210537, + "learning_rate": 1.4370079670031854e-05, + "loss": 0.3867, + "step": 1184 + }, + { + "epoch": 0.7605905006418485, + "grad_norm": 0.013498412445187569, + "learning_rate": 1.4297223114678887e-05, + "loss": 0.4082, + "step": 1185 + }, + { + "epoch": 0.7612323491655969, + "grad_norm": 0.013050911016762257, + "learning_rate": 1.4224520897250599e-05, + "loss": 0.3711, + "step": 1186 + }, + { + "epoch": 0.7618741976893453, + "grad_norm": 0.012300627306103706, + "learning_rate": 1.4151973332028134e-05, + "loss": 0.3379, + "step": 1187 + }, + { + "epoch": 0.7625160462130937, + "grad_norm": 0.011986950412392616, + "learning_rate": 1.4079580732624104e-05, + "loss": 0.3555, + "step": 1188 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.013296863064169884, + "learning_rate": 1.4007343411981189e-05, + "loss": 0.3438, + "step": 1189 + }, + { + "epoch": 0.7637997432605905, + "grad_norm": 0.014532539993524551, + "learning_rate": 1.3935261682370849e-05, + "loss": 0.3672, + "step": 1190 + }, + { + "epoch": 0.7644415917843389, + "grad_norm": 0.012205745093524456, + "learning_rate": 1.3863335855391967e-05, + "loss": 0.3711, + "step": 1191 + }, + { + "epoch": 0.7650834403080873, + "grad_norm": 0.012984980829060078, + "learning_rate": 1.379156624196941e-05, + "loss": 0.3984, + "step": 1192 + }, + { + "epoch": 0.7657252888318357, + "grad_norm": 0.012554211542010307, + "learning_rate": 1.371995315235287e-05, + "loss": 0.3477, + "step": 1193 + }, + { + "epoch": 0.766367137355584, + "grad_norm": 0.013144235126674175, + "learning_rate": 1.3648496896115292e-05, + "loss": 0.3535, + "step": 1194 + }, + { + "epoch": 0.7670089858793325, + "grad_norm": 0.012609812431037426, + "learning_rate": 1.3577197782151724e-05, + "loss": 0.3633, + "step": 1195 + }, + { + "epoch": 0.7676508344030809, + "grad_norm": 0.012705634348094463, + "learning_rate": 1.35060561186779e-05, + "loss": 0.3984, + "step": 1196 + }, + { + "epoch": 0.7682926829268293, + "grad_norm": 0.013995714485645294, + "learning_rate": 1.3435072213228889e-05, + "loss": 0.3711, + "step": 1197 + }, + { + "epoch": 0.7689345314505777, + "grad_norm": 0.0138862868770957, + "learning_rate": 1.3364246372657824e-05, + "loss": 0.3984, + "step": 1198 + }, + { + "epoch": 0.769576379974326, + "grad_norm": 0.013842416927218437, + "learning_rate": 1.3293578903134546e-05, + "loss": 0.3984, + "step": 1199 + }, + { + "epoch": 0.7702182284980744, + "grad_norm": 0.013586601242423058, + "learning_rate": 1.3223070110144265e-05, + "loss": 0.3906, + "step": 1200 + }, + { + "epoch": 0.7708600770218228, + "grad_norm": 0.014347325079143047, + "learning_rate": 1.3152720298486276e-05, + "loss": 0.3984, + "step": 1201 + }, + { + "epoch": 0.7715019255455713, + "grad_norm": 0.012852374464273453, + "learning_rate": 1.308252977227259e-05, + "loss": 0.3945, + "step": 1202 + }, + { + "epoch": 0.7721437740693197, + "grad_norm": 0.012284908443689346, + "learning_rate": 1.3012498834926662e-05, + "loss": 0.3672, + "step": 1203 + }, + { + "epoch": 0.772785622593068, + "grad_norm": 0.013469730503857136, + "learning_rate": 1.2942627789182121e-05, + "loss": 0.3633, + "step": 1204 + }, + { + "epoch": 0.7734274711168164, + "grad_norm": 0.014604169875383377, + "learning_rate": 1.2872916937081308e-05, + "loss": 0.3945, + "step": 1205 + }, + { + "epoch": 0.7740693196405648, + "grad_norm": 0.012190227396786213, + "learning_rate": 1.280336657997417e-05, + "loss": 0.3984, + "step": 1206 + }, + { + "epoch": 0.7747111681643132, + "grad_norm": 0.015167908743023872, + "learning_rate": 1.2733977018516758e-05, + "loss": 0.4238, + "step": 1207 + }, + { + "epoch": 0.7753530166880617, + "grad_norm": 0.01425865013152361, + "learning_rate": 1.2664748552670113e-05, + "loss": 0.3672, + "step": 1208 + }, + { + "epoch": 0.77599486521181, + "grad_norm": 0.01407993957400322, + "learning_rate": 1.259568148169884e-05, + "loss": 0.3945, + "step": 1209 + }, + { + "epoch": 0.7766367137355584, + "grad_norm": 0.013557854108512402, + "learning_rate": 1.2526776104169868e-05, + "loss": 0.3809, + "step": 1210 + }, + { + "epoch": 0.7772785622593068, + "grad_norm": 0.013125860132277012, + "learning_rate": 1.2458032717951163e-05, + "loss": 0.3594, + "step": 1211 + }, + { + "epoch": 0.7779204107830552, + "grad_norm": 0.012713338248431683, + "learning_rate": 1.238945162021038e-05, + "loss": 0.3906, + "step": 1212 + }, + { + "epoch": 0.7785622593068036, + "grad_norm": 0.013876782730221748, + "learning_rate": 1.2321033107413666e-05, + "loss": 0.3789, + "step": 1213 + }, + { + "epoch": 0.7792041078305519, + "grad_norm": 0.013413132168352604, + "learning_rate": 1.2252777475324334e-05, + "loss": 0.3633, + "step": 1214 + }, + { + "epoch": 0.7798459563543004, + "grad_norm": 0.012726066634058952, + "learning_rate": 1.2184685019001574e-05, + "loss": 0.3887, + "step": 1215 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 0.013412890955805779, + "learning_rate": 1.2116756032799193e-05, + "loss": 0.3945, + "step": 1216 + }, + { + "epoch": 0.7811296534017972, + "grad_norm": 0.012434509582817554, + "learning_rate": 1.2048990810364363e-05, + "loss": 0.3906, + "step": 1217 + }, + { + "epoch": 0.7817715019255456, + "grad_norm": 0.014923973940312862, + "learning_rate": 1.1981389644636276e-05, + "loss": 0.4062, + "step": 1218 + }, + { + "epoch": 0.7824133504492939, + "grad_norm": 0.012189259752631187, + "learning_rate": 1.1913952827844993e-05, + "loss": 0.3691, + "step": 1219 + }, + { + "epoch": 0.7830551989730423, + "grad_norm": 0.012858893722295761, + "learning_rate": 1.184668065151005e-05, + "loss": 0.3711, + "step": 1220 + }, + { + "epoch": 0.7836970474967908, + "grad_norm": 0.013458888977766037, + "learning_rate": 1.1779573406439343e-05, + "loss": 0.3555, + "step": 1221 + }, + { + "epoch": 0.7843388960205392, + "grad_norm": 0.014321232214570045, + "learning_rate": 1.1712631382727763e-05, + "loss": 0.3906, + "step": 1222 + }, + { + "epoch": 0.7849807445442876, + "grad_norm": 0.013449189253151417, + "learning_rate": 1.1645854869755928e-05, + "loss": 0.3711, + "step": 1223 + }, + { + "epoch": 0.785622593068036, + "grad_norm": 0.013625388965010643, + "learning_rate": 1.1579244156189057e-05, + "loss": 0.4023, + "step": 1224 + }, + { + "epoch": 0.7862644415917843, + "grad_norm": 0.012530328705906868, + "learning_rate": 1.151279952997556e-05, + "loss": 0.3711, + "step": 1225 + }, + { + "epoch": 0.7869062901155327, + "grad_norm": 0.013215388171374798, + "learning_rate": 1.1446521278345928e-05, + "loss": 0.375, + "step": 1226 + }, + { + "epoch": 0.7875481386392811, + "grad_norm": 0.011637991294264793, + "learning_rate": 1.1380409687811461e-05, + "loss": 0.3809, + "step": 1227 + }, + { + "epoch": 0.7881899871630296, + "grad_norm": 0.014323128387331963, + "learning_rate": 1.1314465044162932e-05, + "loss": 0.4062, + "step": 1228 + }, + { + "epoch": 0.788831835686778, + "grad_norm": 0.011796939186751842, + "learning_rate": 1.1248687632469485e-05, + "loss": 0.3672, + "step": 1229 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.01327999122440815, + "learning_rate": 1.1183077737077336e-05, + "loss": 0.373, + "step": 1230 + }, + { + "epoch": 0.7901155327342747, + "grad_norm": 0.012252957560122013, + "learning_rate": 1.1117635641608509e-05, + "loss": 0.3945, + "step": 1231 + }, + { + "epoch": 0.7907573812580231, + "grad_norm": 0.011731931939721107, + "learning_rate": 1.1052361628959745e-05, + "loss": 0.3633, + "step": 1232 + }, + { + "epoch": 0.7913992297817715, + "grad_norm": 0.013072323985397816, + "learning_rate": 1.098725598130108e-05, + "loss": 0.3555, + "step": 1233 + }, + { + "epoch": 0.79204107830552, + "grad_norm": 0.01314816065132618, + "learning_rate": 1.0922318980074808e-05, + "loss": 0.3594, + "step": 1234 + }, + { + "epoch": 0.7926829268292683, + "grad_norm": 0.012540193274617195, + "learning_rate": 1.0857550905994175e-05, + "loss": 0.3809, + "step": 1235 + }, + { + "epoch": 0.7933247753530167, + "grad_norm": 0.012471678666770458, + "learning_rate": 1.079295203904213e-05, + "loss": 0.3594, + "step": 1236 + }, + { + "epoch": 0.7939666238767651, + "grad_norm": 0.012217460200190544, + "learning_rate": 1.0728522658470231e-05, + "loss": 0.3789, + "step": 1237 + }, + { + "epoch": 0.7946084724005135, + "grad_norm": 0.0153912752866745, + "learning_rate": 1.0664263042797335e-05, + "loss": 0.3594, + "step": 1238 + }, + { + "epoch": 0.7952503209242618, + "grad_norm": 0.013098095543682575, + "learning_rate": 1.0600173469808444e-05, + "loss": 0.3516, + "step": 1239 + }, + { + "epoch": 0.7958921694480102, + "grad_norm": 0.0131600983440876, + "learning_rate": 1.0536254216553487e-05, + "loss": 0.3828, + "step": 1240 + }, + { + "epoch": 0.7965340179717587, + "grad_norm": 0.01303099375218153, + "learning_rate": 1.0472505559346102e-05, + "loss": 0.377, + "step": 1241 + }, + { + "epoch": 0.7971758664955071, + "grad_norm": 0.013537733815610409, + "learning_rate": 1.04089277737625e-05, + "loss": 0.3828, + "step": 1242 + }, + { + "epoch": 0.7978177150192555, + "grad_norm": 0.016070852056145668, + "learning_rate": 1.0345521134640224e-05, + "loss": 0.373, + "step": 1243 + }, + { + "epoch": 0.7984595635430038, + "grad_norm": 0.012657620012760162, + "learning_rate": 1.0282285916076973e-05, + "loss": 0.3633, + "step": 1244 + }, + { + "epoch": 0.7991014120667522, + "grad_norm": 0.013497685082256794, + "learning_rate": 1.021922239142944e-05, + "loss": 0.3945, + "step": 1245 + }, + { + "epoch": 0.7997432605905006, + "grad_norm": 0.014096570201218128, + "learning_rate": 1.015633083331206e-05, + "loss": 0.3809, + "step": 1246 + }, + { + "epoch": 0.800385109114249, + "grad_norm": 0.0128650376573205, + "learning_rate": 1.009361151359593e-05, + "loss": 0.3848, + "step": 1247 + }, + { + "epoch": 0.8010269576379975, + "grad_norm": 0.012854866683483124, + "learning_rate": 1.0031064703407572e-05, + "loss": 0.3555, + "step": 1248 + }, + { + "epoch": 0.8016688061617459, + "grad_norm": 0.012545584701001644, + "learning_rate": 9.968690673127774e-06, + "loss": 0.3594, + "step": 1249 + }, + { + "epoch": 0.8023106546854942, + "grad_norm": 0.013711373321712017, + "learning_rate": 9.906489692390426e-06, + "loss": 0.3945, + "step": 1250 + }, + { + "epoch": 0.8029525032092426, + "grad_norm": 0.012173696421086788, + "learning_rate": 9.84446203008132e-06, + "loss": 0.3477, + "step": 1251 + }, + { + "epoch": 0.803594351732991, + "grad_norm": 0.013947328552603722, + "learning_rate": 9.782607954337059e-06, + "loss": 0.3652, + "step": 1252 + }, + { + "epoch": 0.8042362002567394, + "grad_norm": 0.012948707677423954, + "learning_rate": 9.720927732543845e-06, + "loss": 0.3984, + "step": 1253 + }, + { + "epoch": 0.8048780487804879, + "grad_norm": 0.013634390197694302, + "learning_rate": 9.659421631336295e-06, + "loss": 0.3535, + "step": 1254 + }, + { + "epoch": 0.8055198973042362, + "grad_norm": 0.012434390373528004, + "learning_rate": 9.59808991659641e-06, + "loss": 0.3555, + "step": 1255 + }, + { + "epoch": 0.8061617458279846, + "grad_norm": 0.013895786367356777, + "learning_rate": 9.536932853452252e-06, + "loss": 0.375, + "step": 1256 + }, + { + "epoch": 0.806803594351733, + "grad_norm": 0.014669674448668957, + "learning_rate": 9.475950706276948e-06, + "loss": 0.3984, + "step": 1257 + }, + { + "epoch": 0.8074454428754814, + "grad_norm": 0.013176476582884789, + "learning_rate": 9.415143738687493e-06, + "loss": 0.3281, + "step": 1258 + }, + { + "epoch": 0.8080872913992297, + "grad_norm": 0.014073810540139675, + "learning_rate": 9.354512213543538e-06, + "loss": 0.3848, + "step": 1259 + }, + { + "epoch": 0.8087291399229781, + "grad_norm": 0.013018610887229443, + "learning_rate": 9.294056392946427e-06, + "loss": 0.3906, + "step": 1260 + }, + { + "epoch": 0.8093709884467266, + "grad_norm": 0.013114540837705135, + "learning_rate": 9.233776538237854e-06, + "loss": 0.373, + "step": 1261 + }, + { + "epoch": 0.810012836970475, + "grad_norm": 0.013796181418001652, + "learning_rate": 9.17367290999891e-06, + "loss": 0.373, + "step": 1262 + }, + { + "epoch": 0.8106546854942234, + "grad_norm": 0.01330437883734703, + "learning_rate": 9.113745768048865e-06, + "loss": 0.3574, + "step": 1263 + }, + { + "epoch": 0.8112965340179717, + "grad_norm": 0.01230168342590332, + "learning_rate": 9.053995371444035e-06, + "loss": 0.3809, + "step": 1264 + }, + { + "epoch": 0.8119383825417201, + "grad_norm": 0.01638161949813366, + "learning_rate": 8.994421978476735e-06, + "loss": 0.3574, + "step": 1265 + }, + { + "epoch": 0.8125802310654685, + "grad_norm": 0.014375623315572739, + "learning_rate": 8.935025846674088e-06, + "loss": 0.4043, + "step": 1266 + }, + { + "epoch": 0.813222079589217, + "grad_norm": 0.012246549129486084, + "learning_rate": 8.875807232796968e-06, + "loss": 0.3711, + "step": 1267 + }, + { + "epoch": 0.8138639281129654, + "grad_norm": 0.01342903170734644, + "learning_rate": 8.816766392838855e-06, + "loss": 0.3711, + "step": 1268 + }, + { + "epoch": 0.8145057766367138, + "grad_norm": 0.012210494838654995, + "learning_rate": 8.757903582024706e-06, + "loss": 0.3672, + "step": 1269 + }, + { + "epoch": 0.8151476251604621, + "grad_norm": 0.012337923049926758, + "learning_rate": 8.699219054809937e-06, + "loss": 0.3438, + "step": 1270 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.01411160733550787, + "learning_rate": 8.640713064879236e-06, + "loss": 0.4023, + "step": 1271 + }, + { + "epoch": 0.8164313222079589, + "grad_norm": 0.013467960990965366, + "learning_rate": 8.582385865145508e-06, + "loss": 0.3867, + "step": 1272 + }, + { + "epoch": 0.8170731707317073, + "grad_norm": 0.012686812318861485, + "learning_rate": 8.5242377077488e-06, + "loss": 0.375, + "step": 1273 + }, + { + "epoch": 0.8177150192554558, + "grad_norm": 0.01201626192778349, + "learning_rate": 8.46626884405512e-06, + "loss": 0.3809, + "step": 1274 + }, + { + "epoch": 0.8183568677792041, + "grad_norm": 0.012600788846611977, + "learning_rate": 8.408479524655477e-06, + "loss": 0.3828, + "step": 1275 + }, + { + "epoch": 0.8189987163029525, + "grad_norm": 0.012168734334409237, + "learning_rate": 8.350869999364713e-06, + "loss": 0.373, + "step": 1276 + }, + { + "epoch": 0.8196405648267009, + "grad_norm": 0.013481889851391315, + "learning_rate": 8.293440517220446e-06, + "loss": 0.3613, + "step": 1277 + }, + { + "epoch": 0.8202824133504493, + "grad_norm": 0.0141798360273242, + "learning_rate": 8.236191326482007e-06, + "loss": 0.3906, + "step": 1278 + }, + { + "epoch": 0.8209242618741976, + "grad_norm": 0.012813771143555641, + "learning_rate": 8.179122674629324e-06, + "loss": 0.375, + "step": 1279 + }, + { + "epoch": 0.8215661103979461, + "grad_norm": 0.014093801379203796, + "learning_rate": 8.122234808361907e-06, + "loss": 0.3945, + "step": 1280 + }, + { + "epoch": 0.8222079589216945, + "grad_norm": 0.013460157439112663, + "learning_rate": 8.065527973597742e-06, + "loss": 0.4004, + "step": 1281 + }, + { + "epoch": 0.8228498074454429, + "grad_norm": 0.01379771064966917, + "learning_rate": 8.00900241547226e-06, + "loss": 0.3867, + "step": 1282 + }, + { + "epoch": 0.8234916559691913, + "grad_norm": 0.013519086875021458, + "learning_rate": 7.952658378337252e-06, + "loss": 0.4043, + "step": 1283 + }, + { + "epoch": 0.8241335044929397, + "grad_norm": 0.012356378138065338, + "learning_rate": 7.896496105759799e-06, + "loss": 0.3906, + "step": 1284 + }, + { + "epoch": 0.824775353016688, + "grad_norm": 0.013102208264172077, + "learning_rate": 7.840515840521263e-06, + "loss": 0.3418, + "step": 1285 + }, + { + "epoch": 0.8254172015404364, + "grad_norm": 0.012744414620101452, + "learning_rate": 7.784717824616222e-06, + "loss": 0.3711, + "step": 1286 + }, + { + "epoch": 0.8260590500641849, + "grad_norm": 0.01288849301636219, + "learning_rate": 7.729102299251367e-06, + "loss": 0.3906, + "step": 1287 + }, + { + "epoch": 0.8267008985879333, + "grad_norm": 0.012660497799515724, + "learning_rate": 7.673669504844599e-06, + "loss": 0.3906, + "step": 1288 + }, + { + "epoch": 0.8273427471116817, + "grad_norm": 0.011713840067386627, + "learning_rate": 7.618419681023809e-06, + "loss": 0.3594, + "step": 1289 + }, + { + "epoch": 0.82798459563543, + "grad_norm": 0.013892716728150845, + "learning_rate": 7.563353066625972e-06, + "loss": 0.3633, + "step": 1290 + }, + { + "epoch": 0.8286264441591784, + "grad_norm": 0.01375551987439394, + "learning_rate": 7.508469899696096e-06, + "loss": 0.3789, + "step": 1291 + }, + { + "epoch": 0.8292682926829268, + "grad_norm": 0.013938702642917633, + "learning_rate": 7.453770417486123e-06, + "loss": 0.3652, + "step": 1292 + }, + { + "epoch": 0.8299101412066753, + "grad_norm": 0.012904335744678974, + "learning_rate": 7.399254856453985e-06, + "loss": 0.3848, + "step": 1293 + }, + { + "epoch": 0.8305519897304237, + "grad_norm": 0.01319153606891632, + "learning_rate": 7.344923452262548e-06, + "loss": 0.3789, + "step": 1294 + }, + { + "epoch": 0.831193838254172, + "grad_norm": 0.0127947349101305, + "learning_rate": 7.2907764397785845e-06, + "loss": 0.3672, + "step": 1295 + }, + { + "epoch": 0.8318356867779204, + "grad_norm": 0.014471789821982384, + "learning_rate": 7.2368140530717945e-06, + "loss": 0.3828, + "step": 1296 + }, + { + "epoch": 0.8324775353016688, + "grad_norm": 0.011869586072862148, + "learning_rate": 7.183036525413716e-06, + "loss": 0.375, + "step": 1297 + }, + { + "epoch": 0.8331193838254172, + "grad_norm": 0.012698578648269176, + "learning_rate": 7.1294440892768155e-06, + "loss": 0.3457, + "step": 1298 + }, + { + "epoch": 0.8337612323491655, + "grad_norm": 0.011361399665474892, + "learning_rate": 7.076036976333417e-06, + "loss": 0.3945, + "step": 1299 + }, + { + "epoch": 0.834403080872914, + "grad_norm": 0.01272041629999876, + "learning_rate": 7.02281541745472e-06, + "loss": 0.3652, + "step": 1300 + }, + { + "epoch": 0.8350449293966624, + "grad_norm": 0.012732606381177902, + "learning_rate": 6.969779642709817e-06, + "loss": 0.3789, + "step": 1301 + }, + { + "epoch": 0.8356867779204108, + "grad_norm": 0.012499982491135597, + "learning_rate": 6.916929881364642e-06, + "loss": 0.3906, + "step": 1302 + }, + { + "epoch": 0.8363286264441592, + "grad_norm": 0.01187952607870102, + "learning_rate": 6.864266361881056e-06, + "loss": 0.3789, + "step": 1303 + }, + { + "epoch": 0.8369704749679076, + "grad_norm": 0.011955213733017445, + "learning_rate": 6.8117893119158176e-06, + "loss": 0.3906, + "step": 1304 + }, + { + "epoch": 0.8376123234916559, + "grad_norm": 0.011965644545853138, + "learning_rate": 6.759498958319599e-06, + "loss": 0.3555, + "step": 1305 + }, + { + "epoch": 0.8382541720154044, + "grad_norm": 0.012529222294688225, + "learning_rate": 6.707395527136018e-06, + "loss": 0.3633, + "step": 1306 + }, + { + "epoch": 0.8388960205391528, + "grad_norm": 0.012780443765223026, + "learning_rate": 6.655479243600632e-06, + "loss": 0.3789, + "step": 1307 + }, + { + "epoch": 0.8395378690629012, + "grad_norm": 0.014528384432196617, + "learning_rate": 6.603750332140019e-06, + "loss": 0.4121, + "step": 1308 + }, + { + "epoch": 0.8401797175866496, + "grad_norm": 0.012622330337762833, + "learning_rate": 6.5522090163707415e-06, + "loss": 0.375, + "step": 1309 + }, + { + "epoch": 0.8408215661103979, + "grad_norm": 0.013054592534899712, + "learning_rate": 6.500855519098448e-06, + "loss": 0.3809, + "step": 1310 + }, + { + "epoch": 0.8414634146341463, + "grad_norm": 0.011494606733322144, + "learning_rate": 6.44969006231686e-06, + "loss": 0.3398, + "step": 1311 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.011829941533505917, + "learning_rate": 6.398712867206797e-06, + "loss": 0.3652, + "step": 1312 + }, + { + "epoch": 0.8427471116816432, + "grad_norm": 0.012617502361536026, + "learning_rate": 6.347924154135298e-06, + "loss": 0.3828, + "step": 1313 + }, + { + "epoch": 0.8433889602053916, + "grad_norm": 0.013350032269954681, + "learning_rate": 6.2973241426546095e-06, + "loss": 0.3594, + "step": 1314 + }, + { + "epoch": 0.8440308087291399, + "grad_norm": 0.012515634298324585, + "learning_rate": 6.246913051501202e-06, + "loss": 0.3828, + "step": 1315 + }, + { + "epoch": 0.8446726572528883, + "grad_norm": 0.012150679714977741, + "learning_rate": 6.196691098594953e-06, + "loss": 0.3516, + "step": 1316 + }, + { + "epoch": 0.8453145057766367, + "grad_norm": 0.012245026417076588, + "learning_rate": 6.146658501038055e-06, + "loss": 0.3945, + "step": 1317 + }, + { + "epoch": 0.8459563543003851, + "grad_norm": 0.012508809566497803, + "learning_rate": 6.09681547511417e-06, + "loss": 0.3848, + "step": 1318 + }, + { + "epoch": 0.8465982028241335, + "grad_norm": 0.014197138138115406, + "learning_rate": 6.047162236287485e-06, + "loss": 0.3809, + "step": 1319 + }, + { + "epoch": 0.8472400513478819, + "grad_norm": 0.011651071719825268, + "learning_rate": 5.997698999201723e-06, + "loss": 0.3906, + "step": 1320 + }, + { + "epoch": 0.8478818998716303, + "grad_norm": 0.012558822520077229, + "learning_rate": 5.948425977679289e-06, + "loss": 0.3945, + "step": 1321 + }, + { + "epoch": 0.8485237483953787, + "grad_norm": 0.01349442545324564, + "learning_rate": 5.899343384720313e-06, + "loss": 0.3789, + "step": 1322 + }, + { + "epoch": 0.8491655969191271, + "grad_norm": 0.013269457034766674, + "learning_rate": 5.850451432501725e-06, + "loss": 0.3594, + "step": 1323 + }, + { + "epoch": 0.8498074454428755, + "grad_norm": 0.01424845214933157, + "learning_rate": 5.8017503323763375e-06, + "loss": 0.3984, + "step": 1324 + }, + { + "epoch": 0.8504492939666238, + "grad_norm": 0.012251721695065498, + "learning_rate": 5.753240294871937e-06, + "loss": 0.3633, + "step": 1325 + }, + { + "epoch": 0.8510911424903723, + "grad_norm": 0.01245789136737585, + "learning_rate": 5.704921529690377e-06, + "loss": 0.3555, + "step": 1326 + }, + { + "epoch": 0.8517329910141207, + "grad_norm": 0.012180859223008156, + "learning_rate": 5.656794245706676e-06, + "loss": 0.3848, + "step": 1327 + }, + { + "epoch": 0.8523748395378691, + "grad_norm": 0.011467906646430492, + "learning_rate": 5.608858650968096e-06, + "loss": 0.3438, + "step": 1328 + }, + { + "epoch": 0.8530166880616175, + "grad_norm": 0.01379603985697031, + "learning_rate": 5.561114952693269e-06, + "loss": 0.3672, + "step": 1329 + }, + { + "epoch": 0.8536585365853658, + "grad_norm": 0.013673442415893078, + "learning_rate": 5.513563357271256e-06, + "loss": 0.3516, + "step": 1330 + }, + { + "epoch": 0.8543003851091142, + "grad_norm": 0.012737073935568333, + "learning_rate": 5.466204070260717e-06, + "loss": 0.4004, + "step": 1331 + }, + { + "epoch": 0.8549422336328626, + "grad_norm": 0.011530360206961632, + "learning_rate": 5.4190372963889794e-06, + "loss": 0.3594, + "step": 1332 + }, + { + "epoch": 0.8555840821566111, + "grad_norm": 0.01489552017301321, + "learning_rate": 5.372063239551162e-06, + "loss": 0.3691, + "step": 1333 + }, + { + "epoch": 0.8562259306803595, + "grad_norm": 0.01377855334430933, + "learning_rate": 5.325282102809304e-06, + "loss": 0.3672, + "step": 1334 + }, + { + "epoch": 0.8568677792041078, + "grad_norm": 0.012593116611242294, + "learning_rate": 5.278694088391462e-06, + "loss": 0.4102, + "step": 1335 + }, + { + "epoch": 0.8575096277278562, + "grad_norm": 0.011791418306529522, + "learning_rate": 5.23229939769086e-06, + "loss": 0.3867, + "step": 1336 + }, + { + "epoch": 0.8581514762516046, + "grad_norm": 0.01504043210297823, + "learning_rate": 5.186098231265024e-06, + "loss": 0.3984, + "step": 1337 + }, + { + "epoch": 0.858793324775353, + "grad_norm": 0.013028615154325962, + "learning_rate": 5.140090788834895e-06, + "loss": 0.3516, + "step": 1338 + }, + { + "epoch": 0.8594351732991015, + "grad_norm": 0.01274171657860279, + "learning_rate": 5.0942772692839755e-06, + "loss": 0.377, + "step": 1339 + }, + { + "epoch": 0.8600770218228498, + "grad_norm": 0.015606660395860672, + "learning_rate": 5.048657870657447e-06, + "loss": 0.4062, + "step": 1340 + }, + { + "epoch": 0.8607188703465982, + "grad_norm": 0.01197892427444458, + "learning_rate": 5.003232790161366e-06, + "loss": 0.3496, + "step": 1341 + }, + { + "epoch": 0.8613607188703466, + "grad_norm": 0.013816019520163536, + "learning_rate": 4.9580022241617674e-06, + "loss": 0.377, + "step": 1342 + }, + { + "epoch": 0.862002567394095, + "grad_norm": 0.012484337203204632, + "learning_rate": 4.912966368183797e-06, + "loss": 0.3516, + "step": 1343 + }, + { + "epoch": 0.8626444159178434, + "grad_norm": 0.012311607599258423, + "learning_rate": 4.868125416910957e-06, + "loss": 0.375, + "step": 1344 + }, + { + "epoch": 0.8632862644415917, + "grad_norm": 0.013449694029986858, + "learning_rate": 4.82347956418418e-06, + "loss": 0.418, + "step": 1345 + }, + { + "epoch": 0.8639281129653402, + "grad_norm": 0.01339624635875225, + "learning_rate": 4.779029003000979e-06, + "loss": 0.3477, + "step": 1346 + }, + { + "epoch": 0.8645699614890886, + "grad_norm": 0.012256176210939884, + "learning_rate": 4.734773925514712e-06, + "loss": 0.3789, + "step": 1347 + }, + { + "epoch": 0.865211810012837, + "grad_norm": 0.012527239508926868, + "learning_rate": 4.690714523033635e-06, + "loss": 0.3613, + "step": 1348 + }, + { + "epoch": 0.8658536585365854, + "grad_norm": 0.011592155322432518, + "learning_rate": 4.646850986020151e-06, + "loss": 0.3945, + "step": 1349 + }, + { + "epoch": 0.8664955070603337, + "grad_norm": 0.012507716193795204, + "learning_rate": 4.603183504089997e-06, + "loss": 0.3828, + "step": 1350 + }, + { + "epoch": 0.8671373555840821, + "grad_norm": 0.012843828648328781, + "learning_rate": 4.559712266011329e-06, + "loss": 0.3984, + "step": 1351 + }, + { + "epoch": 0.8677792041078306, + "grad_norm": 0.0132277337834239, + "learning_rate": 4.516437459704032e-06, + "loss": 0.3887, + "step": 1352 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.01294110156595707, + "learning_rate": 4.473359272238786e-06, + "loss": 0.3828, + "step": 1353 + }, + { + "epoch": 0.8690629011553274, + "grad_norm": 0.012413974851369858, + "learning_rate": 4.430477889836348e-06, + "loss": 0.3594, + "step": 1354 + }, + { + "epoch": 0.8697047496790757, + "grad_norm": 0.013880117796361446, + "learning_rate": 4.387793497866744e-06, + "loss": 0.377, + "step": 1355 + }, + { + "epoch": 0.8703465982028241, + "grad_norm": 0.013049155473709106, + "learning_rate": 4.345306280848377e-06, + "loss": 0.3906, + "step": 1356 + }, + { + "epoch": 0.8709884467265725, + "grad_norm": 0.013583934865891933, + "learning_rate": 4.3030164224473365e-06, + "loss": 0.3496, + "step": 1357 + }, + { + "epoch": 0.8716302952503209, + "grad_norm": 0.0125590730458498, + "learning_rate": 4.260924105476544e-06, + "loss": 0.4023, + "step": 1358 + }, + { + "epoch": 0.8722721437740694, + "grad_norm": 0.014681275002658367, + "learning_rate": 4.219029511894973e-06, + "loss": 0.3789, + "step": 1359 + }, + { + "epoch": 0.8729139922978177, + "grad_norm": 0.015730444341897964, + "learning_rate": 4.177332822806873e-06, + "loss": 0.3633, + "step": 1360 + }, + { + "epoch": 0.8735558408215661, + "grad_norm": 0.0119576221331954, + "learning_rate": 4.135834218460982e-06, + "loss": 0.3672, + "step": 1361 + }, + { + "epoch": 0.8741976893453145, + "grad_norm": 0.013510183431208134, + "learning_rate": 4.094533878249751e-06, + "loss": 0.3555, + "step": 1362 + }, + { + "epoch": 0.8748395378690629, + "grad_norm": 0.014038784429430962, + "learning_rate": 4.053431980708566e-06, + "loss": 0.3711, + "step": 1363 + }, + { + "epoch": 0.8754813863928113, + "grad_norm": 0.013317780569195747, + "learning_rate": 4.012528703514951e-06, + "loss": 0.3574, + "step": 1364 + }, + { + "epoch": 0.8761232349165597, + "grad_norm": 0.012826112098991871, + "learning_rate": 3.971824223487841e-06, + "loss": 0.3711, + "step": 1365 + }, + { + "epoch": 0.8767650834403081, + "grad_norm": 0.012482481077313423, + "learning_rate": 3.931318716586807e-06, + "loss": 0.373, + "step": 1366 + }, + { + "epoch": 0.8774069319640565, + "grad_norm": 0.01242758333683014, + "learning_rate": 3.891012357911272e-06, + "loss": 0.3906, + "step": 1367 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 0.011938832700252533, + "learning_rate": 3.850905321699788e-06, + "loss": 0.3535, + "step": 1368 + }, + { + "epoch": 0.8786906290115533, + "grad_norm": 0.015014137141406536, + "learning_rate": 3.810997781329234e-06, + "loss": 0.3945, + "step": 1369 + }, + { + "epoch": 0.8793324775353016, + "grad_norm": 0.01675754226744175, + "learning_rate": 3.7712899093141407e-06, + "loss": 0.3652, + "step": 1370 + }, + { + "epoch": 0.87997432605905, + "grad_norm": 0.013484257273375988, + "learning_rate": 3.7317818773058456e-06, + "loss": 0.377, + "step": 1371 + }, + { + "epoch": 0.8806161745827985, + "grad_norm": 0.012842392548918724, + "learning_rate": 3.692473856091866e-06, + "loss": 0.3789, + "step": 1372 + }, + { + "epoch": 0.8812580231065469, + "grad_norm": 0.012742377817630768, + "learning_rate": 3.6533660155950776e-06, + "loss": 0.3672, + "step": 1373 + }, + { + "epoch": 0.8818998716302953, + "grad_norm": 0.012952295131981373, + "learning_rate": 3.6144585248729824e-06, + "loss": 0.3906, + "step": 1374 + }, + { + "epoch": 0.8825417201540436, + "grad_norm": 0.012831165455281734, + "learning_rate": 3.575751552117029e-06, + "loss": 0.3848, + "step": 1375 + }, + { + "epoch": 0.883183568677792, + "grad_norm": 0.013856924138963223, + "learning_rate": 3.537245264651845e-06, + "loss": 0.3965, + "step": 1376 + }, + { + "epoch": 0.8838254172015404, + "grad_norm": 0.014107098802924156, + "learning_rate": 3.498939828934511e-06, + "loss": 0.3828, + "step": 1377 + }, + { + "epoch": 0.8844672657252889, + "grad_norm": 0.012293580919504166, + "learning_rate": 3.4608354105538976e-06, + "loss": 0.373, + "step": 1378 + }, + { + "epoch": 0.8851091142490373, + "grad_norm": 0.012825761921703815, + "learning_rate": 3.4229321742298427e-06, + "loss": 0.3613, + "step": 1379 + }, + { + "epoch": 0.8857509627727856, + "grad_norm": 0.015361820347607136, + "learning_rate": 3.3852302838125626e-06, + "loss": 0.3984, + "step": 1380 + }, + { + "epoch": 0.886392811296534, + "grad_norm": 0.012380460277199745, + "learning_rate": 3.3477299022818544e-06, + "loss": 0.3555, + "step": 1381 + }, + { + "epoch": 0.8870346598202824, + "grad_norm": 0.012735201977193356, + "learning_rate": 3.3104311917464104e-06, + "loss": 0.3633, + "step": 1382 + }, + { + "epoch": 0.8876765083440308, + "grad_norm": 0.013652042485773563, + "learning_rate": 3.2733343134431714e-06, + "loss": 0.3867, + "step": 1383 + }, + { + "epoch": 0.8883183568677792, + "grad_norm": 0.013185295276343822, + "learning_rate": 3.236439427736543e-06, + "loss": 0.3867, + "step": 1384 + }, + { + "epoch": 0.8889602053915276, + "grad_norm": 0.012528514489531517, + "learning_rate": 3.1997466941177666e-06, + "loss": 0.3711, + "step": 1385 + }, + { + "epoch": 0.889602053915276, + "grad_norm": 0.013030996546149254, + "learning_rate": 3.163256271204218e-06, + "loss": 0.3672, + "step": 1386 + }, + { + "epoch": 0.8902439024390244, + "grad_norm": 0.011969957500696182, + "learning_rate": 3.1269683167386764e-06, + "loss": 0.3867, + "step": 1387 + }, + { + "epoch": 0.8908857509627728, + "grad_norm": 0.014140266925096512, + "learning_rate": 3.0908829875887223e-06, + "loss": 0.3809, + "step": 1388 + }, + { + "epoch": 0.8915275994865212, + "grad_norm": 0.01246250607073307, + "learning_rate": 3.0550004397459976e-06, + "loss": 0.3672, + "step": 1389 + }, + { + "epoch": 0.8921694480102695, + "grad_norm": 0.012091501615941525, + "learning_rate": 3.019320828325539e-06, + "loss": 0.3711, + "step": 1390 + }, + { + "epoch": 0.8928112965340179, + "grad_norm": 0.012726930901408195, + "learning_rate": 2.9838443075651535e-06, + "loss": 0.3945, + "step": 1391 + }, + { + "epoch": 0.8934531450577664, + "grad_norm": 0.01227556075900793, + "learning_rate": 2.9485710308246706e-06, + "loss": 0.3789, + "step": 1392 + }, + { + "epoch": 0.8940949935815148, + "grad_norm": 0.013887990266084671, + "learning_rate": 2.9135011505853572e-06, + "loss": 0.3828, + "step": 1393 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.01397820096462965, + "learning_rate": 2.8786348184492105e-06, + "loss": 0.3867, + "step": 1394 + }, + { + "epoch": 0.8953786906290115, + "grad_norm": 0.012721987441182137, + "learning_rate": 2.8439721851383383e-06, + "loss": 0.3984, + "step": 1395 + }, + { + "epoch": 0.8960205391527599, + "grad_norm": 0.012987341731786728, + "learning_rate": 2.8095134004942737e-06, + "loss": 0.373, + "step": 1396 + }, + { + "epoch": 0.8966623876765083, + "grad_norm": 0.013185744173824787, + "learning_rate": 2.7752586134773327e-06, + "loss": 0.3828, + "step": 1397 + }, + { + "epoch": 0.8973042362002568, + "grad_norm": 0.012743929401040077, + "learning_rate": 2.7412079721659988e-06, + "loss": 0.3789, + "step": 1398 + }, + { + "epoch": 0.8979460847240052, + "grad_norm": 0.014703886583447456, + "learning_rate": 2.707361623756249e-06, + "loss": 0.3711, + "step": 1399 + }, + { + "epoch": 0.8985879332477535, + "grad_norm": 0.011874165385961533, + "learning_rate": 2.6737197145609404e-06, + "loss": 0.3867, + "step": 1400 + }, + { + "epoch": 0.8992297817715019, + "grad_norm": 0.013200345449149609, + "learning_rate": 2.6402823900091744e-06, + "loss": 0.3555, + "step": 1401 + }, + { + "epoch": 0.8998716302952503, + "grad_norm": 0.013502005487680435, + "learning_rate": 2.607049794645633e-06, + "loss": 0.3516, + "step": 1402 + }, + { + "epoch": 0.9005134788189987, + "grad_norm": 0.013549595139920712, + "learning_rate": 2.574022072130017e-06, + "loss": 0.375, + "step": 1403 + }, + { + "epoch": 0.9011553273427471, + "grad_norm": 0.013294607400894165, + "learning_rate": 2.541199365236391e-06, + "loss": 0.3789, + "step": 1404 + }, + { + "epoch": 0.9017971758664955, + "grad_norm": 0.01240833755582571, + "learning_rate": 2.508581815852523e-06, + "loss": 0.3711, + "step": 1405 + }, + { + "epoch": 0.9024390243902439, + "grad_norm": 0.013817439787089825, + "learning_rate": 2.4761695649793804e-06, + "loss": 0.3828, + "step": 1406 + }, + { + "epoch": 0.9030808729139923, + "grad_norm": 0.012335661798715591, + "learning_rate": 2.4439627527303997e-06, + "loss": 0.3789, + "step": 1407 + }, + { + "epoch": 0.9037227214377407, + "grad_norm": 0.011767784133553505, + "learning_rate": 2.4119615183309628e-06, + "loss": 0.375, + "step": 1408 + }, + { + "epoch": 0.9043645699614891, + "grad_norm": 0.012927521951496601, + "learning_rate": 2.3801660001177782e-06, + "loss": 0.3535, + "step": 1409 + }, + { + "epoch": 0.9050064184852374, + "grad_norm": 0.014147186651825905, + "learning_rate": 2.3485763355382273e-06, + "loss": 0.3672, + "step": 1410 + }, + { + "epoch": 0.9056482670089859, + "grad_norm": 0.01323013473302126, + "learning_rate": 2.3171926611498808e-06, + "loss": 0.3691, + "step": 1411 + }, + { + "epoch": 0.9062901155327343, + "grad_norm": 0.011728956364095211, + "learning_rate": 2.2860151126197825e-06, + "loss": 0.3516, + "step": 1412 + }, + { + "epoch": 0.9069319640564827, + "grad_norm": 0.014273487031459808, + "learning_rate": 2.255043824723968e-06, + "loss": 0.3984, + "step": 1413 + }, + { + "epoch": 0.9075738125802311, + "grad_norm": 0.011534382589161396, + "learning_rate": 2.224278931346824e-06, + "loss": 0.3555, + "step": 1414 + }, + { + "epoch": 0.9082156611039794, + "grad_norm": 0.01171374786645174, + "learning_rate": 2.1937205654805004e-06, + "loss": 0.377, + "step": 1415 + }, + { + "epoch": 0.9088575096277278, + "grad_norm": 0.012463386170566082, + "learning_rate": 2.1633688592244016e-06, + "loss": 0.3594, + "step": 1416 + }, + { + "epoch": 0.9094993581514762, + "grad_norm": 0.012697212398052216, + "learning_rate": 2.1332239437845345e-06, + "loss": 0.3809, + "step": 1417 + }, + { + "epoch": 0.9101412066752247, + "grad_norm": 0.012483914382755756, + "learning_rate": 2.1032859494730052e-06, + "loss": 0.3867, + "step": 1418 + }, + { + "epoch": 0.9107830551989731, + "grad_norm": 0.0118754543364048, + "learning_rate": 2.0735550057074225e-06, + "loss": 0.3672, + "step": 1419 + }, + { + "epoch": 0.9114249037227214, + "grad_norm": 0.013126706704497337, + "learning_rate": 2.04403124101033e-06, + "loss": 0.3711, + "step": 1420 + }, + { + "epoch": 0.9120667522464698, + "grad_norm": 0.013734589330852032, + "learning_rate": 2.0147147830086866e-06, + "loss": 0.3496, + "step": 1421 + }, + { + "epoch": 0.9127086007702182, + "grad_norm": 0.013323265127837658, + "learning_rate": 1.98560575843329e-06, + "loss": 0.3789, + "step": 1422 + }, + { + "epoch": 0.9133504492939666, + "grad_norm": 0.01396789401769638, + "learning_rate": 1.956704293118222e-06, + "loss": 0.3711, + "step": 1423 + }, + { + "epoch": 0.9139922978177151, + "grad_norm": 0.013047968968749046, + "learning_rate": 1.9280105120003333e-06, + "loss": 0.3691, + "step": 1424 + }, + { + "epoch": 0.9146341463414634, + "grad_norm": 0.01341619435697794, + "learning_rate": 1.8995245391186688e-06, + "loss": 0.3691, + "step": 1425 + }, + { + "epoch": 0.9152759948652118, + "grad_norm": 0.014094233512878418, + "learning_rate": 1.87124649761396e-06, + "loss": 0.3672, + "step": 1426 + }, + { + "epoch": 0.9159178433889602, + "grad_norm": 0.012613237835466862, + "learning_rate": 1.8431765097280784e-06, + "loss": 0.3633, + "step": 1427 + }, + { + "epoch": 0.9165596919127086, + "grad_norm": 0.013313335366547108, + "learning_rate": 1.8153146968035107e-06, + "loss": 0.3828, + "step": 1428 + }, + { + "epoch": 0.917201540436457, + "grad_norm": 0.0134997284039855, + "learning_rate": 1.7876611792828346e-06, + "loss": 0.4023, + "step": 1429 + }, + { + "epoch": 0.9178433889602053, + "grad_norm": 0.013145639561116695, + "learning_rate": 1.7602160767081822e-06, + "loss": 0.3535, + "step": 1430 + }, + { + "epoch": 0.9184852374839538, + "grad_norm": 0.014171122573316097, + "learning_rate": 1.7329795077207556e-06, + "loss": 0.375, + "step": 1431 + }, + { + "epoch": 0.9191270860077022, + "grad_norm": 0.015409087762236595, + "learning_rate": 1.7059515900603007e-06, + "loss": 0.373, + "step": 1432 + }, + { + "epoch": 0.9197689345314506, + "grad_norm": 0.013116035610437393, + "learning_rate": 1.6791324405645626e-06, + "loss": 0.3672, + "step": 1433 + }, + { + "epoch": 0.920410783055199, + "grad_norm": 0.012658379971981049, + "learning_rate": 1.6525221751688636e-06, + "loss": 0.3594, + "step": 1434 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.015524683520197868, + "learning_rate": 1.6261209089054986e-06, + "loss": 0.3828, + "step": 1435 + }, + { + "epoch": 0.9216944801026957, + "grad_norm": 0.013421234674751759, + "learning_rate": 1.5999287559033127e-06, + "loss": 0.3711, + "step": 1436 + }, + { + "epoch": 0.9223363286264442, + "grad_norm": 0.012796991504728794, + "learning_rate": 1.5739458293871968e-06, + "loss": 0.3633, + "step": 1437 + }, + { + "epoch": 0.9229781771501926, + "grad_norm": 0.01368678081780672, + "learning_rate": 1.5481722416775479e-06, + "loss": 0.3789, + "step": 1438 + }, + { + "epoch": 0.923620025673941, + "grad_norm": 0.014133045449852943, + "learning_rate": 1.5226081041898766e-06, + "loss": 0.3867, + "step": 1439 + }, + { + "epoch": 0.9242618741976893, + "grad_norm": 0.012482895515859127, + "learning_rate": 1.4972535274342225e-06, + "loss": 0.3809, + "step": 1440 + }, + { + "epoch": 0.9249037227214377, + "grad_norm": 0.011843664571642876, + "learning_rate": 1.472108621014745e-06, + "loss": 0.3535, + "step": 1441 + }, + { + "epoch": 0.9255455712451861, + "grad_norm": 0.013735570013523102, + "learning_rate": 1.4471734936292281e-06, + "loss": 0.3828, + "step": 1442 + }, + { + "epoch": 0.9261874197689345, + "grad_norm": 0.01336585357785225, + "learning_rate": 1.422448253068609e-06, + "loss": 0.3496, + "step": 1443 + }, + { + "epoch": 0.926829268292683, + "grad_norm": 0.01314416341483593, + "learning_rate": 1.3979330062165119e-06, + "loss": 0.3906, + "step": 1444 + }, + { + "epoch": 0.9274711168164314, + "grad_norm": 0.013479986228048801, + "learning_rate": 1.3736278590487927e-06, + "loss": 0.375, + "step": 1445 + }, + { + "epoch": 0.9281129653401797, + "grad_norm": 0.012583770789206028, + "learning_rate": 1.349532916633084e-06, + "loss": 0.375, + "step": 1446 + }, + { + "epoch": 0.9287548138639281, + "grad_norm": 0.012261179275810719, + "learning_rate": 1.3256482831283278e-06, + "loss": 0.3711, + "step": 1447 + }, + { + "epoch": 0.9293966623876765, + "grad_norm": 0.012957144528627396, + "learning_rate": 1.3019740617843167e-06, + "loss": 0.4043, + "step": 1448 + }, + { + "epoch": 0.9300385109114249, + "grad_norm": 0.012559046968817711, + "learning_rate": 1.2785103549412814e-06, + "loss": 0.3789, + "step": 1449 + }, + { + "epoch": 0.9306803594351734, + "grad_norm": 0.012377286329865456, + "learning_rate": 1.2552572640294247e-06, + "loss": 0.3711, + "step": 1450 + }, + { + "epoch": 0.9313222079589217, + "grad_norm": 0.012806874699890614, + "learning_rate": 1.2322148895684838e-06, + "loss": 0.3574, + "step": 1451 + }, + { + "epoch": 0.9319640564826701, + "grad_norm": 0.015327047556638718, + "learning_rate": 1.209383331167302e-06, + "loss": 0.3691, + "step": 1452 + }, + { + "epoch": 0.9326059050064185, + "grad_norm": 0.011319216340780258, + "learning_rate": 1.1867626875233851e-06, + "loss": 0.3613, + "step": 1453 + }, + { + "epoch": 0.9332477535301669, + "grad_norm": 0.012484993785619736, + "learning_rate": 1.1643530564225013e-06, + "loss": 0.3711, + "step": 1454 + }, + { + "epoch": 0.9338896020539152, + "grad_norm": 0.011937733739614487, + "learning_rate": 1.1421545347382378e-06, + "loss": 0.3496, + "step": 1455 + }, + { + "epoch": 0.9345314505776636, + "grad_norm": 0.012133189477026463, + "learning_rate": 1.1201672184315726e-06, + "loss": 0.3496, + "step": 1456 + }, + { + "epoch": 0.9351732991014121, + "grad_norm": 0.013368112966418266, + "learning_rate": 1.0983912025505028e-06, + "loss": 0.3652, + "step": 1457 + }, + { + "epoch": 0.9358151476251605, + "grad_norm": 0.011588167399168015, + "learning_rate": 1.076826581229562e-06, + "loss": 0.3496, + "step": 1458 + }, + { + "epoch": 0.9364569961489089, + "grad_norm": 0.013561406172811985, + "learning_rate": 1.0554734476894923e-06, + "loss": 0.3555, + "step": 1459 + }, + { + "epoch": 0.9370988446726572, + "grad_norm": 0.012453819625079632, + "learning_rate": 1.0343318942367951e-06, + "loss": 0.3906, + "step": 1460 + }, + { + "epoch": 0.9377406931964056, + "grad_norm": 0.01237612683326006, + "learning_rate": 1.0134020122633313e-06, + "loss": 0.373, + "step": 1461 + }, + { + "epoch": 0.938382541720154, + "grad_norm": 0.013195016421377659, + "learning_rate": 9.926838922459492e-07, + "loss": 0.3867, + "step": 1462 + }, + { + "epoch": 0.9390243902439024, + "grad_norm": 0.014692552387714386, + "learning_rate": 9.721776237460734e-07, + "loss": 0.3672, + "step": 1463 + }, + { + "epoch": 0.9396662387676509, + "grad_norm": 0.012344533577561378, + "learning_rate": 9.518832954093282e-07, + "loss": 0.3789, + "step": 1464 + }, + { + "epoch": 0.9403080872913993, + "grad_norm": 0.012914840131998062, + "learning_rate": 9.31800994965154e-07, + "loss": 0.3711, + "step": 1465 + }, + { + "epoch": 0.9409499358151476, + "grad_norm": 0.012996658682823181, + "learning_rate": 9.119308092264078e-07, + "loss": 0.3633, + "step": 1466 + }, + { + "epoch": 0.941591784338896, + "grad_norm": 0.0120243513956666, + "learning_rate": 8.92272824089041e-07, + "loss": 0.3672, + "step": 1467 + }, + { + "epoch": 0.9422336328626444, + "grad_norm": 0.012632016092538834, + "learning_rate": 8.728271245316555e-07, + "loss": 0.3672, + "step": 1468 + }, + { + "epoch": 0.9428754813863928, + "grad_norm": 0.012309476733207703, + "learning_rate": 8.535937946151983e-07, + "loss": 0.3848, + "step": 1469 + }, + { + "epoch": 0.9435173299101413, + "grad_norm": 0.0127309774979949, + "learning_rate": 8.345729174825623e-07, + "loss": 0.375, + "step": 1470 + }, + { + "epoch": 0.9441591784338896, + "grad_norm": 0.013301864266395569, + "learning_rate": 8.157645753582299e-07, + "loss": 0.375, + "step": 1471 + }, + { + "epoch": 0.944801026957638, + "grad_norm": 0.012385779060423374, + "learning_rate": 7.971688495479468e-07, + "loss": 0.373, + "step": 1472 + }, + { + "epoch": 0.9454428754813864, + "grad_norm": 0.012698372825980186, + "learning_rate": 7.78785820438338e-07, + "loss": 0.3672, + "step": 1473 + }, + { + "epoch": 0.9460847240051348, + "grad_norm": 0.012753445655107498, + "learning_rate": 7.606155674965698e-07, + "loss": 0.375, + "step": 1474 + }, + { + "epoch": 0.9467265725288831, + "grad_norm": 0.01262593548744917, + "learning_rate": 7.426581692700052e-07, + "loss": 0.3711, + "step": 1475 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.012594195082783699, + "learning_rate": 7.24913703385871e-07, + "loss": 0.3789, + "step": 1476 + }, + { + "epoch": 0.94801026957638, + "grad_norm": 0.011902299709618092, + "learning_rate": 7.073822465509194e-07, + "loss": 0.377, + "step": 1477 + }, + { + "epoch": 0.9486521181001284, + "grad_norm": 0.013070139102637768, + "learning_rate": 6.900638745511057e-07, + "loss": 0.3789, + "step": 1478 + }, + { + "epoch": 0.9492939666238768, + "grad_norm": 0.013074691407382488, + "learning_rate": 6.729586622512274e-07, + "loss": 0.3926, + "step": 1479 + }, + { + "epoch": 0.9499358151476252, + "grad_norm": 0.012703335843980312, + "learning_rate": 6.560666835946416e-07, + "loss": 0.377, + "step": 1480 + }, + { + "epoch": 0.9505776636713735, + "grad_norm": 0.012635687366127968, + "learning_rate": 6.39388011602915e-07, + "loss": 0.4004, + "step": 1481 + }, + { + "epoch": 0.9512195121951219, + "grad_norm": 0.013130598701536655, + "learning_rate": 6.229227183755293e-07, + "loss": 0.3633, + "step": 1482 + }, + { + "epoch": 0.9518613607188704, + "grad_norm": 0.011977552436292171, + "learning_rate": 6.0667087508956e-07, + "loss": 0.3652, + "step": 1483 + }, + { + "epoch": 0.9525032092426188, + "grad_norm": 0.013489103876054287, + "learning_rate": 5.906325519993705e-07, + "loss": 0.3691, + "step": 1484 + }, + { + "epoch": 0.9531450577663672, + "grad_norm": 0.013404084369540215, + "learning_rate": 5.748078184363015e-07, + "loss": 0.3711, + "step": 1485 + }, + { + "epoch": 0.9537869062901155, + "grad_norm": 0.012206465937197208, + "learning_rate": 5.591967428083822e-07, + "loss": 0.3672, + "step": 1486 + }, + { + "epoch": 0.9544287548138639, + "grad_norm": 0.01376590970903635, + "learning_rate": 5.437993926000195e-07, + "loss": 0.3965, + "step": 1487 + }, + { + "epoch": 0.9550706033376123, + "grad_norm": 0.011835059151053429, + "learning_rate": 5.286158343717429e-07, + "loss": 0.3438, + "step": 1488 + }, + { + "epoch": 0.9557124518613607, + "grad_norm": 0.012890417128801346, + "learning_rate": 5.136461337598486e-07, + "loss": 0.3848, + "step": 1489 + }, + { + "epoch": 0.9563543003851092, + "grad_norm": 0.013126579113304615, + "learning_rate": 4.988903554761948e-07, + "loss": 0.4023, + "step": 1490 + }, + { + "epoch": 0.9569961489088575, + "grad_norm": 0.013360566459596157, + "learning_rate": 4.843485633078681e-07, + "loss": 0.3867, + "step": 1491 + }, + { + "epoch": 0.9576379974326059, + "grad_norm": 0.012557647190988064, + "learning_rate": 4.700208201169232e-07, + "loss": 0.3477, + "step": 1492 + }, + { + "epoch": 0.9582798459563543, + "grad_norm": 0.012240637093782425, + "learning_rate": 4.559071878401211e-07, + "loss": 0.3359, + "step": 1493 + }, + { + "epoch": 0.9589216944801027, + "grad_norm": 0.011939102783799171, + "learning_rate": 4.420077274886414e-07, + "loss": 0.3906, + "step": 1494 + }, + { + "epoch": 0.959563543003851, + "grad_norm": 0.013211379759013653, + "learning_rate": 4.283224991478374e-07, + "loss": 0.3809, + "step": 1495 + }, + { + "epoch": 0.9602053915275995, + "grad_norm": 0.012315848842263222, + "learning_rate": 4.148515619769644e-07, + "loss": 0.3809, + "step": 1496 + }, + { + "epoch": 0.9608472400513479, + "grad_norm": 0.013882388360798359, + "learning_rate": 4.0159497420892976e-07, + "loss": 0.3984, + "step": 1497 + }, + { + "epoch": 0.9614890885750963, + "grad_norm": 0.014947362244129181, + "learning_rate": 3.885527931500377e-07, + "loss": 0.3809, + "step": 1498 + }, + { + "epoch": 0.9621309370988447, + "grad_norm": 0.011844776570796967, + "learning_rate": 3.7572507517974475e-07, + "loss": 0.3945, + "step": 1499 + }, + { + "epoch": 0.962772785622593, + "grad_norm": 0.011926044709980488, + "learning_rate": 3.631118757504159e-07, + "loss": 0.3633, + "step": 1500 + }, + { + "epoch": 0.9634146341463414, + "grad_norm": 0.012529637664556503, + "learning_rate": 3.507132493870857e-07, + "loss": 0.3672, + "step": 1501 + }, + { + "epoch": 0.9640564826700898, + "grad_norm": 0.012636232189834118, + "learning_rate": 3.3852924968720834e-07, + "loss": 0.3711, + "step": 1502 + }, + { + "epoch": 0.9646983311938383, + "grad_norm": 0.01360595878213644, + "learning_rate": 3.2655992932045796e-07, + "loss": 0.3906, + "step": 1503 + }, + { + "epoch": 0.9653401797175867, + "grad_norm": 0.014324655756354332, + "learning_rate": 3.148053400284623e-07, + "loss": 0.3516, + "step": 1504 + }, + { + "epoch": 0.9659820282413351, + "grad_norm": 0.013343398459255695, + "learning_rate": 3.0326553262460255e-07, + "loss": 0.3711, + "step": 1505 + }, + { + "epoch": 0.9666238767650834, + "grad_norm": 0.01268340926617384, + "learning_rate": 2.9194055699380275e-07, + "loss": 0.3711, + "step": 1506 + }, + { + "epoch": 0.9672657252888318, + "grad_norm": 0.012414415366947651, + "learning_rate": 2.8083046209228523e-07, + "loss": 0.3867, + "step": 1507 + }, + { + "epoch": 0.9679075738125802, + "grad_norm": 0.012590642087161541, + "learning_rate": 2.699352959473711e-07, + "loss": 0.3633, + "step": 1508 + }, + { + "epoch": 0.9685494223363287, + "grad_norm": 0.0130212577059865, + "learning_rate": 2.5925510565729115e-07, + "loss": 0.377, + "step": 1509 + }, + { + "epoch": 0.9691912708600771, + "grad_norm": 0.013702025637030602, + "learning_rate": 2.4878993739095857e-07, + "loss": 0.4102, + "step": 1510 + }, + { + "epoch": 0.9698331193838254, + "grad_norm": 0.013444257900118828, + "learning_rate": 2.385398363877689e-07, + "loss": 0.3672, + "step": 1511 + }, + { + "epoch": 0.9704749679075738, + "grad_norm": 0.011984091252088547, + "learning_rate": 2.2850484695743357e-07, + "loss": 0.373, + "step": 1512 + }, + { + "epoch": 0.9711168164313222, + "grad_norm": 0.01249521505087614, + "learning_rate": 2.1868501247974683e-07, + "loss": 0.3418, + "step": 1513 + }, + { + "epoch": 0.9717586649550706, + "grad_norm": 0.012439033947885036, + "learning_rate": 2.090803754044357e-07, + "loss": 0.3984, + "step": 1514 + }, + { + "epoch": 0.972400513478819, + "grad_norm": 0.012088153511285782, + "learning_rate": 1.9969097725094366e-07, + "loss": 0.3555, + "step": 1515 + }, + { + "epoch": 0.9730423620025674, + "grad_norm": 0.013228670693933964, + "learning_rate": 1.9051685860828062e-07, + "loss": 0.3828, + "step": 1516 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.013256936334073544, + "learning_rate": 1.8155805913483427e-07, + "loss": 0.3867, + "step": 1517 + }, + { + "epoch": 0.9743260590500642, + "grad_norm": 0.0124072739854455, + "learning_rate": 1.728146175581924e-07, + "loss": 0.3906, + "step": 1518 + }, + { + "epoch": 0.9749679075738126, + "grad_norm": 0.012670646421611309, + "learning_rate": 1.642865716749875e-07, + "loss": 0.3926, + "step": 1519 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.011904159560799599, + "learning_rate": 1.5597395835071915e-07, + "loss": 0.3633, + "step": 1520 + }, + { + "epoch": 0.9762516046213093, + "grad_norm": 0.014184362255036831, + "learning_rate": 1.4787681351960959e-07, + "loss": 0.375, + "step": 1521 + }, + { + "epoch": 0.9768934531450578, + "grad_norm": 0.012831350788474083, + "learning_rate": 1.399951721844428e-07, + "loss": 0.3984, + "step": 1522 + }, + { + "epoch": 0.9775353016688062, + "grad_norm": 0.012582185678184032, + "learning_rate": 1.3232906841641469e-07, + "loss": 0.375, + "step": 1523 + }, + { + "epoch": 0.9781771501925546, + "grad_norm": 0.013945076614618301, + "learning_rate": 1.2487853535497197e-07, + "loss": 0.3574, + "step": 1524 + }, + { + "epoch": 0.978818998716303, + "grad_norm": 0.013039125129580498, + "learning_rate": 1.1764360520769568e-07, + "loss": 0.3672, + "step": 1525 + }, + { + "epoch": 0.9794608472400513, + "grad_norm": 0.013115467503666878, + "learning_rate": 1.1062430925012911e-07, + "loss": 0.3633, + "step": 1526 + }, + { + "epoch": 0.9801026957637997, + "grad_norm": 0.012725510634481907, + "learning_rate": 1.0382067782568338e-07, + "loss": 0.3867, + "step": 1527 + }, + { + "epoch": 0.9807445442875481, + "grad_norm": 0.013744517229497433, + "learning_rate": 9.723274034545981e-08, + "loss": 0.3672, + "step": 1528 + }, + { + "epoch": 0.9813863928112966, + "grad_norm": 0.01282016932964325, + "learning_rate": 9.086052528816113e-08, + "loss": 0.3672, + "step": 1529 + }, + { + "epoch": 0.982028241335045, + "grad_norm": 0.012451869435608387, + "learning_rate": 8.470406019994714e-08, + "loss": 0.3789, + "step": 1530 + }, + { + "epoch": 0.9826700898587933, + "grad_norm": 0.012845706194639206, + "learning_rate": 7.876337169432368e-08, + "loss": 0.3906, + "step": 1531 + }, + { + "epoch": 0.9833119383825417, + "grad_norm": 0.01277588028460741, + "learning_rate": 7.303848545202052e-08, + "loss": 0.3984, + "step": 1532 + }, + { + "epoch": 0.9839537869062901, + "grad_norm": 0.012288976460695267, + "learning_rate": 6.752942622089697e-08, + "loss": 0.3828, + "step": 1533 + }, + { + "epoch": 0.9845956354300385, + "grad_norm": 0.012748691253364086, + "learning_rate": 6.223621781581979e-08, + "loss": 0.3984, + "step": 1534 + }, + { + "epoch": 0.9852374839537869, + "grad_norm": 0.012774264439940453, + "learning_rate": 5.715888311855211e-08, + "loss": 0.3711, + "step": 1535 + }, + { + "epoch": 0.9858793324775353, + "grad_norm": 0.012290029786527157, + "learning_rate": 5.2297444077675784e-08, + "loss": 0.3867, + "step": 1536 + }, + { + "epoch": 0.9865211810012837, + "grad_norm": 0.012472493574023247, + "learning_rate": 4.765192170849697e-08, + "loss": 0.3516, + "step": 1537 + }, + { + "epoch": 0.9871630295250321, + "grad_norm": 0.012907999567687511, + "learning_rate": 4.322233609292403e-08, + "loss": 0.3926, + "step": 1538 + }, + { + "epoch": 0.9878048780487805, + "grad_norm": 0.013008086942136288, + "learning_rate": 3.9008706379412005e-08, + "loss": 0.3809, + "step": 1539 + }, + { + "epoch": 0.9884467265725289, + "grad_norm": 0.012015080079436302, + "learning_rate": 3.5011050782879364e-08, + "loss": 0.3691, + "step": 1540 + }, + { + "epoch": 0.9890885750962772, + "grad_norm": 0.012214963324368, + "learning_rate": 3.122938658460806e-08, + "loss": 0.3789, + "step": 1541 + }, + { + "epoch": 0.9897304236200257, + "grad_norm": 0.011233390308916569, + "learning_rate": 2.7663730132182485e-08, + "loss": 0.3516, + "step": 1542 + }, + { + "epoch": 0.9903722721437741, + "grad_norm": 0.011511021293699741, + "learning_rate": 2.4314096839417323e-08, + "loss": 0.3516, + "step": 1543 + }, + { + "epoch": 0.9910141206675225, + "grad_norm": 0.013254697434604168, + "learning_rate": 2.118050118629089e-08, + "loss": 0.3672, + "step": 1544 + }, + { + "epoch": 0.9916559691912709, + "grad_norm": 0.013225373812019825, + "learning_rate": 1.8262956718884117e-08, + "loss": 0.4023, + "step": 1545 + }, + { + "epoch": 0.9922978177150192, + "grad_norm": 0.012755938805639744, + "learning_rate": 1.5561476049325007e-08, + "loss": 0.3809, + "step": 1546 + }, + { + "epoch": 0.9929396662387676, + "grad_norm": 0.012606845237314701, + "learning_rate": 1.3076070855710942e-08, + "loss": 0.3555, + "step": 1547 + }, + { + "epoch": 0.993581514762516, + "grad_norm": 0.01340476330369711, + "learning_rate": 1.0806751882092014e-08, + "loss": 0.4023, + "step": 1548 + }, + { + "epoch": 0.9942233632862645, + "grad_norm": 0.013462042436003685, + "learning_rate": 8.753528938409972e-09, + "loss": 0.3516, + "step": 1549 + }, + { + "epoch": 0.9948652118100129, + "grad_norm": 0.01249738223850727, + "learning_rate": 6.9164109004427046e-09, + "loss": 0.3887, + "step": 1550 + }, + { + "epoch": 0.9955070603337612, + "grad_norm": 0.014013971202075481, + "learning_rate": 5.295405709787593e-09, + "loss": 0.4062, + "step": 1551 + }, + { + "epoch": 0.9961489088575096, + "grad_norm": 0.012625960633158684, + "learning_rate": 3.890520373817097e-09, + "loss": 0.3574, + "step": 1552 + }, + { + "epoch": 0.996790757381258, + "grad_norm": 0.012531465850770473, + "learning_rate": 2.7017609656454503e-09, + "loss": 0.3594, + "step": 1553 + }, + { + "epoch": 0.9974326059050064, + "grad_norm": 0.012438458390533924, + "learning_rate": 1.7291326241064553e-09, + "loss": 0.3711, + "step": 1554 + }, + { + "epoch": 0.9980744544287549, + "grad_norm": 0.013172676786780357, + "learning_rate": 9.726395537312806e-10, + "loss": 0.3613, + "step": 1555 + }, + { + "epoch": 0.9987163029525032, + "grad_norm": 0.013000345788896084, + "learning_rate": 4.3228502473180445e-10, + "loss": 0.3906, + "step": 1556 + }, + { + "epoch": 0.9993581514762516, + "grad_norm": 0.012555932626128197, + "learning_rate": 1.0807137297841329e-10, + "loss": 0.3828, + "step": 1557 + }, + { + "epoch": 1.0, + "grad_norm": 0.013687052763998508, + "learning_rate": 0.0, + "loss": 0.373, + "step": 1558 + }, + { + "epoch": 1.0, + "step": 1558, + "total_flos": 0.0, + "train_loss": 0.16030793685815148, + "train_runtime": 10881.6807, + "train_samples_per_second": 586.271, + "train_steps_per_second": 0.143 + } + ], + "logging_steps": 1.0, + "max_steps": 1558, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}