{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.808664259927799, "eval_steps": 25, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019253910950661854, "grad_norm": 65.07794952392578, "learning_rate": 2e-05, "loss": 10.0254, "step": 1 }, { "epoch": 0.03850782190132371, "grad_norm": 35.06503677368164, "learning_rate": 2e-05, "loss": 9.6326, "step": 2 }, { "epoch": 0.05776173285198556, "grad_norm": null, "learning_rate": 2e-05, "loss": 9.7444, "step": 3 }, { "epoch": 0.07701564380264742, "grad_norm": 91.54127502441406, "learning_rate": 2e-05, "loss": 9.9694, "step": 4 }, { "epoch": 0.09626955475330927, "grad_norm": 48.64268493652344, "learning_rate": 2e-05, "loss": 9.777, "step": 5 }, { "epoch": 0.11552346570397112, "grad_norm": 45.61400604248047, "learning_rate": 2e-05, "loss": 9.6125, "step": 6 }, { "epoch": 0.13477737665463296, "grad_norm": 53.380062103271484, "learning_rate": 2e-05, "loss": 9.6476, "step": 7 }, { "epoch": 0.15403128760529483, "grad_norm": 48.681732177734375, "learning_rate": 2e-05, "loss": 9.5976, "step": 8 }, { "epoch": 0.17328519855595667, "grad_norm": 75.82644653320312, "learning_rate": 2e-05, "loss": 9.635, "step": 9 }, { "epoch": 0.19253910950661854, "grad_norm": 52.92415237426758, "learning_rate": 2e-05, "loss": 9.4741, "step": 10 }, { "epoch": 0.21179302045728038, "grad_norm": 45.76670455932617, "learning_rate": 2e-05, "loss": 9.4043, "step": 11 }, { "epoch": 0.23104693140794225, "grad_norm": 83.89037322998047, "learning_rate": 2e-05, "loss": 9.4551, "step": 12 }, { "epoch": 0.2503008423586041, "grad_norm": 63.491943359375, "learning_rate": 2e-05, "loss": 9.3453, "step": 13 }, { "epoch": 0.2695547533092659, "grad_norm": 51.0808219909668, "learning_rate": 2e-05, "loss": 9.2297, "step": 14 }, { "epoch": 0.2888086642599278, "grad_norm": 44.37099838256836, "learning_rate": 2e-05, "loss": 9.1875, "step": 15 }, { "epoch": 
0.30806257521058966, "grad_norm": 67.27417755126953, "learning_rate": 2e-05, "loss": 9.1203, "step": 16 }, { "epoch": 0.32731648616125153, "grad_norm": 77.5335922241211, "learning_rate": 2e-05, "loss": 9.0269, "step": 17 }, { "epoch": 0.34657039711191334, "grad_norm": 83.21548461914062, "learning_rate": 2e-05, "loss": 9.0223, "step": 18 }, { "epoch": 0.3658243080625752, "grad_norm": 83.81153106689453, "learning_rate": 2e-05, "loss": 8.8856, "step": 19 }, { "epoch": 0.3850782190132371, "grad_norm": 70.34122467041016, "learning_rate": 2e-05, "loss": 8.7546, "step": 20 }, { "epoch": 0.4043321299638989, "grad_norm": 71.87590789794922, "learning_rate": 2e-05, "loss": 8.692, "step": 21 }, { "epoch": 0.42358604091456076, "grad_norm": 61.52598571777344, "learning_rate": 2e-05, "loss": 8.7245, "step": 22 }, { "epoch": 0.4428399518652226, "grad_norm": 81.4962158203125, "learning_rate": 2e-05, "loss": 8.4164, "step": 23 }, { "epoch": 0.4620938628158845, "grad_norm": 66.4041976928711, "learning_rate": 2e-05, "loss": 8.4377, "step": 24 }, { "epoch": 0.4813477737665463, "grad_norm": 74.9264144897461, "learning_rate": 2e-05, "loss": 8.305, "step": 25 }, { "epoch": 0.4813477737665463, "eval_clap": 0.2508222758769989, "eval_loss": 3.7162673473358154, "eval_runtime": 195.557, "eval_samples_per_second": 0.164, "eval_steps_per_second": 0.164, "step": 25 }, { "epoch": 0.5006016847172082, "grad_norm": 57.42286682128906, "learning_rate": 2e-05, "loss": 8.3213, "step": 26 }, { "epoch": 0.51985559566787, "grad_norm": 64.55001831054688, "learning_rate": 2e-05, "loss": 8.2368, "step": 27 }, { "epoch": 0.5391095066185319, "grad_norm": 51.85919952392578, "learning_rate": 2e-05, "loss": 8.2753, "step": 28 }, { "epoch": 0.5583634175691937, "grad_norm": 53.471458435058594, "learning_rate": 2e-05, "loss": 8.0826, "step": 29 }, { "epoch": 0.5776173285198556, "grad_norm": 49.86967849731445, "learning_rate": 2e-05, "loss": 7.9446, "step": 30 }, { "epoch": 0.5968712394705175, "grad_norm": 
62.497581481933594, "learning_rate": 2e-05, "loss": 7.7911, "step": 31 }, { "epoch": 0.6161251504211793, "grad_norm": 46.13528823852539, "learning_rate": 2e-05, "loss": 7.7964, "step": 32 }, { "epoch": 0.6353790613718412, "grad_norm": 56.71400451660156, "learning_rate": 2e-05, "loss": 7.5232, "step": 33 }, { "epoch": 0.6546329723225031, "grad_norm": 43.25479507446289, "learning_rate": 2e-05, "loss": 7.7154, "step": 34 }, { "epoch": 0.6738868832731648, "grad_norm": 44.234336853027344, "learning_rate": 2e-05, "loss": 7.3668, "step": 35 }, { "epoch": 0.6931407942238267, "grad_norm": 43.533878326416016, "learning_rate": 2e-05, "loss": 7.8787, "step": 36 }, { "epoch": 0.7123947051744886, "grad_norm": 37.30876922607422, "learning_rate": 2e-05, "loss": 7.661, "step": 37 }, { "epoch": 0.7316486161251504, "grad_norm": 37.29338073730469, "learning_rate": 2e-05, "loss": 7.5059, "step": 38 }, { "epoch": 0.7509025270758123, "grad_norm": 39.37627410888672, "learning_rate": 2e-05, "loss": 7.6255, "step": 39 }, { "epoch": 0.7701564380264742, "grad_norm": 33.975730895996094, "learning_rate": 2e-05, "loss": 7.271, "step": 40 }, { "epoch": 0.789410348977136, "grad_norm": 43.57024002075195, "learning_rate": 2e-05, "loss": 7.6087, "step": 41 }, { "epoch": 0.8086642599277978, "grad_norm": 41.361568450927734, "learning_rate": 2e-05, "loss": 7.2042, "step": 42 }, { "epoch": 0.8279181708784596, "grad_norm": 41.087318420410156, "learning_rate": 2e-05, "loss": 7.188, "step": 43 }, { "epoch": 0.8471720818291215, "grad_norm": 39.903804779052734, "learning_rate": 2e-05, "loss": 7.0335, "step": 44 }, { "epoch": 0.8664259927797834, "grad_norm": 43.420780181884766, "learning_rate": 2e-05, "loss": 7.1334, "step": 45 }, { "epoch": 0.8856799037304453, "grad_norm": 38.208740234375, "learning_rate": 2e-05, "loss": 6.9014, "step": 46 }, { "epoch": 0.9049338146811071, "grad_norm": 34.4349250793457, "learning_rate": 2e-05, "loss": 6.9772, "step": 47 }, { "epoch": 0.924187725631769, "grad_norm": 
42.55630874633789, "learning_rate": 2e-05, "loss": 7.3901, "step": 48 }, { "epoch": 0.9434416365824309, "grad_norm": 37.8997917175293, "learning_rate": 2e-05, "loss": 6.9604, "step": 49 }, { "epoch": 0.9626955475330926, "grad_norm": 27.6505184173584, "learning_rate": 2e-05, "loss": 6.9842, "step": 50 }, { "epoch": 0.9626955475330926, "eval_clap": 0.05512786656618118, "eval_loss": 4.608064651489258, "eval_runtime": 203.976, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 50 }, { "epoch": 0.9819494584837545, "grad_norm": 29.874601364135742, "learning_rate": 2e-05, "loss": 6.5285, "step": 51 }, { "epoch": 1.0, "grad_norm": 32.52230453491211, "learning_rate": 2e-05, "loss": 6.0624, "step": 52 }, { "epoch": 1.0192539109506618, "grad_norm": 25.571067810058594, "learning_rate": 2e-05, "loss": 6.965, "step": 53 }, { "epoch": 1.0385078219013237, "grad_norm": 24.804780960083008, "learning_rate": 2e-05, "loss": 6.6082, "step": 54 }, { "epoch": 1.0577617328519855, "grad_norm": 26.17034339904785, "learning_rate": 2e-05, "loss": 6.8646, "step": 55 }, { "epoch": 1.0770156438026475, "grad_norm": 26.33489990234375, "learning_rate": 2e-05, "loss": 7.0679, "step": 56 }, { "epoch": 1.0962695547533092, "grad_norm": 22.185443878173828, "learning_rate": 2e-05, "loss": 6.9692, "step": 57 }, { "epoch": 1.1155234657039712, "grad_norm": 24.1581974029541, "learning_rate": 2e-05, "loss": 6.7359, "step": 58 }, { "epoch": 1.134777376654633, "grad_norm": 19.098318099975586, "learning_rate": 2e-05, "loss": 6.8621, "step": 59 }, { "epoch": 1.154031287605295, "grad_norm": 18.446277618408203, "learning_rate": 2e-05, "loss": 6.9175, "step": 60 }, { "epoch": 1.1732851985559567, "grad_norm": 18.82305908203125, "learning_rate": 2e-05, "loss": 6.8505, "step": 61 }, { "epoch": 1.1925391095066185, "grad_norm": 23.049110412597656, "learning_rate": 2e-05, "loss": 6.4014, "step": 62 }, { "epoch": 1.2117930204572804, "grad_norm": 30.802398681640625, "learning_rate": 2e-05, "loss": 
6.1833, "step": 63 }, { "epoch": 1.2310469314079422, "grad_norm": 31.79558563232422, "learning_rate": 2e-05, "loss": 6.3301, "step": 64 }, { "epoch": 1.2503008423586042, "grad_norm": 17.523427963256836, "learning_rate": 2e-05, "loss": 6.5382, "step": 65 }, { "epoch": 1.269554753309266, "grad_norm": 20.55564308166504, "learning_rate": 2e-05, "loss": 6.8307, "step": 66 }, { "epoch": 1.288808664259928, "grad_norm": 14.00365924835205, "learning_rate": 2e-05, "loss": 6.688, "step": 67 }, { "epoch": 1.3080625752105897, "grad_norm": 20.74002456665039, "learning_rate": 2e-05, "loss": 6.3775, "step": 68 }, { "epoch": 1.3273164861612514, "grad_norm": 13.387882232666016, "learning_rate": 2e-05, "loss": 6.4859, "step": 69 }, { "epoch": 1.3465703971119134, "grad_norm": 13.83588981628418, "learning_rate": 2e-05, "loss": 6.6777, "step": 70 }, { "epoch": 1.3658243080625752, "grad_norm": 18.38031005859375, "learning_rate": 2e-05, "loss": 6.712, "step": 71 }, { "epoch": 1.3850782190132371, "grad_norm": 15.858037948608398, "learning_rate": 2e-05, "loss": 6.7095, "step": 72 }, { "epoch": 1.404332129963899, "grad_norm": 14.21243667602539, "learning_rate": 2e-05, "loss": 6.6714, "step": 73 }, { "epoch": 1.4235860409145609, "grad_norm": 13.775053024291992, "learning_rate": 2e-05, "loss": 6.3512, "step": 74 }, { "epoch": 1.4428399518652226, "grad_norm": 18.616239547729492, "learning_rate": 2e-05, "loss": 6.9013, "step": 75 }, { "epoch": 1.4428399518652226, "eval_clap": 0.05818195268511772, "eval_loss": 5.64979362487793, "eval_runtime": 195.9522, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 75 }, { "epoch": 1.4620938628158844, "grad_norm": 27.710369110107422, "learning_rate": 2e-05, "loss": 6.0035, "step": 76 }, { "epoch": 1.4813477737665464, "grad_norm": 20.711942672729492, "learning_rate": 2e-05, "loss": 6.1597, "step": 77 }, { "epoch": 1.5006016847172083, "grad_norm": 11.78789234161377, "learning_rate": 2e-05, "loss": 6.4754, "step": 78 }, { "epoch": 
1.5198555956678699, "grad_norm": 18.79988670349121, "learning_rate": 2e-05, "loss": 6.7886, "step": 79 }, { "epoch": 1.5391095066185319, "grad_norm": 9.708053588867188, "learning_rate": 2e-05, "loss": 6.4123, "step": 80 }, { "epoch": 1.5583634175691938, "grad_norm": 10.420573234558105, "learning_rate": 2e-05, "loss": 6.3586, "step": 81 }, { "epoch": 1.5776173285198556, "grad_norm": 12.453520774841309, "learning_rate": 2e-05, "loss": 6.2891, "step": 82 }, { "epoch": 1.5968712394705173, "grad_norm": 18.412593841552734, "learning_rate": 2e-05, "loss": 6.0656, "step": 83 }, { "epoch": 1.6161251504211793, "grad_norm": 15.798463821411133, "learning_rate": 2e-05, "loss": 6.5867, "step": 84 }, { "epoch": 1.6353790613718413, "grad_norm": 9.848593711853027, "learning_rate": 2e-05, "loss": 6.3659, "step": 85 }, { "epoch": 1.654632972322503, "grad_norm": 10.542499542236328, "learning_rate": 2e-05, "loss": 6.1898, "step": 86 }, { "epoch": 1.6738868832731648, "grad_norm": 10.715902328491211, "learning_rate": 2e-05, "loss": 6.2704, "step": 87 }, { "epoch": 1.6931407942238268, "grad_norm": 10.027088165283203, "learning_rate": 2e-05, "loss": 6.1835, "step": 88 }, { "epoch": 1.7123947051744886, "grad_norm": 17.48147964477539, "learning_rate": 2e-05, "loss": 6.0763, "step": 89 }, { "epoch": 1.7316486161251503, "grad_norm": 17.746797561645508, "learning_rate": 2e-05, "loss": 5.8703, "step": 90 }, { "epoch": 1.7509025270758123, "grad_norm": 28.819969177246094, "learning_rate": 2e-05, "loss": 6.8541, "step": 91 }, { "epoch": 1.7701564380264743, "grad_norm": 13.42017936706543, "learning_rate": 2e-05, "loss": 6.2243, "step": 92 }, { "epoch": 1.789410348977136, "grad_norm": 17.955583572387695, "learning_rate": 2e-05, "loss": 6.5344, "step": 93 }, { "epoch": 1.8086642599277978, "grad_norm": 22.70500946044922, "learning_rate": 2e-05, "loss": 6.6459, "step": 94 }, { "epoch": 1.8279181708784598, "grad_norm": 13.345677375793457, "learning_rate": 2e-05, "loss": 6.425, "step": 95 }, { "epoch": 
1.8471720818291215, "grad_norm": 13.83546257019043, "learning_rate": 2e-05, "loss": 6.4378, "step": 96 }, { "epoch": 1.8664259927797833, "grad_norm": 23.364137649536133, "learning_rate": 2e-05, "loss": 5.854, "step": 97 }, { "epoch": 1.8856799037304453, "grad_norm": 8.366693496704102, "learning_rate": 2e-05, "loss": 6.2678, "step": 98 }, { "epoch": 1.9049338146811072, "grad_norm": 14.662858009338379, "learning_rate": 2e-05, "loss": 5.9462, "step": 99 }, { "epoch": 1.924187725631769, "grad_norm": 12.826397895812988, "learning_rate": 2e-05, "loss": 6.0195, "step": 100 }, { "epoch": 1.924187725631769, "eval_clap": 0.14504113793373108, "eval_loss": 5.763121604919434, "eval_runtime": 204.238, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 100 }, { "epoch": 1.9434416365824307, "grad_norm": 11.27551555633545, "learning_rate": 2e-05, "loss": 6.1058, "step": 101 }, { "epoch": 1.9626955475330927, "grad_norm": 16.934494018554688, "learning_rate": 2e-05, "loss": 6.6169, "step": 102 }, { "epoch": 1.9819494584837545, "grad_norm": 10.172548294067383, "learning_rate": 2e-05, "loss": 6.085, "step": 103 }, { "epoch": 2.0, "grad_norm": 21.66511344909668, "learning_rate": 2e-05, "loss": 6.374, "step": 104 }, { "epoch": 2.019253910950662, "grad_norm": 14.909863471984863, "learning_rate": 2e-05, "loss": 6.5841, "step": 105 }, { "epoch": 2.0385078219013235, "grad_norm": 13.277375221252441, "learning_rate": 2e-05, "loss": 6.0897, "step": 106 }, { "epoch": 2.0577617328519855, "grad_norm": 14.995245933532715, "learning_rate": 2e-05, "loss": 6.6047, "step": 107 }, { "epoch": 2.0770156438026475, "grad_norm": 8.810375213623047, "learning_rate": 2e-05, "loss": 6.4019, "step": 108 }, { "epoch": 2.0962695547533094, "grad_norm": 9.020201683044434, "learning_rate": 2e-05, "loss": 6.1859, "step": 109 }, { "epoch": 2.115523465703971, "grad_norm": 13.133101463317871, "learning_rate": 2e-05, "loss": 6.1151, "step": 110 }, { "epoch": 2.134777376654633, "grad_norm": 
9.436896324157715, "learning_rate": 2e-05, "loss": 6.5699, "step": 111 }, { "epoch": 2.154031287605295, "grad_norm": 7.937966346740723, "learning_rate": 2e-05, "loss": 6.4376, "step": 112 }, { "epoch": 2.1732851985559565, "grad_norm": 9.059781074523926, "learning_rate": 2e-05, "loss": 6.514, "step": 113 }, { "epoch": 2.1925391095066185, "grad_norm": 6.838661193847656, "learning_rate": 2e-05, "loss": 6.4801, "step": 114 }, { "epoch": 2.2117930204572804, "grad_norm": 21.459503173828125, "learning_rate": 2e-05, "loss": 5.9983, "step": 115 }, { "epoch": 2.2310469314079424, "grad_norm": 10.961411476135254, "learning_rate": 2e-05, "loss": 6.257, "step": 116 }, { "epoch": 2.250300842358604, "grad_norm": 24.96747398376465, "learning_rate": 2e-05, "loss": 6.0786, "step": 117 }, { "epoch": 2.269554753309266, "grad_norm": 10.531516075134277, "learning_rate": 2e-05, "loss": 6.3389, "step": 118 }, { "epoch": 2.288808664259928, "grad_norm": 12.07296085357666, "learning_rate": 2e-05, "loss": 6.2715, "step": 119 }, { "epoch": 2.30806257521059, "grad_norm": 8.665770530700684, "learning_rate": 2e-05, "loss": 6.3378, "step": 120 }, { "epoch": 2.3273164861612514, "grad_norm": 11.358579635620117, "learning_rate": 2e-05, "loss": 6.2564, "step": 121 }, { "epoch": 2.3465703971119134, "grad_norm": 13.47236156463623, "learning_rate": 2e-05, "loss": 6.1455, "step": 122 }, { "epoch": 2.3658243080625754, "grad_norm": 6.947666168212891, "learning_rate": 2e-05, "loss": 6.3356, "step": 123 }, { "epoch": 2.385078219013237, "grad_norm": 13.422441482543945, "learning_rate": 2e-05, "loss": 6.5743, "step": 124 }, { "epoch": 2.404332129963899, "grad_norm": 8.865825653076172, "learning_rate": 2e-05, "loss": 6.447, "step": 125 }, { "epoch": 2.404332129963899, "eval_clap": 0.11870799958705902, "eval_loss": 5.7946672439575195, "eval_runtime": 196.002, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 125 }, { "epoch": 2.423586040914561, "grad_norm": 14.267729759216309, 
"learning_rate": 2e-05, "loss": 6.1854, "step": 126 }, { "epoch": 2.4428399518652224, "grad_norm": 11.992257118225098, "learning_rate": 2e-05, "loss": 6.4266, "step": 127 }, { "epoch": 2.4620938628158844, "grad_norm": 8.23071575164795, "learning_rate": 2e-05, "loss": 6.1821, "step": 128 }, { "epoch": 2.4813477737665464, "grad_norm": 14.663762092590332, "learning_rate": 2e-05, "loss": 5.8956, "step": 129 }, { "epoch": 2.5006016847172083, "grad_norm": 19.018505096435547, "learning_rate": 2e-05, "loss": 6.748, "step": 130 }, { "epoch": 2.51985559566787, "grad_norm": 15.996222496032715, "learning_rate": 2e-05, "loss": 6.6005, "step": 131 }, { "epoch": 2.539109506618532, "grad_norm": 15.696185111999512, "learning_rate": 2e-05, "loss": 5.94, "step": 132 }, { "epoch": 2.558363417569194, "grad_norm": 28.96451759338379, "learning_rate": 2e-05, "loss": 5.6643, "step": 133 }, { "epoch": 2.577617328519856, "grad_norm": 10.068239212036133, "learning_rate": 2e-05, "loss": 6.0527, "step": 134 }, { "epoch": 2.5968712394705173, "grad_norm": 20.770143508911133, "learning_rate": 2e-05, "loss": 5.8241, "step": 135 }, { "epoch": 2.6161251504211793, "grad_norm": 8.34460735321045, "learning_rate": 2e-05, "loss": 6.2976, "step": 136 }, { "epoch": 2.6353790613718413, "grad_norm": 7.031894683837891, "learning_rate": 2e-05, "loss": 6.1223, "step": 137 }, { "epoch": 2.654632972322503, "grad_norm": 13.371209144592285, "learning_rate": 2e-05, "loss": 6.4443, "step": 138 }, { "epoch": 2.673886883273165, "grad_norm": 13.32111930847168, "learning_rate": 2e-05, "loss": 6.0339, "step": 139 }, { "epoch": 2.693140794223827, "grad_norm": 12.334694862365723, "learning_rate": 2e-05, "loss": 6.1672, "step": 140 }, { "epoch": 2.7123947051744883, "grad_norm": 18.95940589904785, "learning_rate": 2e-05, "loss": 5.7344, "step": 141 }, { "epoch": 2.7316486161251503, "grad_norm": 16.167844772338867, "learning_rate": 2e-05, "loss": 5.8938, "step": 142 }, { "epoch": 2.7509025270758123, "grad_norm": 
13.141815185546875, "learning_rate": 2e-05, "loss": 5.7477, "step": 143 }, { "epoch": 2.7701564380264743, "grad_norm": 9.945357322692871, "learning_rate": 2e-05, "loss": 6.0525, "step": 144 }, { "epoch": 2.7894103489771362, "grad_norm": 18.400161743164062, "learning_rate": 2e-05, "loss": 6.4865, "step": 145 }, { "epoch": 2.808664259927798, "grad_norm": 9.252588272094727, "learning_rate": 2e-05, "loss": 6.1021, "step": 146 }, { "epoch": 2.8279181708784598, "grad_norm": 9.42403793334961, "learning_rate": 2e-05, "loss": 5.9196, "step": 147 }, { "epoch": 2.8471720818291217, "grad_norm": 16.049497604370117, "learning_rate": 2e-05, "loss": 6.3472, "step": 148 }, { "epoch": 2.8664259927797833, "grad_norm": 16.170551300048828, "learning_rate": 2e-05, "loss": 6.4077, "step": 149 }, { "epoch": 2.8856799037304453, "grad_norm": 13.0892915725708, "learning_rate": 2e-05, "loss": 6.215, "step": 150 }, { "epoch": 2.8856799037304453, "eval_clap": 0.11637458205223083, "eval_loss": 5.839620590209961, "eval_runtime": 203.9479, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 150 }, { "epoch": 2.9049338146811072, "grad_norm": 11.786327362060547, "learning_rate": 2e-05, "loss": 6.3015, "step": 151 }, { "epoch": 2.9241877256317688, "grad_norm": 8.937137603759766, "learning_rate": 2e-05, "loss": 6.1963, "step": 152 }, { "epoch": 2.9434416365824307, "grad_norm": 15.840847969055176, "learning_rate": 2e-05, "loss": 6.4957, "step": 153 }, { "epoch": 2.9626955475330927, "grad_norm": 11.317403793334961, "learning_rate": 2e-05, "loss": 6.4699, "step": 154 }, { "epoch": 2.9819494584837543, "grad_norm": 11.2737398147583, "learning_rate": 2e-05, "loss": 6.0559, "step": 155 }, { "epoch": 3.0, "grad_norm": 15.351630210876465, "learning_rate": 2e-05, "loss": 5.5406, "step": 156 }, { "epoch": 3.019253910950662, "grad_norm": 10.597087860107422, "learning_rate": 2e-05, "loss": 6.5337, "step": 157 }, { "epoch": 3.0385078219013235, "grad_norm": 8.941291809082031, "learning_rate": 
2e-05, "loss": 6.2708, "step": 158 }, { "epoch": 3.0577617328519855, "grad_norm": 15.444201469421387, "learning_rate": 2e-05, "loss": 6.1448, "step": 159 }, { "epoch": 3.0770156438026475, "grad_norm": 8.186105728149414, "learning_rate": 2e-05, "loss": 6.2508, "step": 160 }, { "epoch": 3.0962695547533094, "grad_norm": 25.041446685791016, "learning_rate": 2e-05, "loss": 5.9141, "step": 161 }, { "epoch": 3.115523465703971, "grad_norm": 25.764001846313477, "learning_rate": 2e-05, "loss": 5.7969, "step": 162 }, { "epoch": 3.134777376654633, "grad_norm": 8.077290534973145, "learning_rate": 2e-05, "loss": 6.485, "step": 163 }, { "epoch": 3.154031287605295, "grad_norm": 6.288991451263428, "learning_rate": 2e-05, "loss": 6.3003, "step": 164 }, { "epoch": 3.1732851985559565, "grad_norm": 7.810708522796631, "learning_rate": 2e-05, "loss": 6.191, "step": 165 }, { "epoch": 3.1925391095066185, "grad_norm": 12.891411781311035, "learning_rate": 2e-05, "loss": 5.9668, "step": 166 }, { "epoch": 3.2117930204572804, "grad_norm": 9.540118217468262, "learning_rate": 2e-05, "loss": 6.093, "step": 167 }, { "epoch": 3.2310469314079424, "grad_norm": 9.51327896118164, "learning_rate": 2e-05, "loss": 6.1793, "step": 168 }, { "epoch": 3.250300842358604, "grad_norm": 16.127683639526367, "learning_rate": 2e-05, "loss": 6.0557, "step": 169 }, { "epoch": 3.269554753309266, "grad_norm": 17.678020477294922, "learning_rate": 2e-05, "loss": 6.5721, "step": 170 }, { "epoch": 3.288808664259928, "grad_norm": 11.319859504699707, "learning_rate": 2e-05, "loss": 5.9967, "step": 171 }, { "epoch": 3.30806257521059, "grad_norm": 15.657870292663574, "learning_rate": 2e-05, "loss": 5.9343, "step": 172 }, { "epoch": 3.3273164861612514, "grad_norm": 15.364697456359863, "learning_rate": 2e-05, "loss": 6.2908, "step": 173 }, { "epoch": 3.3465703971119134, "grad_norm": 23.229448318481445, "learning_rate": 2e-05, "loss": 5.7732, "step": 174 }, { "epoch": 3.3658243080625754, "grad_norm": 13.746365547180176, 
"learning_rate": 2e-05, "loss": 6.4075, "step": 175 }, { "epoch": 3.3658243080625754, "eval_clap": 0.11743690818548203, "eval_loss": 5.836143493652344, "eval_runtime": 204.2948, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 175 }, { "epoch": 3.385078219013237, "grad_norm": 9.525516510009766, "learning_rate": 2e-05, "loss": 6.0857, "step": 176 }, { "epoch": 3.404332129963899, "grad_norm": 18.301898956298828, "learning_rate": 2e-05, "loss": 6.4325, "step": 177 }, { "epoch": 3.423586040914561, "grad_norm": 12.342935562133789, "learning_rate": 2e-05, "loss": 6.1896, "step": 178 }, { "epoch": 3.4428399518652224, "grad_norm": 12.635440826416016, "learning_rate": 2e-05, "loss": 6.2818, "step": 179 }, { "epoch": 3.4620938628158844, "grad_norm": 10.180573463439941, "learning_rate": 2e-05, "loss": 6.2591, "step": 180 }, { "epoch": 3.4813477737665464, "grad_norm": 9.508129119873047, "learning_rate": 2e-05, "loss": 6.2806, "step": 181 }, { "epoch": 3.5006016847172083, "grad_norm": 10.56059455871582, "learning_rate": 2e-05, "loss": 6.459, "step": 182 }, { "epoch": 3.51985559566787, "grad_norm": 10.22647762298584, "learning_rate": 2e-05, "loss": 6.0333, "step": 183 }, { "epoch": 3.539109506618532, "grad_norm": 8.610014915466309, "learning_rate": 2e-05, "loss": 6.4448, "step": 184 }, { "epoch": 3.558363417569194, "grad_norm": 12.473418235778809, "learning_rate": 2e-05, "loss": 6.0245, "step": 185 }, { "epoch": 3.577617328519856, "grad_norm": 15.734831809997559, "learning_rate": 2e-05, "loss": 5.9932, "step": 186 }, { "epoch": 3.5968712394705173, "grad_norm": 8.682183265686035, "learning_rate": 2e-05, "loss": 6.2845, "step": 187 }, { "epoch": 3.6161251504211793, "grad_norm": 21.06833839416504, "learning_rate": 2e-05, "loss": 5.8533, "step": 188 }, { "epoch": 3.6353790613718413, "grad_norm": 22.429513931274414, "learning_rate": 2e-05, "loss": 5.7251, "step": 189 }, { "epoch": 3.654632972322503, "grad_norm": 7.118320941925049, "learning_rate": 2e-05, 
"loss": 6.2949, "step": 190 }, { "epoch": 3.673886883273165, "grad_norm": 11.422521591186523, "learning_rate": 2e-05, "loss": 6.1418, "step": 191 }, { "epoch": 3.693140794223827, "grad_norm": 11.730757713317871, "learning_rate": 2e-05, "loss": 6.0665, "step": 192 }, { "epoch": 3.7123947051744883, "grad_norm": 7.965202331542969, "learning_rate": 2e-05, "loss": 6.3251, "step": 193 }, { "epoch": 3.7316486161251503, "grad_norm": 9.634132385253906, "learning_rate": 2e-05, "loss": 6.0327, "step": 194 }, { "epoch": 3.7509025270758123, "grad_norm": 12.675938606262207, "learning_rate": 2e-05, "loss": 6.4535, "step": 195 }, { "epoch": 3.7701564380264743, "grad_norm": 11.747628211975098, "learning_rate": 2e-05, "loss": 5.8515, "step": 196 }, { "epoch": 3.7894103489771362, "grad_norm": 9.878296852111816, "learning_rate": 2e-05, "loss": 6.3101, "step": 197 }, { "epoch": 3.808664259927798, "grad_norm": 8.255465507507324, "learning_rate": 2e-05, "loss": 6.1733, "step": 198 }, { "epoch": 3.8279181708784598, "grad_norm": 15.391761779785156, "learning_rate": 2e-05, "loss": 6.4827, "step": 199 }, { "epoch": 3.8471720818291217, "grad_norm": 16.02042579650879, "learning_rate": 2e-05, "loss": 5.797, "step": 200 }, { "epoch": 3.8471720818291217, "eval_clap": 0.10931383073329926, "eval_loss": 5.815006256103516, "eval_runtime": 204.8855, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 200 }, { "epoch": 3.8664259927797833, "grad_norm": 11.096480369567871, "learning_rate": 2e-05, "loss": 6.3396, "step": 201 }, { "epoch": 3.8856799037304453, "grad_norm": 17.267515182495117, "learning_rate": 2e-05, "loss": 5.7346, "step": 202 }, { "epoch": 3.9049338146811072, "grad_norm": 10.836710929870605, "learning_rate": 2e-05, "loss": 6.3001, "step": 203 }, { "epoch": 3.9241877256317688, "grad_norm": 6.178803443908691, "learning_rate": 2e-05, "loss": 6.1283, "step": 204 }, { "epoch": 3.9434416365824307, "grad_norm": 8.270339965820312, "learning_rate": 2e-05, "loss": 6.2692, 
"step": 205 }, { "epoch": 3.9626955475330927, "grad_norm": 8.276531219482422, "learning_rate": 2e-05, "loss": 6.2181, "step": 206 }, { "epoch": 3.9819494584837543, "grad_norm": 9.491342544555664, "learning_rate": 2e-05, "loss": 6.4109, "step": 207 }, { "epoch": 4.0, "grad_norm": 16.148122787475586, "learning_rate": 2e-05, "loss": 5.4716, "step": 208 }, { "epoch": 4.0192539109506615, "grad_norm": 5.439089298248291, "learning_rate": 2e-05, "loss": 6.1773, "step": 209 }, { "epoch": 4.038507821901324, "grad_norm": 8.583178520202637, "learning_rate": 2e-05, "loss": 6.097, "step": 210 }, { "epoch": 4.0577617328519855, "grad_norm": 5.862834453582764, "learning_rate": 2e-05, "loss": 6.2586, "step": 211 }, { "epoch": 4.077015643802647, "grad_norm": 19.868268966674805, "learning_rate": 2e-05, "loss": 5.7732, "step": 212 }, { "epoch": 4.0962695547533094, "grad_norm": 8.219894409179688, "learning_rate": 2e-05, "loss": 6.1416, "step": 213 }, { "epoch": 4.115523465703971, "grad_norm": 8.35651683807373, "learning_rate": 2e-05, "loss": 6.3144, "step": 214 }, { "epoch": 4.1347773766546325, "grad_norm": 9.109415054321289, "learning_rate": 2e-05, "loss": 6.1057, "step": 215 }, { "epoch": 4.154031287605295, "grad_norm": 7.91225004196167, "learning_rate": 2e-05, "loss": 6.3672, "step": 216 }, { "epoch": 4.1732851985559565, "grad_norm": 13.663270950317383, "learning_rate": 2e-05, "loss": 6.104, "step": 217 }, { "epoch": 4.192539109506619, "grad_norm": 7.140188694000244, "learning_rate": 2e-05, "loss": 6.214, "step": 218 }, { "epoch": 4.21179302045728, "grad_norm": 20.248258590698242, "learning_rate": 2e-05, "loss": 5.7409, "step": 219 }, { "epoch": 4.231046931407942, "grad_norm": 6.1542487144470215, "learning_rate": 2e-05, "loss": 6.1971, "step": 220 }, { "epoch": 4.250300842358604, "grad_norm": 16.50935935974121, "learning_rate": 2e-05, "loss": 5.7022, "step": 221 }, { "epoch": 4.269554753309266, "grad_norm": 6.602309226989746, "learning_rate": 2e-05, "loss": 6.0883, "step": 222 }, { 
"epoch": 4.2888086642599275, "grad_norm": 10.636795043945312, "learning_rate": 2e-05, "loss": 6.1146, "step": 223 }, { "epoch": 4.30806257521059, "grad_norm": 15.217754364013672, "learning_rate": 2e-05, "loss": 6.4887, "step": 224 }, { "epoch": 4.327316486161251, "grad_norm": 11.162585258483887, "learning_rate": 2e-05, "loss": 6.347, "step": 225 }, { "epoch": 4.327316486161251, "eval_clap": 0.10287218540906906, "eval_loss": 5.793514251708984, "eval_runtime": 195.2618, "eval_samples_per_second": 0.164, "eval_steps_per_second": 0.164, "step": 225 }, { "epoch": 4.346570397111913, "grad_norm": 6.50248908996582, "learning_rate": 2e-05, "loss": 6.128, "step": 226 }, { "epoch": 4.365824308062575, "grad_norm": 12.357426643371582, "learning_rate": 2e-05, "loss": 6.2944, "step": 227 }, { "epoch": 4.385078219013237, "grad_norm": 11.979768753051758, "learning_rate": 2e-05, "loss": 6.0331, "step": 228 }, { "epoch": 4.404332129963899, "grad_norm": 14.140297889709473, "learning_rate": 2e-05, "loss": 5.8348, "step": 229 }, { "epoch": 4.423586040914561, "grad_norm": 13.665949821472168, "learning_rate": 2e-05, "loss": 5.9629, "step": 230 }, { "epoch": 4.442839951865222, "grad_norm": 25.926544189453125, "learning_rate": 2e-05, "loss": 5.5899, "step": 231 }, { "epoch": 4.462093862815885, "grad_norm": 11.478584289550781, "learning_rate": 2e-05, "loss": 6.06, "step": 232 }, { "epoch": 4.481347773766546, "grad_norm": 7.444418907165527, "learning_rate": 2e-05, "loss": 6.0127, "step": 233 }, { "epoch": 4.500601684717208, "grad_norm": 11.302255630493164, "learning_rate": 2e-05, "loss": 6.3942, "step": 234 }, { "epoch": 4.51985559566787, "grad_norm": 12.104110717773438, "learning_rate": 2e-05, "loss": 6.0193, "step": 235 }, { "epoch": 4.539109506618532, "grad_norm": 11.314817428588867, "learning_rate": 2e-05, "loss": 5.8347, "step": 236 }, { "epoch": 4.558363417569193, "grad_norm": 9.530527114868164, "learning_rate": 2e-05, "loss": 5.9688, "step": 237 }, { "epoch": 4.577617328519856, 
"grad_norm": 14.764016151428223, "learning_rate": 2e-05, "loss": 6.4083, "step": 238 }, { "epoch": 4.596871239470517, "grad_norm": 12.319718360900879, "learning_rate": 2e-05, "loss": 6.2156, "step": 239 }, { "epoch": 4.61612515042118, "grad_norm": 14.426762580871582, "learning_rate": 2e-05, "loss": 6.3868, "step": 240 }, { "epoch": 4.635379061371841, "grad_norm": 11.573673248291016, "learning_rate": 2e-05, "loss": 6.2762, "step": 241 }, { "epoch": 4.654632972322503, "grad_norm": 6.983804702758789, "learning_rate": 2e-05, "loss": 6.0903, "step": 242 }, { "epoch": 4.673886883273164, "grad_norm": 8.895957946777344, "learning_rate": 2e-05, "loss": 6.3329, "step": 243 }, { "epoch": 4.693140794223827, "grad_norm": 11.459211349487305, "learning_rate": 2e-05, "loss": 6.532, "step": 244 }, { "epoch": 4.712394705174488, "grad_norm": 7.807408809661865, "learning_rate": 2e-05, "loss": 6.4127, "step": 245 }, { "epoch": 4.731648616125151, "grad_norm": 11.404520034790039, "learning_rate": 2e-05, "loss": 6.031, "step": 246 }, { "epoch": 4.750902527075812, "grad_norm": 5.500706672668457, "learning_rate": 2e-05, "loss": 6.3023, "step": 247 }, { "epoch": 4.770156438026474, "grad_norm": 12.774914741516113, "learning_rate": 2e-05, "loss": 6.1873, "step": 248 }, { "epoch": 4.789410348977136, "grad_norm": null, "learning_rate": 2e-05, "loss": 6.1218, "step": 249 }, { "epoch": 4.808664259927798, "grad_norm": 37.46445083618164, "learning_rate": 2e-05, "loss": 5.3935, "step": 250 }, { "epoch": 4.808664259927798, "eval_clap": 0.10396266728639603, "eval_loss": 5.760514259338379, "eval_runtime": 196.0139, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 250 }, { "epoch": 4.827918170878459, "grad_norm": null, "learning_rate": 2e-05, "loss": 6.6959, "step": 251 }, { "epoch": 4.847172081829122, "grad_norm": 19.07754135131836, "learning_rate": 2e-05, "loss": 5.9438, "step": 252 }, { "epoch": 4.866425992779783, "grad_norm": 7.950329780578613, "learning_rate": 2e-05, "loss": 
6.2661, "step": 253 }, { "epoch": 4.885679903730445, "grad_norm": 6.870863437652588, "learning_rate": 2e-05, "loss": 6.3211, "step": 254 }, { "epoch": 4.904933814681107, "grad_norm": 24.65618133544922, "learning_rate": 2e-05, "loss": 5.755, "step": 255 }, { "epoch": 4.924187725631769, "grad_norm": 18.71941566467285, "learning_rate": 2e-05, "loss": 5.8712, "step": 256 }, { "epoch": 4.943441636582431, "grad_norm": 22.989364624023438, "learning_rate": 2e-05, "loss": 5.9747, "step": 257 }, { "epoch": 4.962695547533093, "grad_norm": 9.502798080444336, "learning_rate": 2e-05, "loss": 6.2862, "step": 258 }, { "epoch": 4.981949458483754, "grad_norm": 7.668398857116699, "learning_rate": 2e-05, "loss": 6.0501, "step": 259 }, { "epoch": 5.0, "grad_norm": 11.83493423461914, "learning_rate": 2e-05, "loss": 5.7551, "step": 260 }, { "epoch": 5.0192539109506615, "grad_norm": 10.512737274169922, "learning_rate": 2e-05, "loss": 6.1863, "step": 261 }, { "epoch": 5.038507821901324, "grad_norm": 11.778082847595215, "learning_rate": 2e-05, "loss": 6.2453, "step": 262 }, { "epoch": 5.0577617328519855, "grad_norm": 8.815237998962402, "learning_rate": 2e-05, "loss": 6.0878, "step": 263 }, { "epoch": 5.077015643802647, "grad_norm": 8.149435997009277, "learning_rate": 2e-05, "loss": 6.109, "step": 264 }, { "epoch": 5.0962695547533094, "grad_norm": 17.911197662353516, "learning_rate": 2e-05, "loss": 6.5584, "step": 265 }, { "epoch": 5.115523465703971, "grad_norm": 15.374232292175293, "learning_rate": 2e-05, "loss": 5.6938, "step": 266 }, { "epoch": 5.1347773766546325, "grad_norm": 14.54749870300293, "learning_rate": 2e-05, "loss": 6.0208, "step": 267 }, { "epoch": 5.154031287605295, "grad_norm": 22.222078323364258, "learning_rate": 2e-05, "loss": 6.6856, "step": 268 }, { "epoch": 5.1732851985559565, "grad_norm": 14.544611930847168, "learning_rate": 2e-05, "loss": 6.5379, "step": 269 }, { "epoch": 5.192539109506619, "grad_norm": 7.748518466949463, "learning_rate": 2e-05, "loss": 6.124, "step": 
270 }, { "epoch": 5.21179302045728, "grad_norm": 38.051204681396484, "learning_rate": 2e-05, "loss": 5.5926, "step": 271 }, { "epoch": 5.231046931407942, "grad_norm": 8.9555025100708, "learning_rate": 2e-05, "loss": 6.2883, "step": 272 }, { "epoch": 5.250300842358604, "grad_norm": 8.490158081054688, "learning_rate": 2e-05, "loss": 6.1882, "step": 273 }, { "epoch": 5.269554753309266, "grad_norm": 23.794368743896484, "learning_rate": 2e-05, "loss": 5.7156, "step": 274 }, { "epoch": 5.2888086642599275, "grad_norm": 9.640610694885254, "learning_rate": 2e-05, "loss": 6.0328, "step": 275 }, { "epoch": 5.2888086642599275, "eval_clap": 0.10059554874897003, "eval_loss": 5.7012224197387695, "eval_runtime": 196.9351, "eval_samples_per_second": 0.162, "eval_steps_per_second": 0.162, "step": 275 }, { "epoch": 5.30806257521059, "grad_norm": 10.774741172790527, "learning_rate": 2e-05, "loss": 6.0371, "step": 276 }, { "epoch": 5.327316486161251, "grad_norm": 31.07454490661621, "learning_rate": 2e-05, "loss": 5.4832, "step": 277 }, { "epoch": 5.346570397111913, "grad_norm": 20.25762939453125, "learning_rate": 2e-05, "loss": 5.829, "step": 278 }, { "epoch": 5.365824308062575, "grad_norm": 14.205655097961426, "learning_rate": 2e-05, "loss": 5.8925, "step": 279 }, { "epoch": 5.385078219013237, "grad_norm": 10.45982551574707, "learning_rate": 2e-05, "loss": 5.9619, "step": 280 }, { "epoch": 5.404332129963899, "grad_norm": 10.787467956542969, "learning_rate": 2e-05, "loss": 6.2064, "step": 281 }, { "epoch": 5.423586040914561, "grad_norm": 14.345209121704102, "learning_rate": 2e-05, "loss": 6.4463, "step": 282 }, { "epoch": 5.442839951865222, "grad_norm": 11.769346237182617, "learning_rate": 2e-05, "loss": 6.2416, "step": 283 }, { "epoch": 5.462093862815885, "grad_norm": 10.520780563354492, "learning_rate": 2e-05, "loss": 5.9882, "step": 284 }, { "epoch": 5.481347773766546, "grad_norm": 9.365582466125488, "learning_rate": 2e-05, "loss": 6.1293, "step": 285 }, { "epoch": 
5.500601684717208, "grad_norm": 11.546175003051758, "learning_rate": 2e-05, "loss": 6.1385, "step": 286 }, { "epoch": 5.51985559566787, "grad_norm": 8.784673690795898, "learning_rate": 2e-05, "loss": 5.9021, "step": 287 }, { "epoch": 5.539109506618532, "grad_norm": 14.96414566040039, "learning_rate": 2e-05, "loss": 5.7696, "step": 288 }, { "epoch": 5.558363417569193, "grad_norm": 10.28773307800293, "learning_rate": 2e-05, "loss": 5.8558, "step": 289 }, { "epoch": 5.577617328519856, "grad_norm": 8.956218719482422, "learning_rate": 2e-05, "loss": 6.1751, "step": 290 }, { "epoch": 5.596871239470517, "grad_norm": 8.992794036865234, "learning_rate": 2e-05, "loss": 5.9301, "step": 291 }, { "epoch": 5.61612515042118, "grad_norm": 15.411934852600098, "learning_rate": 2e-05, "loss": 5.7903, "step": 292 }, { "epoch": 5.635379061371841, "grad_norm": 17.23996925354004, "learning_rate": 2e-05, "loss": 5.7608, "step": 293 }, { "epoch": 5.654632972322503, "grad_norm": 9.80339241027832, "learning_rate": 2e-05, "loss": 5.9715, "step": 294 }, { "epoch": 5.673886883273164, "grad_norm": 9.020657539367676, "learning_rate": 2e-05, "loss": 5.9026, "step": 295 }, { "epoch": 5.693140794223827, "grad_norm": 10.856344223022461, "learning_rate": 2e-05, "loss": 5.9779, "step": 296 }, { "epoch": 5.712394705174488, "grad_norm": 8.707733154296875, "learning_rate": 2e-05, "loss": 6.1238, "step": 297 }, { "epoch": 5.731648616125151, "grad_norm": 14.64277172088623, "learning_rate": 2e-05, "loss": 6.0237, "step": 298 }, { "epoch": 5.750902527075812, "grad_norm": 11.925372123718262, "learning_rate": 2e-05, "loss": 6.393, "step": 299 }, { "epoch": 5.770156438026474, "grad_norm": 8.235823631286621, "learning_rate": 2e-05, "loss": 6.1476, "step": 300 }, { "epoch": 5.770156438026474, "eval_clap": 0.08804576098918915, "eval_loss": 5.670257568359375, "eval_runtime": 196.4571, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 300 }, { "epoch": 5.789410348977136, "grad_norm": 
10.681437492370605, "learning_rate": 2e-05, "loss": 6.5, "step": 301 }, { "epoch": 5.808664259927798, "grad_norm": 7.725464820861816, "learning_rate": 2e-05, "loss": 6.3363, "step": 302 }, { "epoch": 5.827918170878459, "grad_norm": 23.754108428955078, "learning_rate": 2e-05, "loss": 5.4885, "step": 303 }, { "epoch": 5.847172081829122, "grad_norm": 16.799196243286133, "learning_rate": 2e-05, "loss": 5.8577, "step": 304 }, { "epoch": 5.866425992779783, "grad_norm": 18.410417556762695, "learning_rate": 2e-05, "loss": 5.7364, "step": 305 }, { "epoch": 5.885679903730445, "grad_norm": 13.101805686950684, "learning_rate": 2e-05, "loss": 6.1962, "step": 306 }, { "epoch": 5.904933814681107, "grad_norm": 20.438919067382812, "learning_rate": 2e-05, "loss": 5.6507, "step": 307 }, { "epoch": 5.924187725631769, "grad_norm": 16.583629608154297, "learning_rate": 2e-05, "loss": 5.6965, "step": 308 }, { "epoch": 5.943441636582431, "grad_norm": 12.213188171386719, "learning_rate": 2e-05, "loss": 5.9619, "step": 309 }, { "epoch": 5.962695547533093, "grad_norm": 10.092710494995117, "learning_rate": 2e-05, "loss": 5.9366, "step": 310 }, { "epoch": 5.981949458483754, "grad_norm": 11.882303237915039, "learning_rate": 2e-05, "loss": 5.653, "step": 311 }, { "epoch": 6.0, "grad_norm": 14.35622501373291, "learning_rate": 2e-05, "loss": 5.3527, "step": 312 }, { "epoch": 6.0192539109506615, "grad_norm": 15.661309242248535, "learning_rate": 2e-05, "loss": 5.9525, "step": 313 }, { "epoch": 6.038507821901324, "grad_norm": 20.924646377563477, "learning_rate": 2e-05, "loss": 6.5313, "step": 314 }, { "epoch": 6.0577617328519855, "grad_norm": 14.876949310302734, "learning_rate": 2e-05, "loss": 5.6237, "step": 315 }, { "epoch": 6.077015643802647, "grad_norm": 14.90765380859375, "learning_rate": 2e-05, "loss": 5.9514, "step": 316 }, { "epoch": 6.0962695547533094, "grad_norm": 11.085007667541504, "learning_rate": 2e-05, "loss": 6.3012, "step": 317 }, { "epoch": 6.115523465703971, "grad_norm": 
7.102397918701172, "learning_rate": 2e-05, "loss": 6.2968, "step": 318 }, { "epoch": 6.1347773766546325, "grad_norm": 10.636114120483398, "learning_rate": 2e-05, "loss": 6.0036, "step": 319 }, { "epoch": 6.154031287605295, "grad_norm": 9.180078506469727, "learning_rate": 2e-05, "loss": 5.8668, "step": 320 }, { "epoch": 6.1732851985559565, "grad_norm": 8.283641815185547, "learning_rate": 2e-05, "loss": 5.9111, "step": 321 }, { "epoch": 6.192539109506619, "grad_norm": 16.489126205444336, "learning_rate": 2e-05, "loss": 5.7074, "step": 322 }, { "epoch": 6.21179302045728, "grad_norm": 8.536720275878906, "learning_rate": 2e-05, "loss": 6.0873, "step": 323 }, { "epoch": 6.231046931407942, "grad_norm": 19.378023147583008, "learning_rate": 2e-05, "loss": 5.7451, "step": 324 }, { "epoch": 6.250300842358604, "grad_norm": 8.368978500366211, "learning_rate": 2e-05, "loss": 5.9049, "step": 325 }, { "epoch": 6.250300842358604, "eval_clap": 0.12603802978992462, "eval_loss": 5.833277702331543, "eval_runtime": 196.2372, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 325 }, { "epoch": 6.269554753309266, "grad_norm": 9.310341835021973, "learning_rate": 2e-05, "loss": 5.9592, "step": 326 }, { "epoch": 6.2888086642599275, "grad_norm": 10.618816375732422, "learning_rate": 2e-05, "loss": 5.7331, "step": 327 }, { "epoch": 6.30806257521059, "grad_norm": 20.578750610351562, "learning_rate": 2e-05, "loss": 5.2624, "step": 328 }, { "epoch": 6.327316486161251, "grad_norm": 9.402525901794434, "learning_rate": 2e-05, "loss": 6.067, "step": 329 }, { "epoch": 6.346570397111913, "grad_norm": 9.60403060913086, "learning_rate": 2e-05, "loss": 6.214, "step": 330 }, { "epoch": 6.365824308062575, "grad_norm": 14.769834518432617, "learning_rate": 2e-05, "loss": 5.7947, "step": 331 }, { "epoch": 6.385078219013237, "grad_norm": 7.659432411193848, "learning_rate": 2e-05, "loss": 5.8996, "step": 332 }, { "epoch": 6.404332129963899, "grad_norm": 9.789624214172363, "learning_rate": 
2e-05, "loss": 6.0822, "step": 333 }, { "epoch": 6.423586040914561, "grad_norm": 13.061244010925293, "learning_rate": 2e-05, "loss": 5.8736, "step": 334 }, { "epoch": 6.442839951865222, "grad_norm": 9.671375274658203, "learning_rate": 2e-05, "loss": 5.6454, "step": 335 }, { "epoch": 6.462093862815885, "grad_norm": 7.602629661560059, "learning_rate": 2e-05, "loss": 6.1014, "step": 336 }, { "epoch": 6.481347773766546, "grad_norm": 9.208328247070312, "learning_rate": 2e-05, "loss": 5.8553, "step": 337 }, { "epoch": 6.500601684717208, "grad_norm": 6.751482963562012, "learning_rate": 2e-05, "loss": 6.3873, "step": 338 }, { "epoch": 6.51985559566787, "grad_norm": 19.14604377746582, "learning_rate": 2e-05, "loss": 5.5979, "step": 339 }, { "epoch": 6.539109506618532, "grad_norm": 6.979920387268066, "learning_rate": 2e-05, "loss": 5.9776, "step": 340 }, { "epoch": 6.558363417569193, "grad_norm": 12.88497257232666, "learning_rate": 2e-05, "loss": 6.4447, "step": 341 }, { "epoch": 6.577617328519856, "grad_norm": 9.550447463989258, "learning_rate": 2e-05, "loss": 6.0607, "step": 342 }, { "epoch": 6.596871239470517, "grad_norm": 17.2004451751709, "learning_rate": 2e-05, "loss": 5.8465, "step": 343 }, { "epoch": 6.61612515042118, "grad_norm": 9.696534156799316, "learning_rate": 2e-05, "loss": 5.7955, "step": 344 }, { "epoch": 6.635379061371841, "grad_norm": 6.361357688903809, "learning_rate": 2e-05, "loss": 6.2663, "step": 345 }, { "epoch": 6.654632972322503, "grad_norm": 28.56427574157715, "learning_rate": 2e-05, "loss": 5.0443, "step": 346 }, { "epoch": 6.673886883273164, "grad_norm": 10.24687671661377, "learning_rate": 2e-05, "loss": 5.9321, "step": 347 }, { "epoch": 6.693140794223827, "grad_norm": 8.435123443603516, "learning_rate": 2e-05, "loss": 6.1062, "step": 348 }, { "epoch": 6.712394705174488, "grad_norm": 6.668614387512207, "learning_rate": 2e-05, "loss": 5.9755, "step": 349 }, { "epoch": 6.731648616125151, "grad_norm": 8.491061210632324, "learning_rate": 2e-05, 
"loss": 5.8547, "step": 350 }, { "epoch": 6.731648616125151, "eval_clap": 0.14147447049617767, "eval_loss": 5.917084693908691, "eval_runtime": 201.9514, "eval_samples_per_second": 0.158, "eval_steps_per_second": 0.158, "step": 350 }, { "epoch": 6.750902527075812, "grad_norm": 10.25600528717041, "learning_rate": 2e-05, "loss": 5.8823, "step": 351 }, { "epoch": 6.770156438026474, "grad_norm": 8.48415470123291, "learning_rate": 2e-05, "loss": 5.8709, "step": 352 }, { "epoch": 6.789410348977136, "grad_norm": 18.866851806640625, "learning_rate": 2e-05, "loss": 5.7262, "step": 353 }, { "epoch": 6.808664259927798, "grad_norm": 7.648865222930908, "learning_rate": 2e-05, "loss": 5.6922, "step": 354 }, { "epoch": 6.827918170878459, "grad_norm": 12.319436073303223, "learning_rate": 2e-05, "loss": 6.1132, "step": 355 }, { "epoch": 6.847172081829122, "grad_norm": 6.688267230987549, "learning_rate": 2e-05, "loss": 6.0394, "step": 356 }, { "epoch": 6.866425992779783, "grad_norm": 8.100188255310059, "learning_rate": 2e-05, "loss": 5.7567, "step": 357 }, { "epoch": 6.885679903730445, "grad_norm": 19.447267532348633, "learning_rate": 2e-05, "loss": 5.1313, "step": 358 }, { "epoch": 6.904933814681107, "grad_norm": 8.269752502441406, "learning_rate": 2e-05, "loss": 5.6813, "step": 359 }, { "epoch": 6.924187725631769, "grad_norm": 11.055656433105469, "learning_rate": 2e-05, "loss": 5.7952, "step": 360 }, { "epoch": 6.943441636582431, "grad_norm": 12.430220603942871, "learning_rate": 2e-05, "loss": 5.4678, "step": 361 }, { "epoch": 6.962695547533093, "grad_norm": 9.070528984069824, "learning_rate": 2e-05, "loss": 6.3099, "step": 362 }, { "epoch": 6.981949458483754, "grad_norm": 11.570778846740723, "learning_rate": 2e-05, "loss": 5.9188, "step": 363 }, { "epoch": 7.0, "grad_norm": 7.6774821281433105, "learning_rate": 2e-05, "loss": 5.7076, "step": 364 }, { "epoch": 7.0192539109506615, "grad_norm": 7.454392910003662, "learning_rate": 2e-05, "loss": 6.3174, "step": 365 }, { "epoch": 
7.038507821901324, "grad_norm": 16.39904022216797, "learning_rate": 2e-05, "loss": 5.5487, "step": 366 }, { "epoch": 7.0577617328519855, "grad_norm": 8.567000389099121, "learning_rate": 2e-05, "loss": 5.8364, "step": 367 }, { "epoch": 7.077015643802647, "grad_norm": 11.228474617004395, "learning_rate": 2e-05, "loss": 6.1298, "step": 368 }, { "epoch": 7.0962695547533094, "grad_norm": 8.90956974029541, "learning_rate": 2e-05, "loss": 5.8714, "step": 369 }, { "epoch": 7.115523465703971, "grad_norm": 11.500467300415039, "learning_rate": 2e-05, "loss": 6.0238, "step": 370 }, { "epoch": 7.1347773766546325, "grad_norm": 9.86660099029541, "learning_rate": 2e-05, "loss": 5.4866, "step": 371 }, { "epoch": 7.154031287605295, "grad_norm": 15.263172149658203, "learning_rate": 2e-05, "loss": 5.6674, "step": 372 }, { "epoch": 7.1732851985559565, "grad_norm": 9.802757263183594, "learning_rate": 2e-05, "loss": 6.2604, "step": 373 }, { "epoch": 7.192539109506619, "grad_norm": 9.074177742004395, "learning_rate": 2e-05, "loss": 6.0723, "step": 374 }, { "epoch": 7.21179302045728, "grad_norm": 13.025900840759277, "learning_rate": 2e-05, "loss": 5.5151, "step": 375 }, { "epoch": 7.21179302045728, "eval_clap": 0.14877665042877197, "eval_loss": 6.030681610107422, "eval_runtime": 196.9659, "eval_samples_per_second": 0.162, "eval_steps_per_second": 0.162, "step": 375 }, { "epoch": 7.231046931407942, "grad_norm": 8.610788345336914, "learning_rate": 2e-05, "loss": 5.4889, "step": 376 }, { "epoch": 7.250300842358604, "grad_norm": 8.993633270263672, "learning_rate": 2e-05, "loss": 6.0395, "step": 377 }, { "epoch": 7.269554753309266, "grad_norm": 11.314407348632812, "learning_rate": 2e-05, "loss": 5.7346, "step": 378 }, { "epoch": 7.2888086642599275, "grad_norm": 6.467037677764893, "learning_rate": 2e-05, "loss": 5.7235, "step": 379 }, { "epoch": 7.30806257521059, "grad_norm": 8.104763984680176, "learning_rate": 2e-05, "loss": 5.9586, "step": 380 }, { "epoch": 7.327316486161251, "grad_norm": 
7.068090915679932, "learning_rate": 2e-05, "loss": 5.9094, "step": 381 }, { "epoch": 7.346570397111913, "grad_norm": 12.21611213684082, "learning_rate": 2e-05, "loss": 5.4139, "step": 382 }, { "epoch": 7.365824308062575, "grad_norm": 7.650627136230469, "learning_rate": 2e-05, "loss": 5.9595, "step": 383 }, { "epoch": 7.385078219013237, "grad_norm": 9.73534107208252, "learning_rate": 2e-05, "loss": 5.6262, "step": 384 }, { "epoch": 7.404332129963899, "grad_norm": 4.8408989906311035, "learning_rate": 2e-05, "loss": 6.2293, "step": 385 }, { "epoch": 7.423586040914561, "grad_norm": 8.489951133728027, "learning_rate": 2e-05, "loss": 5.7281, "step": 386 }, { "epoch": 7.442839951865222, "grad_norm": 11.095331192016602, "learning_rate": 2e-05, "loss": 5.7553, "step": 387 }, { "epoch": 7.462093862815885, "grad_norm": 11.590924263000488, "learning_rate": 2e-05, "loss": 5.5951, "step": 388 }, { "epoch": 7.481347773766546, "grad_norm": 8.009002685546875, "learning_rate": 2e-05, "loss": 5.8574, "step": 389 }, { "epoch": 7.500601684717208, "grad_norm": 6.733258247375488, "learning_rate": 2e-05, "loss": 5.9511, "step": 390 }, { "epoch": 7.51985559566787, "grad_norm": 12.49974250793457, "learning_rate": 2e-05, "loss": 5.8766, "step": 391 }, { "epoch": 7.539109506618532, "grad_norm": 8.18664264678955, "learning_rate": 2e-05, "loss": 6.2401, "step": 392 }, { "epoch": 7.558363417569193, "grad_norm": 7.806461811065674, "learning_rate": 2e-05, "loss": 5.6931, "step": 393 }, { "epoch": 7.577617328519856, "grad_norm": 11.137080192565918, "learning_rate": 2e-05, "loss": 5.447, "step": 394 }, { "epoch": 7.596871239470517, "grad_norm": 7.18437385559082, "learning_rate": 2e-05, "loss": 5.8764, "step": 395 }, { "epoch": 7.61612515042118, "grad_norm": 7.777758598327637, "learning_rate": 2e-05, "loss": 5.9383, "step": 396 }, { "epoch": 7.635379061371841, "grad_norm": 10.361425399780273, "learning_rate": 2e-05, "loss": 5.8001, "step": 397 }, { "epoch": 7.654632972322503, "grad_norm": 
6.713755130767822, "learning_rate": 2e-05, "loss": 6.1663, "step": 398 }, { "epoch": 7.673886883273164, "grad_norm": 10.869112014770508, "learning_rate": 2e-05, "loss": 5.9626, "step": 399 }, { "epoch": 7.693140794223827, "grad_norm": 7.977558135986328, "learning_rate": 2e-05, "loss": 6.0651, "step": 400 }, { "epoch": 7.693140794223827, "eval_clap": 0.15197913348674774, "eval_loss": 6.068316459655762, "eval_runtime": 198.5968, "eval_samples_per_second": 0.161, "eval_steps_per_second": 0.161, "step": 400 }, { "epoch": 7.712394705174488, "grad_norm": 9.780762672424316, "learning_rate": 2e-05, "loss": 5.5897, "step": 401 }, { "epoch": 7.731648616125151, "grad_norm": 6.951937198638916, "learning_rate": 2e-05, "loss": 6.2431, "step": 402 }, { "epoch": 7.750902527075812, "grad_norm": 9.692300796508789, "learning_rate": 2e-05, "loss": 5.4305, "step": 403 }, { "epoch": 7.770156438026474, "grad_norm": 7.418123245239258, "learning_rate": 2e-05, "loss": 5.8003, "step": 404 }, { "epoch": 7.789410348977136, "grad_norm": 6.651110649108887, "learning_rate": 2e-05, "loss": 5.7815, "step": 405 }, { "epoch": 7.808664259927798, "grad_norm": 6.156674861907959, "learning_rate": 2e-05, "loss": 6.4313, "step": 406 }, { "epoch": 7.827918170878459, "grad_norm": 8.222352981567383, "learning_rate": 2e-05, "loss": 6.2155, "step": 407 }, { "epoch": 7.847172081829122, "grad_norm": 16.708091735839844, "learning_rate": 2e-05, "loss": 5.5932, "step": 408 }, { "epoch": 7.866425992779783, "grad_norm": 7.200738906860352, "learning_rate": 2e-05, "loss": 5.7048, "step": 409 }, { "epoch": 7.885679903730445, "grad_norm": 7.8692450523376465, "learning_rate": 2e-05, "loss": 6.0216, "step": 410 }, { "epoch": 7.904933814681107, "grad_norm": 9.36465835571289, "learning_rate": 2e-05, "loss": 5.5713, "step": 411 }, { "epoch": 7.924187725631769, "grad_norm": 6.110987186431885, "learning_rate": 2e-05, "loss": 5.972, "step": 412 }, { "epoch": 7.943441636582431, "grad_norm": 10.555682182312012, "learning_rate": 
2e-05, "loss": 5.1856, "step": 413 }, { "epoch": 7.962695547533093, "grad_norm": 8.164458274841309, "learning_rate": 2e-05, "loss": 5.9857, "step": 414 }, { "epoch": 7.981949458483754, "grad_norm": 6.370028018951416, "learning_rate": 2e-05, "loss": 6.2021, "step": 415 }, { "epoch": 8.0, "grad_norm": 7.894473552703857, "learning_rate": 2e-05, "loss": 5.1235, "step": 416 }, { "epoch": 8.019253910950662, "grad_norm": 8.825623512268066, "learning_rate": 2e-05, "loss": 5.7924, "step": 417 }, { "epoch": 8.038507821901323, "grad_norm": 10.459378242492676, "learning_rate": 2e-05, "loss": 5.8566, "step": 418 }, { "epoch": 8.057761732851986, "grad_norm": 7.0936713218688965, "learning_rate": 2e-05, "loss": 6.2791, "step": 419 }, { "epoch": 8.077015643802648, "grad_norm": 6.6698079109191895, "learning_rate": 2e-05, "loss": 6.1026, "step": 420 }, { "epoch": 8.09626955475331, "grad_norm": 12.212193489074707, "learning_rate": 2e-05, "loss": 5.4735, "step": 421 }, { "epoch": 8.115523465703971, "grad_norm": 10.958526611328125, "learning_rate": 2e-05, "loss": 5.3301, "step": 422 }, { "epoch": 8.134777376654633, "grad_norm": 9.644707679748535, "learning_rate": 2e-05, "loss": 6.2481, "step": 423 }, { "epoch": 8.154031287605294, "grad_norm": 7.314677715301514, "learning_rate": 2e-05, "loss": 5.6373, "step": 424 }, { "epoch": 8.173285198555957, "grad_norm": 6.350398063659668, "learning_rate": 2e-05, "loss": 6.0339, "step": 425 }, { "epoch": 8.173285198555957, "eval_clap": 0.14958661794662476, "eval_loss": 6.165280342102051, "eval_runtime": 204.1039, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 425 }, { "epoch": 8.192539109506619, "grad_norm": 5.304975509643555, "learning_rate": 2e-05, "loss": 6.1317, "step": 426 }, { "epoch": 8.21179302045728, "grad_norm": 6.82647705078125, "learning_rate": 2e-05, "loss": 5.4539, "step": 427 }, { "epoch": 8.231046931407942, "grad_norm": 8.981968879699707, "learning_rate": 2e-05, "loss": 5.739, "step": 428 }, { "epoch": 
8.250300842358604, "grad_norm": 9.646061897277832, "learning_rate": 2e-05, "loss": 5.737, "step": 429 }, { "epoch": 8.269554753309265, "grad_norm": 9.535126686096191, "learning_rate": 2e-05, "loss": 6.1408, "step": 430 }, { "epoch": 8.288808664259928, "grad_norm": 6.139739513397217, "learning_rate": 2e-05, "loss": 6.2988, "step": 431 }, { "epoch": 8.30806257521059, "grad_norm": 6.903513431549072, "learning_rate": 2e-05, "loss": 6.0573, "step": 432 }, { "epoch": 8.327316486161251, "grad_norm": 11.271062850952148, "learning_rate": 2e-05, "loss": 6.2397, "step": 433 }, { "epoch": 8.346570397111913, "grad_norm": 6.281597137451172, "learning_rate": 2e-05, "loss": 6.16, "step": 434 }, { "epoch": 8.365824308062574, "grad_norm": 7.392561912536621, "learning_rate": 2e-05, "loss": 6.0408, "step": 435 }, { "epoch": 8.385078219013238, "grad_norm": 13.390826225280762, "learning_rate": 2e-05, "loss": 5.5365, "step": 436 }, { "epoch": 8.4043321299639, "grad_norm": 6.831297397613525, "learning_rate": 2e-05, "loss": 5.9492, "step": 437 }, { "epoch": 8.42358604091456, "grad_norm": 8.311152458190918, "learning_rate": 2e-05, "loss": 6.0555, "step": 438 }, { "epoch": 8.442839951865222, "grad_norm": 5.671159267425537, "learning_rate": 2e-05, "loss": 6.1021, "step": 439 }, { "epoch": 8.462093862815884, "grad_norm": 12.393537521362305, "learning_rate": 2e-05, "loss": 5.3335, "step": 440 }, { "epoch": 8.481347773766545, "grad_norm": 7.936432361602783, "learning_rate": 2e-05, "loss": 5.537, "step": 441 }, { "epoch": 8.500601684717209, "grad_norm": 8.93083667755127, "learning_rate": 2e-05, "loss": 5.6945, "step": 442 }, { "epoch": 8.51985559566787, "grad_norm": 7.8789238929748535, "learning_rate": 2e-05, "loss": 6.4316, "step": 443 }, { "epoch": 8.539109506618532, "grad_norm": 7.957115650177002, "learning_rate": 2e-05, "loss": 5.4203, "step": 444 }, { "epoch": 8.558363417569193, "grad_norm": 6.194324016571045, "learning_rate": 2e-05, "loss": 5.8967, "step": 445 }, { "epoch": 
8.577617328519855, "grad_norm": 9.573834419250488, "learning_rate": 2e-05, "loss": 5.7136, "step": 446 }, { "epoch": 8.596871239470518, "grad_norm": 8.004561424255371, "learning_rate": 2e-05, "loss": 5.5264, "step": 447 }, { "epoch": 8.61612515042118, "grad_norm": 6.711611747741699, "learning_rate": 2e-05, "loss": 6.0729, "step": 448 }, { "epoch": 8.635379061371841, "grad_norm": 8.102109909057617, "learning_rate": 2e-05, "loss": 5.3048, "step": 449 }, { "epoch": 8.654632972322503, "grad_norm": 11.3334379196167, "learning_rate": 2e-05, "loss": 5.5607, "step": 450 }, { "epoch": 8.654632972322503, "eval_clap": 0.1344371885061264, "eval_loss": 6.358816146850586, "eval_runtime": 205.1551, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 450 }, { "epoch": 8.673886883273164, "grad_norm": 7.845404148101807, "learning_rate": 2e-05, "loss": 5.6627, "step": 451 }, { "epoch": 8.693140794223826, "grad_norm": 5.972433090209961, "learning_rate": 2e-05, "loss": 6.3562, "step": 452 }, { "epoch": 8.71239470517449, "grad_norm": 9.84822940826416, "learning_rate": 2e-05, "loss": 5.4597, "step": 453 }, { "epoch": 8.73164861612515, "grad_norm": 9.735132217407227, "learning_rate": 2e-05, "loss": 5.9511, "step": 454 }, { "epoch": 8.750902527075812, "grad_norm": 8.289355278015137, "learning_rate": 2e-05, "loss": 5.7793, "step": 455 }, { "epoch": 8.770156438026474, "grad_norm": 10.302011489868164, "learning_rate": 2e-05, "loss": 5.7178, "step": 456 }, { "epoch": 8.789410348977135, "grad_norm": 12.201030731201172, "learning_rate": 2e-05, "loss": 5.4838, "step": 457 }, { "epoch": 8.808664259927799, "grad_norm": 8.123917579650879, "learning_rate": 2e-05, "loss": 5.6764, "step": 458 }, { "epoch": 8.82791817087846, "grad_norm": 7.225223064422607, "learning_rate": 2e-05, "loss": 5.7118, "step": 459 }, { "epoch": 8.847172081829122, "grad_norm": 10.377509117126465, "learning_rate": 2e-05, "loss": 5.6576, "step": 460 }, { "epoch": 8.866425992779783, "grad_norm": 
7.475393772125244, "learning_rate": 2e-05, "loss": 5.3861, "step": 461 }, { "epoch": 8.885679903730445, "grad_norm": 11.457231521606445, "learning_rate": 2e-05, "loss": 5.8163, "step": 462 }, { "epoch": 8.904933814681106, "grad_norm": 5.9624199867248535, "learning_rate": 2e-05, "loss": 6.0883, "step": 463 }, { "epoch": 8.92418772563177, "grad_norm": 15.740527153015137, "learning_rate": 2e-05, "loss": 5.1762, "step": 464 }, { "epoch": 8.943441636582431, "grad_norm": 6.486323833465576, "learning_rate": 2e-05, "loss": 6.0539, "step": 465 }, { "epoch": 8.962695547533093, "grad_norm": 6.787285804748535, "learning_rate": 2e-05, "loss": 6.007, "step": 466 }, { "epoch": 8.981949458483754, "grad_norm": 9.911301612854004, "learning_rate": 2e-05, "loss": 5.0872, "step": 467 }, { "epoch": 9.0, "grad_norm": 7.354274749755859, "learning_rate": 2e-05, "loss": 5.5188, "step": 468 }, { "epoch": 9.019253910950662, "grad_norm": 7.0421881675720215, "learning_rate": 2e-05, "loss": 5.6728, "step": 469 }, { "epoch": 9.038507821901323, "grad_norm": 8.74863052368164, "learning_rate": 2e-05, "loss": 5.5255, "step": 470 }, { "epoch": 9.057761732851986, "grad_norm": 6.350273609161377, "learning_rate": 2e-05, "loss": 5.7792, "step": 471 }, { "epoch": 9.077015643802648, "grad_norm": 5.793376922607422, "learning_rate": 2e-05, "loss": 6.019, "step": 472 }, { "epoch": 9.09626955475331, "grad_norm": 7.126486778259277, "learning_rate": 2e-05, "loss": 5.5166, "step": 473 }, { "epoch": 9.115523465703971, "grad_norm": 7.304249286651611, "learning_rate": 2e-05, "loss": 6.2965, "step": 474 }, { "epoch": 9.134777376654633, "grad_norm": 6.9295172691345215, "learning_rate": 2e-05, "loss": 6.0312, "step": 475 }, { "epoch": 9.134777376654633, "eval_clap": 0.13584719598293304, "eval_loss": 6.450309753417969, "eval_runtime": 205.0145, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 475 }, { "epoch": 9.154031287605294, "grad_norm": 8.397627830505371, "learning_rate": 2e-05, "loss": 
5.1701, "step": 476 }, { "epoch": 9.173285198555957, "grad_norm": 8.74401569366455, "learning_rate": 2e-05, "loss": 5.9924, "step": 477 }, { "epoch": 9.192539109506619, "grad_norm": 4.799812316894531, "learning_rate": 2e-05, "loss": 6.1081, "step": 478 }, { "epoch": 9.21179302045728, "grad_norm": 8.459375381469727, "learning_rate": 2e-05, "loss": 5.1495, "step": 479 }, { "epoch": 9.231046931407942, "grad_norm": 12.414017677307129, "learning_rate": 2e-05, "loss": 5.3805, "step": 480 }, { "epoch": 9.250300842358604, "grad_norm": 7.55842924118042, "learning_rate": 2e-05, "loss": 5.9, "step": 481 }, { "epoch": 9.269554753309265, "grad_norm": 7.019160270690918, "learning_rate": 2e-05, "loss": 6.0315, "step": 482 }, { "epoch": 9.288808664259928, "grad_norm": 6.8493852615356445, "learning_rate": 2e-05, "loss": 5.989, "step": 483 }, { "epoch": 9.30806257521059, "grad_norm": 6.040197372436523, "learning_rate": 2e-05, "loss": 6.1547, "step": 484 }, { "epoch": 9.327316486161251, "grad_norm": 6.4489006996154785, "learning_rate": 2e-05, "loss": 6.1476, "step": 485 }, { "epoch": 9.346570397111913, "grad_norm": 6.498973846435547, "learning_rate": 2e-05, "loss": 5.6649, "step": 486 }, { "epoch": 9.365824308062574, "grad_norm": 7.927713394165039, "learning_rate": 2e-05, "loss": 5.4862, "step": 487 }, { "epoch": 9.385078219013238, "grad_norm": 7.368226051330566, "learning_rate": 2e-05, "loss": 5.8106, "step": 488 }, { "epoch": 9.4043321299639, "grad_norm": 8.220806121826172, "learning_rate": 2e-05, "loss": 5.9079, "step": 489 }, { "epoch": 9.42358604091456, "grad_norm": 9.574630737304688, "learning_rate": 2e-05, "loss": 6.2155, "step": 490 }, { "epoch": 9.442839951865222, "grad_norm": 6.782036304473877, "learning_rate": 2e-05, "loss": 6.0144, "step": 491 }, { "epoch": 9.462093862815884, "grad_norm": 8.27569580078125, "learning_rate": 2e-05, "loss": 5.4657, "step": 492 }, { "epoch": 9.481347773766545, "grad_norm": 9.7858304977417, "learning_rate": 2e-05, "loss": 5.4273, "step": 493 
}, { "epoch": 9.500601684717209, "grad_norm": 6.2194905281066895, "learning_rate": 2e-05, "loss": 6.0479, "step": 494 }, { "epoch": 9.51985559566787, "grad_norm": 8.673853874206543, "learning_rate": 2e-05, "loss": 5.8649, "step": 495 }, { "epoch": 9.539109506618532, "grad_norm": 7.277229309082031, "learning_rate": 2e-05, "loss": 5.4538, "step": 496 }, { "epoch": 9.558363417569193, "grad_norm": 7.213720798492432, "learning_rate": 2e-05, "loss": 5.9019, "step": 497 }, { "epoch": 9.577617328519855, "grad_norm": 6.2223896980285645, "learning_rate": 2e-05, "loss": 6.0874, "step": 498 }, { "epoch": 9.596871239470518, "grad_norm": 7.928698539733887, "learning_rate": 2e-05, "loss": 5.5944, "step": 499 }, { "epoch": 9.61612515042118, "grad_norm": 6.528334617614746, "learning_rate": 2e-05, "loss": 5.9084, "step": 500 }, { "epoch": 9.61612515042118, "eval_clap": 0.1080508604645729, "eval_loss": 6.520473003387451, "eval_runtime": 205.013, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 500 }, { "epoch": 9.635379061371841, "grad_norm": 5.430109024047852, "learning_rate": 2e-05, "loss": 6.1004, "step": 501 }, { "epoch": 9.654632972322503, "grad_norm": 5.635249614715576, "learning_rate": 2e-05, "loss": 6.0965, "step": 502 }, { "epoch": 9.673886883273164, "grad_norm": 11.477826118469238, "learning_rate": 2e-05, "loss": 5.257, "step": 503 }, { "epoch": 9.693140794223826, "grad_norm": 13.920645713806152, "learning_rate": 2e-05, "loss": 5.6852, "step": 504 }, { "epoch": 9.71239470517449, "grad_norm": 7.871194362640381, "learning_rate": 2e-05, "loss": 5.635, "step": 505 }, { "epoch": 9.73164861612515, "grad_norm": 12.229103088378906, "learning_rate": 2e-05, "loss": 5.6566, "step": 506 }, { "epoch": 9.750902527075812, "grad_norm": 6.054317474365234, "learning_rate": 2e-05, "loss": 6.1042, "step": 507 }, { "epoch": 9.770156438026474, "grad_norm": 7.183253288269043, "learning_rate": 2e-05, "loss": 5.8351, "step": 508 }, { "epoch": 9.789410348977135, "grad_norm": 
11.31477165222168, "learning_rate": 2e-05, "loss": 5.8469, "step": 509 }, { "epoch": 9.808664259927799, "grad_norm": 8.236515045166016, "learning_rate": 2e-05, "loss": 5.3767, "step": 510 }, { "epoch": 9.808664259927799, "step": 510, "total_flos": 1697829404754480.0, "train_loss": 6.270190904654709, "train_runtime": 7324.9933, "train_samples_per_second": 1.134, "train_steps_per_second": 0.07 } ], "logging_steps": 1.0, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1697829404754480.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }