{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.808664259927799,
  "eval_steps": 25,
  "global_step": 510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.019253910950661854, "grad_norm": 65.07794952392578, "learning_rate": 2e-05, "loss": 10.0254, "step": 1 },
    { "epoch": 0.03850782190132371, "grad_norm": 35.06503677368164, "learning_rate": 2e-05, "loss": 9.6326, "step": 2 },
    { "epoch": 0.05776173285198556, "grad_norm": Infinity, "learning_rate": 2e-05, "loss": 9.7444, "step": 3 },
    { "epoch": 0.07701564380264742, "grad_norm": 91.54127502441406, "learning_rate": 2e-05, "loss": 9.9694, "step": 4 },
    { "epoch": 0.09626955475330927, "grad_norm": 48.64268493652344, "learning_rate": 2e-05, "loss": 9.777, "step": 5 },
    { "epoch": 0.11552346570397112, "grad_norm": 45.61400604248047, "learning_rate": 2e-05, "loss": 9.6125, "step": 6 },
    { "epoch": 0.13477737665463296, "grad_norm": 53.380062103271484, "learning_rate": 2e-05, "loss": 9.6476, "step": 7 },
    { "epoch": 0.15403128760529483, "grad_norm": 48.681732177734375, "learning_rate": 2e-05, "loss": 9.5976, "step": 8 },
    { "epoch": 0.17328519855595667, "grad_norm": 75.82644653320312, "learning_rate": 2e-05, "loss": 9.635, "step": 9 },
    { "epoch": 0.19253910950661854, "grad_norm": 52.92415237426758, "learning_rate": 2e-05, "loss": 9.4741, "step": 10 },
    { "epoch": 0.21179302045728038, "grad_norm": 45.76670455932617, "learning_rate": 2e-05, "loss": 9.4043, "step": 11 },
    { "epoch": 0.23104693140794225, "grad_norm": 83.89037322998047, "learning_rate": 2e-05, "loss": 9.4551, "step": 12 },
    { "epoch": 0.2503008423586041, "grad_norm": 63.491943359375, "learning_rate": 2e-05, "loss": 9.3453, "step": 13 },
    { "epoch": 0.2695547533092659, "grad_norm": 51.0808219909668, "learning_rate": 2e-05, "loss": 9.2297, "step": 14 },
    { "epoch": 0.2888086642599278, "grad_norm": 44.37099838256836, "learning_rate": 2e-05, "loss": 9.1875, "step": 15 },
    { "epoch": 0.30806257521058966, "grad_norm": 67.27417755126953, "learning_rate": 2e-05, "loss": 9.1203, "step": 16 },
    { "epoch": 0.32731648616125153, "grad_norm": 77.5335922241211, "learning_rate": 2e-05, "loss": 9.0269, "step": 17 },
    { "epoch": 0.34657039711191334, "grad_norm": 83.21548461914062, "learning_rate": 2e-05, "loss": 9.0223, "step": 18 },
    { "epoch": 0.3658243080625752, "grad_norm": 83.81153106689453, "learning_rate": 2e-05, "loss": 8.8856, "step": 19 },
    { "epoch": 0.3850782190132371, "grad_norm": 70.34122467041016, "learning_rate": 2e-05, "loss": 8.7546, "step": 20 },
    { "epoch": 0.4043321299638989, "grad_norm": 71.87590789794922, "learning_rate": 2e-05, "loss": 8.692, "step": 21 },
    { "epoch": 0.42358604091456076, "grad_norm": 61.52598571777344, "learning_rate": 2e-05, "loss": 8.7245, "step": 22 },
    { "epoch": 0.4428399518652226, "grad_norm": 81.4962158203125, "learning_rate": 2e-05, "loss": 8.4164, "step": 23 },
    { "epoch": 0.4620938628158845, "grad_norm": 66.4041976928711, "learning_rate": 2e-05, "loss": 8.4377, "step": 24 },
    { "epoch": 0.4813477737665463, "grad_norm": 74.9264144897461, "learning_rate": 2e-05, "loss": 8.305, "step": 25 },
    { "epoch": 0.4813477737665463, "eval_clap": 0.2508222758769989, "eval_loss": 3.7162673473358154, "eval_runtime": 195.557, "eval_samples_per_second": 0.164, "eval_steps_per_second": 0.164, "step": 25 },
    { "epoch": 0.5006016847172082, "grad_norm": 57.42286682128906, "learning_rate": 2e-05, "loss": 8.3213, "step": 26 },
    { "epoch": 0.51985559566787, "grad_norm": 64.55001831054688, "learning_rate": 2e-05, "loss": 8.2368, "step": 27 },
    { "epoch": 0.5391095066185319, "grad_norm": 51.85919952392578, "learning_rate": 2e-05, "loss": 8.2753, "step": 28 },
    { "epoch": 0.5583634175691937, "grad_norm": 53.471458435058594, "learning_rate": 2e-05, "loss": 8.0826, "step": 29 },
    { "epoch": 0.5776173285198556, "grad_norm": 49.86967849731445, "learning_rate": 2e-05, "loss": 7.9446, "step": 30 },
    { "epoch": 0.5968712394705175, "grad_norm": 62.497581481933594, "learning_rate": 2e-05, "loss": 7.7911, "step": 31 },
    { "epoch": 0.6161251504211793, "grad_norm": 46.13528823852539, "learning_rate": 2e-05, "loss": 7.7964, "step": 32 },
    { "epoch": 0.6353790613718412, "grad_norm": 56.71400451660156, "learning_rate": 2e-05, "loss": 7.5232, "step": 33 },
    { "epoch": 0.6546329723225031, "grad_norm": 43.25479507446289, "learning_rate": 2e-05, "loss": 7.7154, "step": 34 },
    { "epoch": 0.6738868832731648, "grad_norm": 44.234336853027344, "learning_rate": 2e-05, "loss": 7.3668, "step": 35 },
    { "epoch": 0.6931407942238267, "grad_norm": 43.533878326416016, "learning_rate": 2e-05, "loss": 7.8787, "step": 36 },
    { "epoch": 0.7123947051744886, "grad_norm": 37.30876922607422, "learning_rate": 2e-05, "loss": 7.661, "step": 37 },
    { "epoch": 0.7316486161251504, "grad_norm": 37.29338073730469, "learning_rate": 2e-05, "loss": 7.5059, "step": 38 },
    { "epoch": 0.7509025270758123, "grad_norm": 39.37627410888672, "learning_rate": 2e-05, "loss": 7.6255, "step": 39 },
    { "epoch": 0.7701564380264742, "grad_norm": 33.975730895996094, "learning_rate": 2e-05, "loss": 7.271, "step": 40 },
    { "epoch": 0.789410348977136, "grad_norm": 43.57024002075195, "learning_rate": 2e-05, "loss": 7.6087, "step": 41 },
    { "epoch": 0.8086642599277978, "grad_norm": 41.361568450927734, "learning_rate": 2e-05, "loss": 7.2042, "step": 42 },
    { "epoch": 0.8279181708784596, "grad_norm": 41.087318420410156, "learning_rate": 2e-05, "loss": 7.188, "step": 43 },
    { "epoch": 0.8471720818291215, "grad_norm": 39.903804779052734, "learning_rate": 2e-05, "loss": 7.0335, "step": 44 },
    { "epoch": 0.8664259927797834, "grad_norm": 43.420780181884766, "learning_rate": 2e-05, "loss": 7.1334, "step": 45 },
    { "epoch": 0.8856799037304453, "grad_norm": 38.208740234375, "learning_rate": 2e-05, "loss": 6.9014, "step": 46 },
    { "epoch": 0.9049338146811071, "grad_norm": 34.4349250793457, "learning_rate": 2e-05, "loss": 6.9772, "step": 47 },
    { "epoch": 0.924187725631769, "grad_norm": 42.55630874633789, "learning_rate": 2e-05, "loss": 7.3901, "step": 48 },
    { "epoch": 0.9434416365824309, "grad_norm": 37.8997917175293, "learning_rate": 2e-05, "loss": 6.9604, "step": 49 },
    { "epoch": 0.9626955475330926, "grad_norm": 27.6505184173584, "learning_rate": 2e-05, "loss": 6.9842, "step": 50 },
    { "epoch": 0.9626955475330926, "eval_clap": 0.05512786656618118, "eval_loss": 4.608064651489258, "eval_runtime": 203.976, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 50 },
    { "epoch": 0.9819494584837545, "grad_norm": 29.874601364135742, "learning_rate": 2e-05, "loss": 6.5285, "step": 51 },
    { "epoch": 1.0, "grad_norm": 32.52230453491211, "learning_rate": 2e-05, "loss": 6.0624, "step": 52 },
    { "epoch": 1.0192539109506618, "grad_norm": 25.571067810058594, "learning_rate": 2e-05, "loss": 6.965, "step": 53 },
    { "epoch": 1.0385078219013237, "grad_norm": 24.804780960083008, "learning_rate": 2e-05, "loss": 6.6082, "step": 54 },
    { "epoch": 1.0577617328519855, "grad_norm": 26.17034339904785, "learning_rate": 2e-05, "loss": 6.8646, "step": 55 },
    { "epoch": 1.0770156438026475, "grad_norm": 26.33489990234375, "learning_rate": 2e-05, "loss": 7.0679, "step": 56 },
    { "epoch": 1.0962695547533092, "grad_norm": 22.185443878173828, "learning_rate": 2e-05, "loss": 6.9692, "step": 57 },
    { "epoch": 1.1155234657039712, "grad_norm": 24.1581974029541, "learning_rate": 2e-05, "loss": 6.7359, "step": 58 },
    { "epoch": 1.134777376654633, "grad_norm": 19.098318099975586, "learning_rate": 2e-05, "loss": 6.8621, "step": 59 },
    { "epoch": 1.154031287605295, "grad_norm": 18.446277618408203, "learning_rate": 2e-05, "loss": 6.9175, "step": 60 },
    { "epoch": 1.1732851985559567, "grad_norm": 18.82305908203125, "learning_rate": 2e-05, "loss": 6.8505, "step": 61 },
    { "epoch": 1.1925391095066185, "grad_norm": 23.049110412597656, "learning_rate": 2e-05, "loss": 6.4014, "step": 62 },
    { "epoch": 1.2117930204572804, "grad_norm": 30.802398681640625, "learning_rate": 2e-05, "loss": 6.1833, "step": 63 },
    { "epoch": 1.2310469314079422, "grad_norm": 31.79558563232422, "learning_rate": 2e-05, "loss": 6.3301, "step": 64 },
    { "epoch": 1.2503008423586042, "grad_norm": 17.523427963256836, "learning_rate": 2e-05, "loss": 6.5382, "step": 65 },
    { "epoch": 1.269554753309266, "grad_norm": 20.55564308166504, "learning_rate": 2e-05, "loss": 6.8307, "step": 66 },
    { "epoch": 1.288808664259928, "grad_norm": 14.00365924835205, "learning_rate": 2e-05, "loss": 6.688, "step": 67 },
    { "epoch": 1.3080625752105897, "grad_norm": 20.74002456665039, "learning_rate": 2e-05, "loss": 6.3775, "step": 68 },
    { "epoch": 1.3273164861612514, "grad_norm": 13.387882232666016, "learning_rate": 2e-05, "loss": 6.4859, "step": 69 },
    { "epoch": 1.3465703971119134, "grad_norm": 13.83588981628418, "learning_rate": 2e-05, "loss": 6.6777, "step": 70 },
    { "epoch": 1.3658243080625752, "grad_norm": 18.38031005859375, "learning_rate": 2e-05, "loss": 6.712, "step": 71 },
    { "epoch": 1.3850782190132371, "grad_norm": 15.858037948608398, "learning_rate": 2e-05, "loss": 6.7095, "step": 72 },
    { "epoch": 1.404332129963899, "grad_norm": 14.21243667602539, "learning_rate": 2e-05, "loss": 6.6714, "step": 73 },
    { "epoch": 1.4235860409145609, "grad_norm": 13.775053024291992, "learning_rate": 2e-05, "loss": 6.3512, "step": 74 },
    { "epoch": 1.4428399518652226, "grad_norm": 18.616239547729492, "learning_rate": 2e-05, "loss": 6.9013, "step": 75 },
    { "epoch": 1.4428399518652226, "eval_clap": 0.05818195268511772, "eval_loss": 5.64979362487793, "eval_runtime": 195.9522, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 75 },
    { "epoch": 1.4620938628158844, "grad_norm": 27.710369110107422, "learning_rate": 2e-05, "loss": 6.0035, "step": 76 },
    { "epoch": 1.4813477737665464, "grad_norm": 20.711942672729492, "learning_rate": 2e-05, "loss": 6.1597, "step": 77 },
    { "epoch": 1.5006016847172083, "grad_norm": 11.78789234161377, "learning_rate": 2e-05, "loss": 6.4754, "step": 78 },
    { "epoch": 1.5198555956678699, "grad_norm": 18.79988670349121, "learning_rate": 2e-05, "loss": 6.7886, "step": 79 },
    { "epoch": 1.5391095066185319, "grad_norm": 9.708053588867188, "learning_rate": 2e-05, "loss": 6.4123, "step": 80 },
    { "epoch": 1.5583634175691938, "grad_norm": 10.420573234558105, "learning_rate": 2e-05, "loss": 6.3586, "step": 81 },
    { "epoch": 1.5776173285198556, "grad_norm": 12.453520774841309, "learning_rate": 2e-05, "loss": 6.2891, "step": 82 },
    { "epoch": 1.5968712394705173, "grad_norm": 18.412593841552734, "learning_rate": 2e-05, "loss": 6.0656, "step": 83 },
    { "epoch": 1.6161251504211793, "grad_norm": 15.798463821411133, "learning_rate": 2e-05, "loss": 6.5867, "step": 84 },
    { "epoch": 1.6353790613718413, "grad_norm": 9.848593711853027, "learning_rate": 2e-05, "loss": 6.3659, "step": 85 },
    { "epoch": 1.654632972322503, "grad_norm": 10.542499542236328, "learning_rate": 2e-05, "loss": 6.1898, "step": 86 },
    { "epoch": 1.6738868832731648, "grad_norm": 10.715902328491211, "learning_rate": 2e-05, "loss": 6.2704, "step": 87 },
    { "epoch": 1.6931407942238268, "grad_norm": 10.027088165283203, "learning_rate": 2e-05, "loss": 6.1835, "step": 88 },
    { "epoch": 1.7123947051744886, "grad_norm": 17.48147964477539, "learning_rate": 2e-05, "loss": 6.0763, "step": 89 },
    { "epoch": 1.7316486161251503, "grad_norm": 17.746797561645508, "learning_rate": 2e-05, "loss": 5.8703, "step": 90 },
    { "epoch": 1.7509025270758123, "grad_norm": 28.819969177246094, "learning_rate": 2e-05, "loss": 6.8541, "step": 91 },
    { "epoch": 1.7701564380264743, "grad_norm": 13.42017936706543, "learning_rate": 2e-05, "loss": 6.2243, "step": 92 },
    { "epoch": 1.789410348977136, "grad_norm": 17.955583572387695, "learning_rate": 2e-05, "loss": 6.5344, "step": 93 },
    { "epoch": 1.8086642599277978, "grad_norm": 22.70500946044922, "learning_rate": 2e-05, "loss": 6.6459, "step": 94 },
    { "epoch": 1.8279181708784598, "grad_norm": 13.345677375793457, "learning_rate": 2e-05, "loss": 6.425, "step": 95 },
    { "epoch": 1.8471720818291215, "grad_norm": 13.83546257019043, "learning_rate": 2e-05, "loss": 6.4378, "step": 96 },
    { "epoch": 1.8664259927797833, "grad_norm": 23.364137649536133, "learning_rate": 2e-05, "loss": 5.854, "step": 97 },
    { "epoch": 1.8856799037304453, "grad_norm": 8.366693496704102, "learning_rate": 2e-05, "loss": 6.2678, "step": 98 },
    { "epoch": 1.9049338146811072, "grad_norm": 14.662858009338379, "learning_rate": 2e-05, "loss": 5.9462, "step": 99 },
    { "epoch": 1.924187725631769, "grad_norm": 12.826397895812988, "learning_rate": 2e-05, "loss": 6.0195, "step": 100 },
    { "epoch": 1.924187725631769, "eval_clap": 0.14504113793373108, "eval_loss": 5.763121604919434, "eval_runtime": 204.238, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 100 },
    { "epoch": 1.9434416365824307, "grad_norm": 11.27551555633545, "learning_rate": 2e-05, "loss": 6.1058, "step": 101 },
    { "epoch": 1.9626955475330927, "grad_norm": 16.934494018554688, "learning_rate": 2e-05, "loss": 6.6169, "step": 102 },
    { "epoch": 1.9819494584837545, "grad_norm": 10.172548294067383, "learning_rate": 2e-05, "loss": 6.085, "step": 103 },
    { "epoch": 2.0, "grad_norm": 21.66511344909668, "learning_rate": 2e-05, "loss": 6.374, "step": 104 },
    { "epoch": 2.019253910950662, "grad_norm": 14.909863471984863, "learning_rate": 2e-05, "loss": 6.5841, "step": 105 },
    { "epoch": 2.0385078219013235, "grad_norm": 13.277375221252441, "learning_rate": 2e-05, "loss": 6.0897, "step": 106 },
    { "epoch": 2.0577617328519855, "grad_norm": 14.995245933532715, "learning_rate": 2e-05, "loss": 6.6047, "step": 107 },
    { "epoch": 2.0770156438026475, "grad_norm": 8.810375213623047, "learning_rate": 2e-05, "loss": 6.4019, "step": 108 },
    { "epoch": 2.0962695547533094, "grad_norm": 9.020201683044434, "learning_rate": 2e-05, "loss": 6.1859, "step": 109 },
    { "epoch": 2.115523465703971, "grad_norm": 13.133101463317871, "learning_rate": 2e-05, "loss": 6.1151, "step": 110 },
    { "epoch": 2.134777376654633, "grad_norm": 9.436896324157715, "learning_rate": 2e-05, "loss": 6.5699, "step": 111 },
    { "epoch": 2.154031287605295, "grad_norm": 7.937966346740723, "learning_rate": 2e-05, "loss": 6.4376, "step": 112 },
    { "epoch": 2.1732851985559565, "grad_norm": 9.059781074523926, "learning_rate": 2e-05, "loss": 6.514, "step": 113 },
    { "epoch": 2.1925391095066185, "grad_norm": 6.838661193847656, "learning_rate": 2e-05, "loss": 6.4801, "step": 114 },
    { "epoch": 2.2117930204572804, "grad_norm": 21.459503173828125, "learning_rate": 2e-05, "loss": 5.9983, "step": 115 },
    { "epoch": 2.2310469314079424, "grad_norm": 10.961411476135254, "learning_rate": 2e-05, "loss": 6.257, "step": 116 },
    { "epoch": 2.250300842358604, "grad_norm": 24.96747398376465, "learning_rate": 2e-05, "loss": 6.0786, "step": 117 },
    { "epoch": 2.269554753309266, "grad_norm": 10.531516075134277, "learning_rate": 2e-05, "loss": 6.3389, "step": 118 },
    { "epoch": 2.288808664259928, "grad_norm": 12.07296085357666, "learning_rate": 2e-05, "loss": 6.2715, "step": 119 },
    { "epoch": 2.30806257521059, "grad_norm": 8.665770530700684, "learning_rate": 2e-05, "loss": 6.3378, "step": 120 },
    { "epoch": 2.3273164861612514, "grad_norm": 11.358579635620117, "learning_rate": 2e-05, "loss": 6.2564, "step": 121 },
    { "epoch": 2.3465703971119134, "grad_norm": 13.47236156463623, "learning_rate": 2e-05, "loss": 6.1455, "step": 122 },
    { "epoch": 2.3658243080625754, "grad_norm": 6.947666168212891, "learning_rate": 2e-05, "loss": 6.3356, "step": 123 },
    { "epoch": 2.385078219013237, "grad_norm": 13.422441482543945, "learning_rate": 2e-05, "loss": 6.5743, "step": 124 },
    { "epoch": 2.404332129963899, "grad_norm": 8.865825653076172, "learning_rate": 2e-05, "loss": 6.447, "step": 125 },
    { "epoch": 2.404332129963899, "eval_clap": 0.11870799958705902, "eval_loss": 5.7946672439575195, "eval_runtime": 196.002, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 125 },
    { "epoch": 2.423586040914561, "grad_norm": 14.267729759216309, "learning_rate": 2e-05, "loss": 6.1854, "step": 126 },
    { "epoch": 2.4428399518652224, "grad_norm": 11.992257118225098, "learning_rate": 2e-05, "loss": 6.4266, "step": 127 },
    { "epoch": 2.4620938628158844, "grad_norm": 8.23071575164795, "learning_rate": 2e-05, "loss": 6.1821, "step": 128 },
    { "epoch": 2.4813477737665464, "grad_norm": 14.663762092590332, "learning_rate": 2e-05, "loss": 5.8956, "step": 129 },
    { "epoch": 2.5006016847172083, "grad_norm": 19.018505096435547, "learning_rate": 2e-05, "loss": 6.748, "step": 130 },
    { "epoch": 2.51985559566787, "grad_norm": 15.996222496032715, "learning_rate": 2e-05, "loss": 6.6005, "step": 131 },
    { "epoch": 2.539109506618532, "grad_norm": 15.696185111999512, "learning_rate": 2e-05, "loss": 5.94, "step": 132 },
    { "epoch": 2.558363417569194, "grad_norm": 28.96451759338379, "learning_rate": 2e-05, "loss": 5.6643, "step": 133 },
    { "epoch": 2.577617328519856, "grad_norm": 10.068239212036133, "learning_rate": 2e-05, "loss": 6.0527, "step": 134 },
    { "epoch": 2.5968712394705173, "grad_norm": 20.770143508911133, "learning_rate": 2e-05, "loss": 5.8241, "step": 135 },
    { "epoch": 2.6161251504211793, "grad_norm": 8.34460735321045, "learning_rate": 2e-05, "loss": 6.2976, "step": 136 },
    { "epoch": 2.6353790613718413, "grad_norm": 7.031894683837891, "learning_rate": 2e-05, "loss": 6.1223, "step": 137 },
    { "epoch": 2.654632972322503, "grad_norm": 13.371209144592285, "learning_rate": 2e-05, "loss": 6.4443, "step": 138 },
    { "epoch": 2.673886883273165, "grad_norm": 13.32111930847168, "learning_rate": 2e-05, "loss": 6.0339, "step": 139 },
    { "epoch": 2.693140794223827, "grad_norm": 12.334694862365723, "learning_rate": 2e-05, "loss": 6.1672, "step": 140 },
    { "epoch": 2.7123947051744883, "grad_norm": 18.95940589904785, "learning_rate": 2e-05, "loss": 5.7344, "step": 141 },
    { "epoch": 2.7316486161251503, "grad_norm": 16.167844772338867, "learning_rate": 2e-05, "loss": 5.8938, "step": 142 },
    { "epoch": 2.7509025270758123, "grad_norm": 13.141815185546875, "learning_rate": 2e-05, "loss": 5.7477, "step": 143 },
    { "epoch": 2.7701564380264743, "grad_norm": 9.945357322692871, "learning_rate": 2e-05, "loss": 6.0525, "step": 144 },
    { "epoch": 2.7894103489771362, "grad_norm": 18.400161743164062, "learning_rate": 2e-05, "loss": 6.4865, "step": 145 },
    { "epoch": 2.808664259927798, "grad_norm": 9.252588272094727, "learning_rate": 2e-05, "loss": 6.1021, "step": 146 },
    { "epoch": 2.8279181708784598, "grad_norm": 9.42403793334961, "learning_rate": 2e-05, "loss": 5.9196, "step": 147 },
    { "epoch": 2.8471720818291217, "grad_norm": 16.049497604370117, "learning_rate": 2e-05, "loss": 6.3472, "step": 148 },
    { "epoch": 2.8664259927797833, "grad_norm": 16.170551300048828, "learning_rate": 2e-05, "loss": 6.4077, "step": 149 },
    { "epoch": 2.8856799037304453, "grad_norm": 13.0892915725708, "learning_rate": 2e-05, "loss": 6.215, "step": 150 },
    { "epoch": 2.8856799037304453, "eval_clap": 0.11637458205223083, "eval_loss": 5.839620590209961, "eval_runtime": 203.9479, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 150 },
    { "epoch": 2.9049338146811072, "grad_norm": 11.786327362060547, "learning_rate": 2e-05, "loss": 6.3015, "step": 151 },
    { "epoch": 2.9241877256317688, "grad_norm": 8.937137603759766, "learning_rate": 2e-05, "loss": 6.1963, "step": 152 },
    { "epoch": 2.9434416365824307, "grad_norm": 15.840847969055176, "learning_rate": 2e-05, "loss": 6.4957, "step": 153 },
    { "epoch": 2.9626955475330927, "grad_norm": 11.317403793334961, "learning_rate": 2e-05, "loss": 6.4699, "step": 154 },
    { "epoch": 2.9819494584837543, "grad_norm": 11.2737398147583, "learning_rate": 2e-05, "loss": 6.0559, "step": 155 },
    { "epoch": 3.0, "grad_norm": 15.351630210876465, "learning_rate": 2e-05, "loss": 5.5406, "step": 156 },
    { "epoch": 3.019253910950662, "grad_norm": 10.597087860107422, "learning_rate": 2e-05, "loss": 6.5337, "step": 157 },
    { "epoch": 3.0385078219013235, "grad_norm": 8.941291809082031, "learning_rate": 2e-05, "loss": 6.2708, "step": 158 },
    { "epoch": 3.0577617328519855, "grad_norm": 15.444201469421387, "learning_rate": 2e-05, "loss": 6.1448, "step": 159 },
    { "epoch": 3.0770156438026475, "grad_norm": 8.186105728149414, "learning_rate": 2e-05, "loss": 6.2508, "step": 160 },
    { "epoch": 3.0962695547533094, "grad_norm": 25.041446685791016, "learning_rate": 2e-05, "loss": 5.9141, "step": 161 },
    { "epoch": 3.115523465703971, "grad_norm": 25.764001846313477, "learning_rate": 2e-05, "loss": 5.7969, "step": 162 },
    { "epoch": 3.134777376654633, "grad_norm": 8.077290534973145, "learning_rate": 2e-05, "loss": 6.485, "step": 163 },
    { "epoch": 3.154031287605295, "grad_norm": 6.288991451263428, "learning_rate": 2e-05, "loss": 6.3003, "step": 164 },
    { "epoch": 3.1732851985559565, "grad_norm": 7.810708522796631, "learning_rate": 2e-05, "loss": 6.191, "step": 165 },
    { "epoch": 3.1925391095066185, "grad_norm": 12.891411781311035, "learning_rate": 2e-05, "loss": 5.9668, "step": 166 },
    { "epoch": 3.2117930204572804, "grad_norm": 9.540118217468262, "learning_rate": 2e-05, "loss": 6.093, "step": 167 },
    { "epoch": 3.2310469314079424, "grad_norm": 9.51327896118164, "learning_rate": 2e-05, "loss": 6.1793, "step": 168 },
    { "epoch": 3.250300842358604, "grad_norm": 16.127683639526367, "learning_rate": 2e-05, "loss": 6.0557, "step": 169 },
    { "epoch": 3.269554753309266, "grad_norm": 17.678020477294922, "learning_rate": 2e-05, "loss": 6.5721, "step": 170 },
    { "epoch": 3.288808664259928, "grad_norm": 11.319859504699707, "learning_rate": 2e-05, "loss": 5.9967, "step": 171 },
    { "epoch": 3.30806257521059, "grad_norm": 15.657870292663574, "learning_rate": 2e-05, "loss": 5.9343, "step": 172 },
    { "epoch": 3.3273164861612514, "grad_norm": 15.364697456359863, "learning_rate": 2e-05, "loss": 6.2908, "step": 173 },
    { "epoch": 3.3465703971119134, "grad_norm": 23.229448318481445, "learning_rate": 2e-05, "loss": 5.7732, "step": 174 },
    { "epoch": 3.3658243080625754, "grad_norm": 13.746365547180176, "learning_rate": 2e-05, "loss": 6.4075, "step": 175 },
    { "epoch": 3.3658243080625754, "eval_clap": 0.11743690818548203, "eval_loss": 5.836143493652344, "eval_runtime": 204.2948, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.157, "step": 175 },
    { "epoch": 3.385078219013237, "grad_norm": 9.525516510009766, "learning_rate": 2e-05, "loss": 6.0857, "step": 176 },
    { "epoch": 3.404332129963899, "grad_norm": 18.301898956298828, "learning_rate": 2e-05, "loss": 6.4325, "step": 177 },
    { "epoch": 3.423586040914561, "grad_norm": 12.342935562133789, "learning_rate": 2e-05, "loss": 6.1896, "step": 178 },
    { "epoch": 3.4428399518652224, "grad_norm": 12.635440826416016, "learning_rate": 2e-05, "loss": 6.2818, "step": 179 },
    { "epoch": 3.4620938628158844, "grad_norm": 10.180573463439941, "learning_rate": 2e-05, "loss": 6.2591, "step": 180 },
    { "epoch": 3.4813477737665464, "grad_norm": 9.508129119873047, "learning_rate": 2e-05, "loss": 6.2806, "step": 181 },
    { "epoch": 3.5006016847172083, "grad_norm": 10.56059455871582, "learning_rate": 2e-05, "loss": 6.459, "step": 182 },
    { "epoch": 3.51985559566787, "grad_norm": 10.22647762298584, "learning_rate": 2e-05, "loss": 6.0333, "step": 183 },
    { "epoch": 3.539109506618532, "grad_norm": 8.610014915466309, "learning_rate": 2e-05, "loss": 6.4448, "step": 184 },
    { "epoch": 3.558363417569194, "grad_norm": 12.473418235778809, "learning_rate": 2e-05, "loss": 6.0245, "step": 185 },
    { "epoch": 3.577617328519856, "grad_norm": 15.734831809997559, "learning_rate": 2e-05, "loss": 5.9932, "step": 186 },
    { "epoch": 3.5968712394705173, "grad_norm": 8.682183265686035, "learning_rate": 2e-05, "loss": 6.2845, "step": 187 },
    { "epoch": 3.6161251504211793, "grad_norm": 21.06833839416504, "learning_rate": 2e-05, "loss": 5.8533, "step": 188 },
    { "epoch": 3.6353790613718413, "grad_norm": 22.429513931274414, "learning_rate": 2e-05, "loss": 5.7251, "step": 189 },
    { "epoch": 3.654632972322503, "grad_norm": 7.118320941925049, "learning_rate": 2e-05, "loss": 6.2949, "step": 190 },
    { "epoch": 3.673886883273165, "grad_norm": 11.422521591186523, "learning_rate": 2e-05, "loss": 6.1418, "step": 191 },
    { "epoch": 3.693140794223827, "grad_norm": 11.730757713317871, "learning_rate": 2e-05, "loss": 6.0665, "step": 192 },
    { "epoch": 3.7123947051744883, "grad_norm": 7.965202331542969, "learning_rate": 2e-05, "loss": 6.3251, "step": 193 },
    { "epoch": 3.7316486161251503, "grad_norm": 9.634132385253906, "learning_rate": 2e-05, "loss": 6.0327, "step": 194 },
    { "epoch": 3.7509025270758123, "grad_norm": 12.675938606262207, "learning_rate": 2e-05, "loss": 6.4535, "step": 195 },
    { "epoch": 3.7701564380264743, "grad_norm": 11.747628211975098, "learning_rate": 2e-05, "loss": 5.8515, "step": 196 },
    { "epoch": 3.7894103489771362, "grad_norm": 9.878296852111816, "learning_rate": 2e-05, "loss": 6.3101, "step": 197 },
    { "epoch": 3.808664259927798, "grad_norm": 8.255465507507324, "learning_rate": 2e-05, "loss": 6.1733, "step": 198 },
    { "epoch": 3.8279181708784598, "grad_norm": 15.391761779785156, "learning_rate": 2e-05, "loss": 6.4827, "step": 199 },
    { "epoch": 3.8471720818291217, "grad_norm": 16.02042579650879, "learning_rate": 2e-05, "loss": 5.797, "step": 200 },
    { "epoch": 3.8471720818291217, "eval_clap": 0.10931383073329926, "eval_loss": 5.815006256103516, "eval_runtime": 204.8855, "eval_samples_per_second": 0.156, "eval_steps_per_second": 0.156, "step": 200 },
    { "epoch": 3.8664259927797833, "grad_norm": 11.096480369567871, "learning_rate": 2e-05, "loss": 6.3396, "step": 201 },
    { "epoch": 3.8856799037304453, "grad_norm": 17.267515182495117, "learning_rate": 2e-05, "loss": 5.7346, "step": 202 },
    { "epoch": 3.9049338146811072, "grad_norm": 10.836710929870605, "learning_rate": 2e-05, "loss": 6.3001, "step": 203 },
    { "epoch": 3.9241877256317688, "grad_norm": 6.178803443908691, "learning_rate": 2e-05, "loss": 6.1283, "step": 204 },
    { "epoch": 3.9434416365824307, "grad_norm": 8.270339965820312, "learning_rate": 2e-05, "loss": 6.2692, "step": 205 },
    { "epoch": 3.9626955475330927, "grad_norm": 8.276531219482422, "learning_rate": 2e-05, "loss": 6.2181, "step": 206 },
    { "epoch": 3.9819494584837543, "grad_norm": 9.491342544555664, "learning_rate": 2e-05, "loss": 6.4109, "step": 207 },
    { "epoch": 4.0, "grad_norm": 16.148122787475586, "learning_rate": 2e-05, "loss": 5.4716, "step": 208 },
    { "epoch": 4.0192539109506615, "grad_norm": 5.439089298248291, "learning_rate": 2e-05, "loss": 6.1773, "step": 209 },
    { "epoch": 4.038507821901324, "grad_norm": 8.583178520202637, "learning_rate": 2e-05, "loss": 6.097, "step": 210 },
    { "epoch": 4.0577617328519855, "grad_norm": 5.862834453582764, "learning_rate": 2e-05, "loss": 6.2586, "step": 211 },
    { "epoch": 4.077015643802647, "grad_norm": 19.868268966674805, "learning_rate": 2e-05, "loss": 5.7732, "step": 212 },
    { "epoch": 4.0962695547533094, "grad_norm": 8.219894409179688, "learning_rate": 2e-05, "loss": 6.1416, "step": 213 },
    { "epoch": 4.115523465703971, "grad_norm": 8.35651683807373, "learning_rate": 2e-05, "loss": 6.3144, "step": 214 },
    { "epoch": 4.1347773766546325, "grad_norm": 9.109415054321289, "learning_rate": 2e-05, "loss": 6.1057, "step": 215 },
    { "epoch": 4.154031287605295, "grad_norm": 7.91225004196167, "learning_rate": 2e-05, "loss": 6.3672, "step": 216 },
    { "epoch": 4.1732851985559565, "grad_norm": 13.663270950317383, "learning_rate": 2e-05, "loss": 6.104, "step": 217 },
    { "epoch": 4.192539109506619, "grad_norm": 7.140188694000244, "learning_rate": 2e-05, "loss": 6.214, "step": 218 },
    { "epoch": 4.21179302045728, "grad_norm": 20.248258590698242, "learning_rate": 2e-05, "loss": 5.7409, "step": 219 },
    { "epoch": 4.231046931407942, "grad_norm": 6.1542487144470215, "learning_rate": 2e-05, "loss": 6.1971, "step": 220 },
    { "epoch": 4.250300842358604, "grad_norm": 16.50935935974121, "learning_rate": 2e-05, "loss": 5.7022, "step": 221 },
    { "epoch": 4.269554753309266, "grad_norm": 6.602309226989746, "learning_rate": 2e-05, "loss": 6.0883, "step": 222 },
    { "epoch": 4.2888086642599275, "grad_norm": 10.636795043945312, "learning_rate": 2e-05, "loss": 6.1146, "step": 223 },
    { "epoch": 4.30806257521059, "grad_norm": 15.217754364013672, "learning_rate": 2e-05, "loss": 6.4887, "step": 224 },
    { "epoch": 4.327316486161251, "grad_norm": 11.162585258483887, "learning_rate": 2e-05, "loss": 6.347, "step": 225 },
    { "epoch": 4.327316486161251, "eval_clap": 0.10287218540906906, "eval_loss": 5.793514251708984, "eval_runtime": 195.2618, "eval_samples_per_second": 0.164, "eval_steps_per_second": 0.164, "step": 225 },
    { "epoch": 4.346570397111913, "grad_norm": 6.50248908996582, "learning_rate": 2e-05, "loss": 6.128, "step": 226 },
    { "epoch": 4.365824308062575, "grad_norm": 12.357426643371582, "learning_rate": 2e-05, "loss": 6.2944, "step": 227 },
    { "epoch": 4.385078219013237, "grad_norm": 11.979768753051758, "learning_rate": 2e-05, "loss": 6.0331, "step": 228 },
    { "epoch": 4.404332129963899, "grad_norm": 14.140297889709473, "learning_rate": 2e-05, "loss": 5.8348, "step": 229 },
    { "epoch": 4.423586040914561, "grad_norm": 13.665949821472168, "learning_rate": 2e-05, "loss": 5.9629, "step": 230 },
    { "epoch": 4.442839951865222, "grad_norm": 25.926544189453125, "learning_rate": 2e-05, "loss": 5.5899, "step": 231 },
    { "epoch": 4.462093862815885, "grad_norm": 11.478584289550781, "learning_rate": 2e-05, "loss": 6.06, "step": 232 },
    { "epoch": 4.481347773766546, "grad_norm": 7.444418907165527, "learning_rate": 2e-05, "loss": 6.0127, "step": 233 },
    { "epoch": 4.500601684717208, "grad_norm": 11.302255630493164, "learning_rate": 2e-05, "loss": 6.3942, "step": 234 },
    { "epoch": 4.51985559566787, "grad_norm": 12.104110717773438, "learning_rate": 2e-05, "loss": 6.0193, "step": 235 },
    { "epoch": 4.539109506618532, "grad_norm": 11.314817428588867, "learning_rate": 2e-05, "loss": 5.8347, "step": 236 },
    { "epoch": 4.558363417569193, "grad_norm": 9.530527114868164, "learning_rate": 2e-05, "loss": 5.9688, "step": 237 },
    { "epoch": 4.577617328519856, "grad_norm": 14.764016151428223, "learning_rate": 2e-05, "loss": 6.4083, "step": 238 },
    { "epoch": 4.596871239470517, "grad_norm": 12.319718360900879, "learning_rate": 2e-05, "loss": 6.2156, "step": 239 },
    { "epoch": 4.61612515042118, "grad_norm": 14.426762580871582, "learning_rate": 2e-05, "loss": 6.3868, "step": 240 },
    { "epoch": 4.635379061371841, "grad_norm": 11.573673248291016, "learning_rate": 2e-05, "loss": 6.2762, "step": 241 },
    { "epoch": 4.654632972322503, "grad_norm": 6.983804702758789, "learning_rate": 2e-05, "loss": 6.0903, "step": 242 },
    { "epoch": 4.673886883273164, "grad_norm": 8.895957946777344, "learning_rate": 2e-05, "loss": 6.3329, "step": 243 },
    { "epoch": 4.693140794223827, "grad_norm": 11.459211349487305, "learning_rate": 2e-05, "loss": 6.532, "step": 244 },
    { "epoch": 4.712394705174488, "grad_norm": 7.807408809661865, "learning_rate": 2e-05, "loss": 6.4127, "step": 245 },
    { "epoch": 4.731648616125151, "grad_norm": 11.404520034790039, "learning_rate": 2e-05, "loss": 6.031, "step": 246 },
    { "epoch": 4.750902527075812, "grad_norm": 5.500706672668457, "learning_rate": 2e-05, "loss": 6.3023, "step": 247 },
    { "epoch": 4.770156438026474, "grad_norm": 12.774914741516113, "learning_rate": 2e-05, "loss": 6.1873, "step": 248 },
    { "epoch": 4.789410348977136, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 6.1218, "step": 249 },
    { "epoch": 4.808664259927798, "grad_norm": 37.46445083618164, "learning_rate": 2e-05, "loss": 5.3935, "step": 250 },
    { "epoch": 4.808664259927798, "eval_clap": 0.10396266728639603, "eval_loss": 5.760514259338379, "eval_runtime": 196.0139, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 250 },
    { "epoch": 4.827918170878459, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 6.6959, "step": 251 },
    { "epoch": 4.847172081829122, "grad_norm": 19.07754135131836, "learning_rate": 2e-05, "loss": 5.9438, "step": 252 },
    { "epoch": 4.866425992779783, "grad_norm": 7.950329780578613, "learning_rate": 2e-05, "loss": 6.2661, "step": 253 },
    { "epoch": 4.885679903730445, "grad_norm": 6.870863437652588, "learning_rate": 2e-05, "loss": 6.3211, "step": 254 },
    { "epoch": 4.904933814681107, "grad_norm": 24.65618133544922, "learning_rate": 2e-05, "loss": 5.755, "step": 255 },
    { "epoch": 4.924187725631769, "grad_norm": 18.71941566467285, "learning_rate": 2e-05, "loss": 5.8712, "step": 256 },
    { "epoch": 4.943441636582431, "grad_norm": 22.989364624023438, "learning_rate": 2e-05, "loss": 5.9747, "step": 257 },
    { "epoch": 4.962695547533093, "grad_norm": 9.502798080444336, "learning_rate": 2e-05, "loss": 6.2862, "step": 258 },
    { "epoch": 4.981949458483754, "grad_norm": 7.668398857116699, "learning_rate": 2e-05, "loss": 6.0501, "step": 259 },
    { "epoch": 5.0, "grad_norm": 11.83493423461914, "learning_rate": 2e-05, "loss": 5.7551, "step": 260 },
    { "epoch": 5.0192539109506615, "grad_norm": 10.512737274169922, "learning_rate": 2e-05, "loss": 6.1863, "step": 261 },
    { "epoch": 5.038507821901324, "grad_norm": 11.778082847595215, "learning_rate": 2e-05, "loss": 6.2453, "step": 262 },
    { "epoch": 5.0577617328519855, "grad_norm": 8.815237998962402, "learning_rate": 2e-05, "loss": 6.0878, "step": 263 },
    { "epoch": 5.077015643802647, "grad_norm": 8.149435997009277, "learning_rate": 2e-05, "loss": 6.109, "step": 264 },
    { "epoch": 5.0962695547533094, "grad_norm": 17.911197662353516, "learning_rate": 2e-05, "loss": 6.5584, "step": 265 },
    { "epoch": 5.115523465703971, "grad_norm": 15.374232292175293, "learning_rate": 2e-05, "loss": 5.6938, "step": 266 },
    { "epoch": 5.1347773766546325, "grad_norm": 14.54749870300293, "learning_rate": 2e-05, "loss": 6.0208, "step": 267 },
    { "epoch": 5.154031287605295, "grad_norm": 22.222078323364258, "learning_rate": 2e-05, "loss": 6.6856, "step": 268 },
    { "epoch": 5.1732851985559565, "grad_norm": 14.544611930847168, "learning_rate": 2e-05, "loss": 6.5379, "step": 269 },
    { "epoch": 5.192539109506619, "grad_norm": 7.748518466949463, "learning_rate": 2e-05, "loss": 6.124, "step": 270 },
    { "epoch": 5.21179302045728, "grad_norm": 38.051204681396484, "learning_rate": 2e-05, "loss": 5.5926, "step": 271 },
    { "epoch": 5.231046931407942, "grad_norm": 8.9555025100708, "learning_rate": 2e-05, "loss": 6.2883, "step": 272 },
    { "epoch": 5.250300842358604, "grad_norm": 8.490158081054688, "learning_rate": 2e-05, "loss": 6.1882, "step": 273 },
    { "epoch": 5.269554753309266, "grad_norm": 23.794368743896484, "learning_rate": 2e-05, "loss": 5.7156, "step": 274 },
    { "epoch": 5.2888086642599275, "grad_norm": 9.640610694885254, "learning_rate": 2e-05, "loss": 6.0328, "step": 275 },
    { "epoch": 5.2888086642599275, "eval_clap": 0.10059554874897003, "eval_loss": 5.7012224197387695, "eval_runtime": 196.9351, "eval_samples_per_second": 0.162, "eval_steps_per_second": 0.162, "step": 275 },
    { "epoch": 5.30806257521059, "grad_norm": 10.774741172790527, "learning_rate": 2e-05, "loss": 6.0371, "step": 276 },
    { "epoch": 5.327316486161251, "grad_norm": 31.07454490661621, "learning_rate": 2e-05, "loss": 5.4832, "step": 277 },
    { "epoch": 5.346570397111913, "grad_norm": 20.25762939453125, "learning_rate": 2e-05, "loss": 5.829, "step": 278 },
    { "epoch": 5.365824308062575, "grad_norm": 14.205655097961426, "learning_rate": 2e-05, "loss": 5.8925, "step": 279 },
    { "epoch": 5.385078219013237, "grad_norm": 10.45982551574707, "learning_rate": 2e-05, "loss": 5.9619, "step": 280 },
    { "epoch": 5.404332129963899, "grad_norm": 10.787467956542969, "learning_rate": 2e-05, "loss": 6.2064, "step": 281 },
    { "epoch": 5.423586040914561, "grad_norm": 14.345209121704102, "learning_rate": 2e-05, "loss": 6.4463, "step": 282 },
    { "epoch": 5.442839951865222, "grad_norm": 11.769346237182617, "learning_rate": 2e-05, "loss": 6.2416, "step": 283 },
    { "epoch": 5.462093862815885, "grad_norm": 10.520780563354492, "learning_rate": 2e-05, "loss": 5.9882, "step": 284 },
    { "epoch": 5.481347773766546, "grad_norm": 9.365582466125488, "learning_rate": 2e-05, "loss": 6.1293, "step": 285 },
    { "epoch": 5.500601684717208, "grad_norm": 11.546175003051758, "learning_rate": 2e-05, "loss": 6.1385, "step": 286 },
    { "epoch": 5.51985559566787, "grad_norm": 8.784673690795898, "learning_rate": 2e-05, "loss": 5.9021, "step": 287 },
    { "epoch": 5.539109506618532, "grad_norm": 14.96414566040039, "learning_rate": 2e-05, "loss": 5.7696, "step": 288 },
    { "epoch": 5.558363417569193, "grad_norm": 10.28773307800293, "learning_rate": 2e-05, "loss": 5.8558, "step": 289 },
    { "epoch": 5.577617328519856, "grad_norm": 8.956218719482422, "learning_rate": 2e-05, "loss": 6.1751, "step": 290 },
    { "epoch": 5.596871239470517, "grad_norm": 8.992794036865234, "learning_rate": 2e-05, "loss": 5.9301, "step": 291 },
    { "epoch": 5.61612515042118, "grad_norm": 15.411934852600098, "learning_rate": 2e-05, "loss": 5.7903, "step": 292 },
    { "epoch": 5.635379061371841, "grad_norm": 17.23996925354004, "learning_rate": 2e-05, "loss": 5.7608, "step": 293 },
    { "epoch": 5.654632972322503, "grad_norm": 9.80339241027832, "learning_rate": 2e-05, "loss": 5.9715, "step": 294 },
    { "epoch": 5.673886883273164, "grad_norm": 9.020657539367676, "learning_rate": 2e-05, "loss": 5.9026, "step": 295 },
    { "epoch": 5.693140794223827, "grad_norm": 10.856344223022461, "learning_rate": 2e-05, "loss": 5.9779, "step": 296 },
    { "epoch": 5.712394705174488, "grad_norm": 8.707733154296875, "learning_rate": 2e-05, "loss": 6.1238, "step": 297 },
    { "epoch": 5.731648616125151, "grad_norm": 14.64277172088623, "learning_rate": 2e-05, "loss": 6.0237, "step": 298 },
    { "epoch": 5.750902527075812, "grad_norm": 11.925372123718262, "learning_rate": 2e-05, "loss": 6.393, "step": 299 },
    { "epoch": 5.770156438026474, "grad_norm": 8.235823631286621, "learning_rate": 2e-05, "loss": 6.1476, "step": 300 },
    { "epoch": 5.770156438026474, "eval_clap": 0.08804576098918915, "eval_loss": 5.670257568359375, "eval_runtime": 196.4571, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 300 },
    { "epoch": 5.789410348977136, "grad_norm": 10.681437492370605, "learning_rate": 2e-05, "loss": 6.5, "step": 301 },
    { "epoch": 5.808664259927798, "grad_norm": 7.725464820861816, "learning_rate": 2e-05, "loss": 6.3363, "step": 302 },
    { "epoch": 5.827918170878459, "grad_norm": 23.754108428955078, "learning_rate": 2e-05, "loss": 5.4885, "step": 303 },
    { "epoch": 5.847172081829122, "grad_norm": 16.799196243286133, "learning_rate": 2e-05, "loss": 5.8577, "step": 304 },
    { "epoch": 5.866425992779783, "grad_norm": 18.410417556762695, "learning_rate": 2e-05, "loss": 5.7364, "step": 305 },
    { "epoch": 5.885679903730445, "grad_norm": 13.101805686950684, "learning_rate": 2e-05, "loss": 6.1962, "step": 306 },
    { "epoch": 5.904933814681107, "grad_norm": 20.438919067382812, "learning_rate": 2e-05, "loss": 5.6507, "step": 307 },
    { "epoch": 5.924187725631769, "grad_norm": 16.583629608154297, "learning_rate": 2e-05, "loss": 5.6965, "step": 308 },
    { "epoch": 5.943441636582431, "grad_norm": 12.213188171386719, "learning_rate": 2e-05, "loss": 5.9619, "step": 309 },
    { "epoch": 5.962695547533093, "grad_norm": 10.092710494995117, "learning_rate": 2e-05, "loss": 5.9366, "step": 310 },
    { "epoch": 5.981949458483754, "grad_norm": 11.882303237915039, "learning_rate": 2e-05, "loss": 5.653, "step": 311 },
    { "epoch": 6.0, "grad_norm": 14.35622501373291, "learning_rate": 2e-05, "loss": 5.3527, "step": 312 },
    { "epoch": 6.0192539109506615, "grad_norm": 15.661309242248535, "learning_rate": 2e-05, "loss": 5.9525, "step": 313 },
    { "epoch": 6.038507821901324, "grad_norm": 20.924646377563477, "learning_rate": 2e-05, "loss": 6.5313, "step": 314 },
    { "epoch": 6.0577617328519855, "grad_norm": 14.876949310302734, "learning_rate": 2e-05, "loss": 5.6237, "step": 315 },
    { "epoch": 6.077015643802647, "grad_norm": 14.90765380859375, "learning_rate": 2e-05, "loss": 5.9514, "step": 316 },
    { "epoch": 6.0962695547533094, "grad_norm": 11.085007667541504, "learning_rate": 2e-05, "loss": 6.3012, "step": 317 },
    { "epoch": 6.115523465703971, "grad_norm": 7.102397918701172, "learning_rate": 2e-05, "loss": 6.2968, "step": 318 },
    { "epoch": 6.1347773766546325, "grad_norm": 10.636114120483398, "learning_rate": 2e-05, "loss": 6.0036, "step": 319 },
    { "epoch": 6.154031287605295, "grad_norm": 9.180078506469727, "learning_rate": 2e-05, "loss": 5.8668, "step": 320 },
    { "epoch": 6.1732851985559565, "grad_norm": 8.283641815185547, "learning_rate": 2e-05, "loss": 5.9111, "step": 321 },
    { "epoch": 6.192539109506619, "grad_norm": 16.489126205444336, "learning_rate": 2e-05, "loss": 5.7074, "step": 322 },
    { "epoch": 6.21179302045728, "grad_norm": 8.536720275878906, "learning_rate": 2e-05, "loss": 6.0873, "step": 323 },
    { "epoch": 6.231046931407942, "grad_norm": 19.378023147583008, "learning_rate": 2e-05, "loss": 5.7451, "step": 324 },
    { "epoch": 6.250300842358604, "grad_norm": 8.368978500366211, "learning_rate": 2e-05, "loss": 5.9049, "step": 325 },
    { "epoch": 6.250300842358604, "eval_clap": 0.12603802978992462, "eval_loss": 5.833277702331543, "eval_runtime": 196.2372, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.163, "step": 325 },
    { "epoch": 6.269554753309266, "grad_norm": 9.310341835021973, "learning_rate": 2e-05, "loss": 5.9592, "step": 326 },
    { "epoch": 6.2888086642599275, "grad_norm": 10.618816375732422, "learning_rate": 2e-05, "loss": 5.7331, "step": 327 },
    { "epoch": 6.30806257521059, "grad_norm": 20.578750610351562, "learning_rate": 2e-05, "loss": 5.2624, "step": 328 },
    { "epoch": 6.327316486161251, "grad_norm": 9.402525901794434, "learning_rate": 2e-05, "loss": 6.067, "step": 329 },
    { "epoch": 6.346570397111913, "grad_norm": 9.60403060913086, "learning_rate": 2e-05, "loss": 6.214, "step": 330 },
    { "epoch": 6.365824308062575, "grad_norm": 14.769834518432617, "learning_rate": 2e-05, "loss": 5.7947, "step": 331 },
    { "epoch": 6.385078219013237, "grad_norm": 7.659432411193848, "learning_rate": 2e-05, "loss": 5.8996, "step": 332 },
    { "epoch": 6.404332129963899, "grad_norm": 9.789624214172363, "learning_rate": 2e-05, "loss": 6.0822, "step": 333 },
    { "epoch": 6.423586040914561, "grad_norm": 13.061244010925293, "learning_rate": 2e-05, "loss": 5.8736, "step": 334 },
    { "epoch": 6.442839951865222, "grad_norm": 9.671375274658203, "learning_rate": 2e-05, "loss": 5.6454, "step": 335 },
    { "epoch": 6.462093862815885, "grad_norm": 7.602629661560059, "learning_rate": 2e-05, "loss": 6.1014, "step": 336 },
    { "epoch": 6.481347773766546, "grad_norm": 9.208328247070312, "learning_rate": 2e-05, "loss": 5.8553, "step": 337 },
    { "epoch": 6.500601684717208, "grad_norm": 6.751482963562012, "learning_rate": 2e-05, "loss": 6.3873, "step": 338 },
    { "epoch": 6.51985559566787, "grad_norm": 19.14604377746582, "learning_rate": 2e-05, "loss": 5.5979, "step": 339 },
    { "epoch": 6.539109506618532, "grad_norm": 6.979920387268066, "learning_rate": 2e-05, "loss": 5.9776, "step": 340 },
    { "epoch": 6.558363417569193, "grad_norm": 12.88497257232666, "learning_rate": 2e-05, "loss": 6.4447, "step": 341 },
    { "epoch": 6.577617328519856, "grad_norm": 9.550447463989258, "learning_rate": 2e-05, "loss": 6.0607, "step": 342 },
    { "epoch": 6.596871239470517, "grad_norm": 17.2004451751709, "learning_rate": 2e-05, "loss": 5.8465, "step": 343 },
    { "epoch": 6.61612515042118, "grad_norm": 9.696534156799316, "learning_rate": 2e-05, "loss": 5.7955, "step": 344 },
    { "epoch": 6.635379061371841, "grad_norm": 6.361357688903809, "learning_rate": 2e-05, "loss": 6.2663, "step": 345 },
    { "epoch": 6.654632972322503, "grad_norm": 28.56427574157715, "learning_rate": 2e-05, "loss": 5.0443, "step": 346 },
    { "epoch": 6.673886883273164, "grad_norm": 10.24687671661377, "learning_rate": 2e-05, "loss": 5.9321, "step": 347 },
    { "epoch": 6.693140794223827, "grad_norm": 8.435123443603516, "learning_rate": 2e-05, "loss": 6.1062, "step": 348 },
    { "epoch": 6.712394705174488, "grad_norm": 6.668614387512207, "learning_rate": 2e-05, "loss": 5.9755, "step": 349 },
    { "epoch": 6.731648616125151, "grad_norm": 8.491061210632324, "learning_rate": 2e-05, "loss": 5.8547, "step": 350 },
    { "epoch": 6.731648616125151, "eval_clap": 0.14147447049617767, "eval_loss": 5.917084693908691, "eval_runtime": 201.9514, "eval_samples_per_second": 0.158, "eval_steps_per_second": 0.158, "step": 350 },
    { "epoch": 6.750902527075812, "grad_norm": 10.25600528717041, "learning_rate": 2e-05, "loss": 5.8823, "step": 351 },
    { "epoch": 6.770156438026474, "grad_norm": 8.48415470123291, "learning_rate": 2e-05, "loss": 5.8709, "step": 352 },
    { "epoch": 6.789410348977136, "grad_norm": 18.866851806640625, "learning_rate": 2e-05, "loss": 5.7262, "step": 353 },
    { "epoch": 6.808664259927798, "grad_norm": 7.648865222930908, "learning_rate": 2e-05, "loss": 5.6922, "step": 354 },
    { "epoch": 6.827918170878459, "grad_norm": 12.319436073303223, "learning_rate": 2e-05, "loss": 6.1132, "step": 355 },
    { "epoch": 6.847172081829122, "grad_norm": 6.688267230987549, "learning_rate": 2e-05, "loss": 6.0394, "step": 356 },
    { "epoch": 6.866425992779783, "grad_norm": 8.100188255310059, "learning_rate": 2e-05, "loss": 5.7567, "step": 357 },
    { "epoch": 6.885679903730445, "grad_norm": 19.447267532348633, "learning_rate": 2e-05, "loss": 5.1313, "step": 358 },
    { "epoch": 6.904933814681107, "grad_norm": 8.269752502441406, "learning_rate": 2e-05, "loss": 5.6813, "step": 359 },
    { "epoch": 6.924187725631769, "grad_norm": 11.055656433105469, "learning_rate": 2e-05, "loss": 5.7952, "step": 360 },
    { "epoch": 6.943441636582431, "grad_norm": 12.430220603942871, "learning_rate": 2e-05, "loss": 5.4678, "step": 361 },
    { "epoch": 6.962695547533093, "grad_norm": 9.070528984069824, "learning_rate": 2e-05, "loss": 6.3099, "step": 362 },
    { "epoch": 6.981949458483754, "grad_norm": 11.570778846740723, "learning_rate": 2e-05, "loss": 5.9188, "step": 363 },
    { "epoch": 7.0, "grad_norm": 7.6774821281433105, "learning_rate": 2e-05, "loss": 5.7076, "step": 364 },
    { "epoch": 7.0192539109506615, "grad_norm": 7.454392910003662, "learning_rate": 2e-05, "loss": 6.3174, "step": 365 },
    { "epoch": 7.038507821901324, "grad_norm": 16.39904022216797, "learning_rate": 2e-05, "loss": 5.5487, "step": 366 },
    { "epoch": 7.0577617328519855, "grad_norm": 8.567000389099121, "learning_rate": 2e-05, "loss": 5.8364, "step": 367 },
    { "epoch": 7.077015643802647, "grad_norm": 11.228474617004395, "learning_rate": 2e-05, "loss": 6.1298, "step": 368 },
    { "epoch": 7.0962695547533094, "grad_norm": 8.90956974029541, "learning_rate": 2e-05, "loss": 5.8714, "step": 369 },
    { "epoch": 7.115523465703971, "grad_norm": 11.500467300415039, "learning_rate": 2e-05, "loss": 6.0238, "step": 370 },
    { "epoch": 7.1347773766546325, "grad_norm": 9.86660099029541, "learning_rate": 2e-05, "loss": 5.4866, "step": 371 },
    { "epoch": 7.154031287605295, "grad_norm": 15.263172149658203, "learning_rate": 2e-05, "loss": 5.6674, "step": 372 },
    { "epoch": 7.1732851985559565, "grad_norm": 9.802757263183594, "learning_rate": 2e-05, "loss": 6.2604, "step": 373 },
    { "epoch": 7.192539109506619, "grad_norm": 9.074177742004395, "learning_rate": 2e-05, "loss": 6.0723, "step": 374 },
    { "epoch": 7.21179302045728, "grad_norm": 13.025900840759277, "learning_rate": 2e-05, "loss": 5.5151, "step": 375 },
    { "epoch": 7.21179302045728, "eval_clap": 0.14877665042877197, "eval_loss": 6.030681610107422, "eval_runtime": 196.9659, "eval_samples_per_second": 0.162, "eval_steps_per_second": 0.162, "step": 375 },
    { "epoch": 7.231046931407942, "grad_norm": 8.610788345336914, "learning_rate": 2e-05, "loss": 5.4889, "step": 376 },
    { "epoch": 7.250300842358604, "grad_norm": 8.993633270263672, "learning_rate": 2e-05, "loss": 6.0395, "step": 377 },
    { "epoch": 7.269554753309266, "grad_norm": 11.314407348632812, "learning_rate": 2e-05, "loss": 5.7346, "step": 378 },
    { "epoch": 7.2888086642599275, "grad_norm": 6.467037677764893, "learning_rate": 2e-05, "loss": 5.7235, "step": 379 },
    { "epoch": 7.30806257521059, "grad_norm": 8.104763984680176, "learning_rate": 2e-05, "loss": 5.9586, "step": 380 },
    { "epoch": 7.327316486161251, "grad_norm": 7.068090915679932, "learning_rate": 2e-05, "loss": 5.9094, "step": 381 },
    { "epoch": 7.346570397111913, "grad_norm": 12.21611213684082, "learning_rate": 2e-05, "loss": 5.4139, "step": 382 },
    { "epoch": 7.365824308062575, "grad_norm": 7.650627136230469, "learning_rate": 2e-05, "loss": 5.9595, "step": 383 },
    { "epoch": 7.385078219013237, "grad_norm": 9.73534107208252, "learning_rate": 2e-05, "loss": 5.6262, "step": 384 },
    { "epoch": 7.404332129963899, "grad_norm": 4.8408989906311035, "learning_rate": 2e-05, "loss": 6.2293, "step": 385 },
    { "epoch": 7.423586040914561, "grad_norm": 8.489951133728027, "learning_rate": 2e-05, "loss": 5.7281, "step": 386 },
    { "epoch": 7.442839951865222, "grad_norm": 11.095331192016602, "learning_rate": 2e-05, "loss": 5.7553, "step": 387 },
    { "epoch": 7.462093862815885, "grad_norm": 11.590924263000488, "learning_rate": 2e-05, "loss": 5.5951, "step": 388 },
    { "epoch": 7.481347773766546, "grad_norm": 8.009002685546875, "learning_rate": 2e-05, "loss": 5.8574, "step": 389 },
    { "epoch": 7.500601684717208, "grad_norm": 6.733258247375488, "learning_rate": 2e-05, "loss": 5.9511, "step": 390 },
    { "epoch": 7.51985559566787,
|
"grad_norm": 12.49974250793457, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8766, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 7.539109506618532, |
|
"grad_norm": 8.18664264678955, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2401, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 7.558363417569193, |
|
"grad_norm": 7.806461811065674, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6931, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 7.577617328519856, |
|
"grad_norm": 11.137080192565918, |
|
"learning_rate": 2e-05, |
|
"loss": 5.447, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 7.596871239470517, |
|
"grad_norm": 7.18437385559082, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8764, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 7.61612515042118, |
|
"grad_norm": 7.777758598327637, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9383, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 7.635379061371841, |
|
"grad_norm": 10.361425399780273, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8001, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 7.654632972322503, |
|
"grad_norm": 6.713755130767822, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1663, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 7.673886883273164, |
|
"grad_norm": 10.869112014770508, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9626, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 7.693140794223827, |
|
"grad_norm": 7.977558135986328, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0651, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.693140794223827, |
|
"eval_clap": 0.15197913348674774, |
|
"eval_loss": 6.068316459655762, |
|
"eval_runtime": 198.5968, |
|
"eval_samples_per_second": 0.161, |
|
"eval_steps_per_second": 0.161, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.712394705174488, |
|
"grad_norm": 9.780762672424316, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5897, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 7.731648616125151, |
|
"grad_norm": 6.951937198638916, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2431, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 7.750902527075812, |
|
"grad_norm": 9.692300796508789, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4305, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 7.770156438026474, |
|
"grad_norm": 7.418123245239258, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8003, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 7.789410348977136, |
|
"grad_norm": 6.651110649108887, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7815, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 7.808664259927798, |
|
"grad_norm": 6.156674861907959, |
|
"learning_rate": 2e-05, |
|
"loss": 6.4313, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 7.827918170878459, |
|
"grad_norm": 8.222352981567383, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2155, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 7.847172081829122, |
|
"grad_norm": 16.708091735839844, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5932, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 7.866425992779783, |
|
"grad_norm": 7.200738906860352, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7048, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 7.885679903730445, |
|
"grad_norm": 7.8692450523376465, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0216, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 7.904933814681107, |
|
"grad_norm": 9.36465835571289, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5713, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 7.924187725631769, |
|
"grad_norm": 6.110987186431885, |
|
"learning_rate": 2e-05, |
|
"loss": 5.972, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 7.943441636582431, |
|
"grad_norm": 10.555682182312012, |
|
"learning_rate": 2e-05, |
|
"loss": 5.1856, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 7.962695547533093, |
|
"grad_norm": 8.164458274841309, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9857, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 7.981949458483754, |
|
"grad_norm": 6.370028018951416, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2021, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 7.894473552703857, |
|
"learning_rate": 2e-05, |
|
"loss": 5.1235, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 8.019253910950662, |
|
"grad_norm": 8.825623512268066, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7924, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 8.038507821901323, |
|
"grad_norm": 10.459378242492676, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8566, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 8.057761732851986, |
|
"grad_norm": 7.0936713218688965, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2791, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 8.077015643802648, |
|
"grad_norm": 6.6698079109191895, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1026, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.09626955475331, |
|
"grad_norm": 12.212193489074707, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4735, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 8.115523465703971, |
|
"grad_norm": 10.958526611328125, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3301, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 8.134777376654633, |
|
"grad_norm": 9.644707679748535, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2481, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 8.154031287605294, |
|
"grad_norm": 7.314677715301514, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6373, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 8.173285198555957, |
|
"grad_norm": 6.350398063659668, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0339, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 8.173285198555957, |
|
"eval_clap": 0.14958661794662476, |
|
"eval_loss": 6.165280342102051, |
|
"eval_runtime": 204.1039, |
|
"eval_samples_per_second": 0.157, |
|
"eval_steps_per_second": 0.157, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 8.192539109506619, |
|
"grad_norm": 5.304975509643555, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1317, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 8.21179302045728, |
|
"grad_norm": 6.82647705078125, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4539, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 8.231046931407942, |
|
"grad_norm": 8.981968879699707, |
|
"learning_rate": 2e-05, |
|
"loss": 5.739, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 8.250300842358604, |
|
"grad_norm": 9.646061897277832, |
|
"learning_rate": 2e-05, |
|
"loss": 5.737, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 8.269554753309265, |
|
"grad_norm": 9.535126686096191, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1408, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.288808664259928, |
|
"grad_norm": 6.139739513397217, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2988, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 8.30806257521059, |
|
"grad_norm": 6.903513431549072, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0573, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 8.327316486161251, |
|
"grad_norm": 11.271062850952148, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2397, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 8.346570397111913, |
|
"grad_norm": 6.281597137451172, |
|
"learning_rate": 2e-05, |
|
"loss": 6.16, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 8.365824308062574, |
|
"grad_norm": 7.392561912536621, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0408, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 8.385078219013238, |
|
"grad_norm": 13.390826225280762, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5365, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 8.4043321299639, |
|
"grad_norm": 6.831297397613525, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9492, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 8.42358604091456, |
|
"grad_norm": 8.311152458190918, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0555, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 8.442839951865222, |
|
"grad_norm": 5.671159267425537, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1021, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 8.462093862815884, |
|
"grad_norm": 12.393537521362305, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3335, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.481347773766545, |
|
"grad_norm": 7.936432361602783, |
|
"learning_rate": 2e-05, |
|
"loss": 5.537, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 8.500601684717209, |
|
"grad_norm": 8.93083667755127, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6945, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 8.51985559566787, |
|
"grad_norm": 7.8789238929748535, |
|
"learning_rate": 2e-05, |
|
"loss": 6.4316, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 8.539109506618532, |
|
"grad_norm": 7.957115650177002, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4203, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 8.558363417569193, |
|
"grad_norm": 6.194324016571045, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8967, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 8.577617328519855, |
|
"grad_norm": 9.573834419250488, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7136, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 8.596871239470518, |
|
"grad_norm": 8.004561424255371, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5264, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 8.61612515042118, |
|
"grad_norm": 6.711611747741699, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0729, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 8.635379061371841, |
|
"grad_norm": 8.102109909057617, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3048, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 8.654632972322503, |
|
"grad_norm": 11.3334379196167, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5607, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 8.654632972322503, |
|
"eval_clap": 0.1344371885061264, |
|
"eval_loss": 6.358816146850586, |
|
"eval_runtime": 205.1551, |
|
"eval_samples_per_second": 0.156, |
|
"eval_steps_per_second": 0.156, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 8.673886883273164, |
|
"grad_norm": 7.845404148101807, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6627, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 8.693140794223826, |
|
"grad_norm": 5.972433090209961, |
|
"learning_rate": 2e-05, |
|
"loss": 6.3562, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 8.71239470517449, |
|
"grad_norm": 9.84822940826416, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4597, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 8.73164861612515, |
|
"grad_norm": 9.735132217407227, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9511, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 8.750902527075812, |
|
"grad_norm": 8.289355278015137, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7793, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 8.770156438026474, |
|
"grad_norm": 10.302011489868164, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7178, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 8.789410348977135, |
|
"grad_norm": 12.201030731201172, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4838, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 8.808664259927799, |
|
"grad_norm": 8.123917579650879, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6764, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 8.82791817087846, |
|
"grad_norm": 7.225223064422607, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7118, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 8.847172081829122, |
|
"grad_norm": 10.377509117126465, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6576, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.866425992779783, |
|
"grad_norm": 7.475393772125244, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3861, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 8.885679903730445, |
|
"grad_norm": 11.457231521606445, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8163, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 8.904933814681106, |
|
"grad_norm": 5.9624199867248535, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0883, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 8.92418772563177, |
|
"grad_norm": 15.740527153015137, |
|
"learning_rate": 2e-05, |
|
"loss": 5.1762, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 8.943441636582431, |
|
"grad_norm": 6.486323833465576, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0539, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 8.962695547533093, |
|
"grad_norm": 6.787285804748535, |
|
"learning_rate": 2e-05, |
|
"loss": 6.007, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 8.981949458483754, |
|
"grad_norm": 9.911301612854004, |
|
"learning_rate": 2e-05, |
|
"loss": 5.0872, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 7.354274749755859, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5188, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 9.019253910950662, |
|
"grad_norm": 7.0421881675720215, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6728, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 9.038507821901323, |
|
"grad_norm": 8.74863052368164, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5255, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.057761732851986, |
|
"grad_norm": 6.350273609161377, |
|
"learning_rate": 2e-05, |
|
"loss": 5.7792, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 9.077015643802648, |
|
"grad_norm": 5.793376922607422, |
|
"learning_rate": 2e-05, |
|
"loss": 6.019, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 9.09626955475331, |
|
"grad_norm": 7.126486778259277, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5166, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 9.115523465703971, |
|
"grad_norm": 7.304249286651611, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2965, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 9.134777376654633, |
|
"grad_norm": 6.9295172691345215, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0312, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 9.134777376654633, |
|
"eval_clap": 0.13584719598293304, |
|
"eval_loss": 6.450309753417969, |
|
"eval_runtime": 205.0145, |
|
"eval_samples_per_second": 0.156, |
|
"eval_steps_per_second": 0.156, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 9.154031287605294, |
|
"grad_norm": 8.397627830505371, |
|
"learning_rate": 2e-05, |
|
"loss": 5.1701, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 9.173285198555957, |
|
"grad_norm": 8.74401569366455, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9924, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 9.192539109506619, |
|
"grad_norm": 4.799812316894531, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1081, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 9.21179302045728, |
|
"grad_norm": 8.459375381469727, |
|
"learning_rate": 2e-05, |
|
"loss": 5.1495, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 9.231046931407942, |
|
"grad_norm": 12.414017677307129, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3805, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.250300842358604, |
|
"grad_norm": 7.55842924118042, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 9.269554753309265, |
|
"grad_norm": 7.019160270690918, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0315, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 9.288808664259928, |
|
"grad_norm": 6.8493852615356445, |
|
"learning_rate": 2e-05, |
|
"loss": 5.989, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 9.30806257521059, |
|
"grad_norm": 6.040197372436523, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1547, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 9.327316486161251, |
|
"grad_norm": 6.4489006996154785, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1476, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 9.346570397111913, |
|
"grad_norm": 6.498973846435547, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6649, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 9.365824308062574, |
|
"grad_norm": 7.927713394165039, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4862, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 9.385078219013238, |
|
"grad_norm": 7.368226051330566, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8106, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 9.4043321299639, |
|
"grad_norm": 8.220806121826172, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9079, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 9.42358604091456, |
|
"grad_norm": 9.574630737304688, |
|
"learning_rate": 2e-05, |
|
"loss": 6.2155, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.442839951865222, |
|
"grad_norm": 6.782036304473877, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0144, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 9.462093862815884, |
|
"grad_norm": 8.27569580078125, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4657, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 9.481347773766545, |
|
"grad_norm": 9.7858304977417, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4273, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 9.500601684717209, |
|
"grad_norm": 6.2194905281066895, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0479, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 9.51985559566787, |
|
"grad_norm": 8.673853874206543, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8649, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 9.539109506618532, |
|
"grad_norm": 7.277229309082031, |
|
"learning_rate": 2e-05, |
|
"loss": 5.4538, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 9.558363417569193, |
|
"grad_norm": 7.213720798492432, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9019, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 9.577617328519855, |
|
"grad_norm": 6.2223896980285645, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0874, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 9.596871239470518, |
|
"grad_norm": 7.928698539733887, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5944, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 9.61612515042118, |
|
"grad_norm": 6.528334617614746, |
|
"learning_rate": 2e-05, |
|
"loss": 5.9084, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.61612515042118, |
|
"eval_clap": 0.1080508604645729, |
|
"eval_loss": 6.520473003387451, |
|
"eval_runtime": 205.013, |
|
"eval_samples_per_second": 0.156, |
|
"eval_steps_per_second": 0.156, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.635379061371841, |
|
"grad_norm": 5.430109024047852, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1004, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 9.654632972322503, |
|
"grad_norm": 5.635249614715576, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0965, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 9.673886883273164, |
|
"grad_norm": 11.477826118469238, |
|
"learning_rate": 2e-05, |
|
"loss": 5.257, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 9.693140794223826, |
|
"grad_norm": 13.920645713806152, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6852, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 9.71239470517449, |
|
"grad_norm": 7.871194362640381, |
|
"learning_rate": 2e-05, |
|
"loss": 5.635, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 9.73164861612515, |
|
"grad_norm": 12.229103088378906, |
|
"learning_rate": 2e-05, |
|
"loss": 5.6566, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 9.750902527075812, |
|
"grad_norm": 6.054317474365234, |
|
"learning_rate": 2e-05, |
|
"loss": 6.1042, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 9.770156438026474, |
|
"grad_norm": 7.183253288269043, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8351, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 9.789410348977135, |
|
"grad_norm": 11.31477165222168, |
|
"learning_rate": 2e-05, |
|
"loss": 5.8469, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 9.808664259927799, |
|
"grad_norm": 8.236515045166016, |
|
"learning_rate": 2e-05, |
|
"loss": 5.3767, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 9.808664259927799, |
|
"step": 510, |
|
"total_flos": 1697829404754480.0, |
|
"train_loss": 6.270190904654709, |
|
"train_runtime": 7324.9933, |
|
"train_samples_per_second": 1.134, |
|
"train_steps_per_second": 0.07 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 510, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1697829404754480.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|
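
Note: the JSON above is a standard Hugging Face transformers trainer_state.json. As a minimal sketch (not part of the training run; it assumes the file is saved as "trainer_state.json" in the working directory and that matplotlib is installed, neither of which the file itself states), the following Python separates the per-step training records from the periodic eval records and plots them, which is easier to read than scanning the raw log_history:

import json
import matplotlib.pyplot as plt

# Load the saved trainer state (the path is an assumption, not taken from the file).
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training records carry "loss"; eval records carry "eval_loss" and "eval_clap".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

fig, (ax_loss, ax_clap) = plt.subplots(2, 1, sharex=True)
ax_loss.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs], label="train loss")
ax_loss.plot([e["step"] for e in eval_logs], [e["eval_loss"] for e in eval_logs], marker="o", label="eval loss")
ax_loss.set_ylabel("loss")
ax_loss.legend()
ax_clap.plot([e["step"] for e in eval_logs], [e["eval_clap"] for e in eval_logs], marker="o")
ax_clap.set_xlabel("step")
ax_clap.set_ylabel("eval_clap")
fig.tight_layout()
plt.show()

Plotted this way, the run's trend is plain to see: eval_loss rises monotonically from about 5.92 at step 350 to 6.52 at step 500 while eval_clap peaks near 0.152 at step 400, so the most useful checkpoint likely comes well before the final step 510.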