|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.0, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.7934532165527344, |
|
"learning_rate": 0.0002, |
|
"loss": 2.3404, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5168005228042603, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0804, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.047807216644287, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9184, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.041599988937378, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0393, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8074644804000854, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1779, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7784727811813354, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1583, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8535248637199402, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0766, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8507911562919617, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1175, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8497746586799622, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1156, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9205197691917419, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1782, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2332911491394043, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0536, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.3457396030426025, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2186, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6494730114936829, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8011, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6363134980201721, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8819, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7927612662315369, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8845, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7082176804542542, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8748, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.861709713935852, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8777, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7901681661605835, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8811, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7719288468360901, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8898, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.027469277381897, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0089, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9486727714538574, |
|
"learning_rate": 0.0002, |
|
"loss": 1.968, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9629890322685242, |
|
"learning_rate": 0.0002, |
|
"loss": 2.0568, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.033793568611145, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8401, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3298218250274658, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9165, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6936089396476746, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6614, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6136096119880676, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8167, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6043046712875366, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7217, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6395452618598938, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7353, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6829009056091309, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7708, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8561712503433228, |
|
"learning_rate": 0.0002, |
|
"loss": 1.774, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7594190239906311, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8788, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.867341160774231, |
|
"learning_rate": 0.0002, |
|
"loss": 1.8708, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.9393973350524902, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9839, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0540133714675903, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7637, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.2020256519317627, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9187, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.7588919401168823, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5851, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.9404975175857544, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7903, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7744253873825073, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7476, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.7260447144508362, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5547, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9214150905609131, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6342, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.834932267665863, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6577, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8663449883460999, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7133, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9534509181976318, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7999, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.058899164199829, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5964, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1835004091262817, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7596, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.2041128873825073, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6529, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5300588607788086, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7852, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.6037429571151733, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6873, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.6437931656837463, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5729, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.1996123790740967, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5609, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.6186010837554932, |
|
"eval_runtime": 565.3333, |
|
"eval_samples_per_second": 0.708, |
|
"eval_steps_per_second": 0.177, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.6480159163475037, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5754, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.8269457817077637, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4743, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.1054280996322632, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3447, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.9144314527511597, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3652, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.8429620862007141, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4696, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.3091776371002197, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3109, |
|
"step": 56 |
|
}, |
|
{

"epoch": 1.14,
|
"grad_norm": 1.2086460590362549, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3424, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.1823766231536865, |
|
"learning_rate": 0.0002, |
|
"loss": 1.213, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.817803144454956, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2911, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.2870073318481445, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2712, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.2424544095993042, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2292, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.4258471727371216, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0814, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.1297271251678467, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5303, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.8728504776954651, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4354, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.7809789776802063, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4398, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.844166100025177, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2171, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.8636218905448914, |
|
"learning_rate": 0.0002, |
|
"loss": 1.23, |
|
"step": 67 |
|
}, |
|
{

"epoch": 1.36,
|
"grad_norm": 0.9831591248512268, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2496, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.4268325567245483, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2354, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.6133723258972168, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2123, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.5462720394134521, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0258, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.1962395906448364, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0858, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.413921594619751, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0169, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.442657470703125, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9553, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.9919085502624512, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2394, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.988468587398529, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3642, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.9793186187744141, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2818, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.7799855470657349, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1705, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.8288784027099609, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0484, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.064773440361023, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1608, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.0099600553512573, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1873, |
|
"step": 81 |
|
}, |
|
{

"epoch": 1.64,
|
"grad_norm": 1.9040124416351318, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9739, |
|
"step": 82 |
|
}, |
|
{

"epoch": 1.66,
|
"grad_norm": 1.2448644638061523, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9418, |
|
"step": 83 |
|
}, |
|
{

"epoch": 1.68,
|
"grad_norm": 1.2129086256027222, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8821, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.6727265119552612, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9965, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.6569440364837646, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9182, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.8596146702766418, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3188, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.8928490281105042, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3601, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.7409713268280029, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1212, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.8979334831237793, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4162, |
|
"step": 90 |
|
}, |
|
{

"epoch": 1.82,
|
"grad_norm": 0.979978621006012, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1969, |
|
"step": 91 |
|
}, |
|
{

"epoch": 1.84,
|
"grad_norm": 0.9733594059944153, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0468, |
|
"step": 92 |
|
}, |
|
{

"epoch": 1.86,
|
"grad_norm": 0.9226842522621155, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1807, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.1638745069503784, |
|
"learning_rate": 0.0002, |
|
"loss": 1.139, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.5604937076568604, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1872, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.3674428462982178, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1865, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.8469598293304443, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0469, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.3148952722549438, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9915, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.599141001701355, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2296, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.3382114171981812, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1813, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.409305453300476, |
|
"eval_runtime": 565.9517, |
|
"eval_samples_per_second": 0.707, |
|
"eval_steps_per_second": 0.177, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.0162380933761597, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1481, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.7402092814445496, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0086, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.8824872970581055, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0588, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.7582442760467529, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8181, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.0200812816619873, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9041, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.08174467086792, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8479, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.01225745677948, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7659, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.2194840908050537, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7926, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.0519524812698364, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6604, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.2860150337219238, |
|
"learning_rate": 0.0002, |
|
"loss": 0.663, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.5521994829177856, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7791, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.455283284187317, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5112, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.7097278833389282, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2219, |
|
"step": 113 |
|
}, |
|
{

"epoch": 2.28,
|
"grad_norm": 1.5385531187057495, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1261, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.0525436401367188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.86, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.0388120412826538, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9022, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.060497760772705, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9265, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.0629950761795044, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7222, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 1.2574018239974976, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7952, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.0951610803604126, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6647, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.46285879611969, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7845, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.3611388206481934, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7084, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.6670907735824585, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6594, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.1525955200195312, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6401, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.5126793384552, |
|
"learning_rate": 0.0002, |
|
"loss": 0.926, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.800521969795227, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0936, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 1.0617576837539673, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8052, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.0823312997817993, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9443, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.2193264961242676, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7955, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.0502954721450806, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8365, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.1898560523986816, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9706, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.1076680421829224, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7529, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.3826709985733032, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7474, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.2504832744598389, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7086, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.6292765140533447, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6305, |
|
"step": 135 |
|
}, |
|
{

"epoch": 2.72,
|
"grad_norm": 1.9603074789047241, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6834, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.202030897140503, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1712, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.6344685554504395, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0772, |
|
"step": 138 |
|
}, |
|
{

"epoch": 2.78,
|
"grad_norm": 1.3579537868499756, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8803, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.0554553270339966, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9222, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.9431642889976501, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8031, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.0826098918914795, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8259, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.24959135055542, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7957, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.1057368516921997, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7079, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.144061803817749, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7165, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.0690631866455078, |
|
"learning_rate": 0.0002, |
|
"loss": 0.601, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.292758584022522, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7191, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.729408860206604, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5851, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.078197717666626, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8942, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.0007128715515137, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7139, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.4874461889266968, |
|
"eval_runtime": 566.1506, |
|
"eval_samples_per_second": 0.707, |
|
"eval_steps_per_second": 0.177, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 1.024670958518982, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8969, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.9738882184028625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9056, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.9969688653945923, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6676, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.12136971950531, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5916, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.3517699241638184, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5908, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.5965360403060913, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6148, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.3009252548217773, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4902, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.2742400169372559, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4515, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.2994771003723145, |
|
"learning_rate": 0.0002, |
|
"loss": 0.416, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.3306324481964111, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4144, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.5406475067138672, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4256, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.584506630897522, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4511, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.6618622541427612, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8865, |
|
"step": 163 |
|
}, |
|
{

"epoch": 3.28,
|
"grad_norm": 1.6019847393035889, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7339, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 1.1740251779556274, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6945, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.1268410682678223, |
|
"learning_rate": 0.0002, |
|
"loss": 0.579, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.3038002252578735, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5217, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.112185001373291, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4766, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.3828542232513428, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4781, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.1456600427627563, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4056, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.2479093074798584, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4447, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.4044010639190674, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3814, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.565138339996338, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3982, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.4442418813705444, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4262, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.1203701496124268, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8025, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.3620504140853882, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9045, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.5145343542099, |
|
"learning_rate": 0.0002, |
|
"loss": 0.703, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.333682656288147, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6285, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.4228661060333252, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6004, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.2111386060714722, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4754, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.410719394683838, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5324, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.4157259464263916, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4556, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 1.3982216119766235, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4465, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.4364334344863892, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4313, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.5408861637115479, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4108, |
|
"step": 185 |
|
}, |
|
{

"epoch": 3.72,
|
"grad_norm": 1.5500551462173462, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4664, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.1150060892105103, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8951, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.0168464183807373, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8619, |
|
"step": 188 |
|
}, |
|
{

"epoch": 3.78,
|
"grad_norm": 1.2093026638031006, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5782, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.3905984163284302, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6929, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 1.3665902614593506, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5701, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.1478445529937744, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4515, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 1.2758458852767944, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5819, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.0731329917907715, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4448, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.20659339427948, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4295, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.3976835012435913, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4566, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.617711067199707, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4303, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.707471489906311, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4539, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 1.2962028980255127, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5971, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.8809109926223755, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4973, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.5414550304412842, |
|
"eval_runtime": 565.7272, |
|
"eval_samples_per_second": 0.707, |
|
"eval_steps_per_second": 0.177, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.9540490508079529, |
|
"learning_rate": 0.0002, |
|
"loss": 0.56, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.0443426370620728, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6562, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.020203948020935, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4533, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 1.5309128761291504, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3575, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 1.7135676145553589, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3286, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.602728247642517, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2556, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 1.8623350858688354, |
|
"learning_rate": 0.0002, |
|
"loss": 0.314, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.5630223751068115, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2716, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 1.3671077489852905, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2506, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 1.0884723663330078, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2473, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 1.193832516670227, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2836, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.0041422843933105, |
|
"learning_rate": 0.0002, |
|
"loss": 0.391, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 1.013597846031189, |
|
"learning_rate": 0.0002, |
|
"loss": 0.628, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.9650751948356628, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5202, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 1.0781069993972778, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4967, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 1.1297317743301392, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4154, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 1.2913479804992676, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3014, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 1.4399878978729248, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3344, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.4960243701934814, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2894, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.925826072692871, |
|
"learning_rate": 0.0002, |
|
"loss": 0.281, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.6930102109909058, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2512, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.6776522397994995, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2744, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 1.3323974609375, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2951, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 1.2120009660720825, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3707, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 1.2817238569259644, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6035, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 1.1797271966934204, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5878, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.9533390402793884, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3458, |
|
"step": 227 |
|
}, |
|
{

"epoch": 4.56,
|
"grad_norm": 1.0915648937225342, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3617, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.3463889360427856, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3853, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.457556128501892, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3343, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 1.681526780128479, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3132, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 1.7101032733917236, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2913, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 2.1125667095184326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.304, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 1.5824134349822998, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2878, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 1.5257948637008667, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3049, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.1413626670837402, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3875, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 1.4785950183868408, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7915, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.0445829629898071, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4765, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 1.0932363271713257, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4081, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.313068151473999, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4278, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 1.2771199941635132, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3481, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 1.3306118249893188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3301, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 1.2204334735870361, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2974, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.1585593223571777, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2404, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.6888794898986816, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2817, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.4956034421920776, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2583, |
|
"step": 246 |
|
}, |
|
{

"epoch": 4.94,
|
"grad_norm": 1.6130638122558594, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2888, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 2.280722141265869, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3732, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 1.8177040815353394, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6086, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.674232840538025, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3232, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.6614739894866943, |
|
"eval_runtime": 565.5774, |
|
"eval_samples_per_second": 0.707, |
|
"eval_steps_per_second": 0.177, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 1.1646928787231445, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4812, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 1.0134564638137817, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3754, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 1.0580028295516968, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3408, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 0.9608296751976013, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1999, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 1.2543963193893433, |
|
"learning_rate": 0.0002, |
|
"loss": 0.208, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 1.2029474973678589, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1838, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 1.3406902551651, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1828, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 1.3970423936843872, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1882, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 1.5742741823196411, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2117, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 1.4690401554107666, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2133, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 1.337886095046997, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2375, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 1.3524601459503174, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3295, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"grad_norm": 1.533918023109436, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4466, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 1.3888379335403442, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3435, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 1.0731428861618042, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2129, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 1.0117309093475342, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2522, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 1.1278893947601318, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2326, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 1.1024526357650757, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1924, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 1.3139152526855469, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2193, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 1.2390962839126587, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2005, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": 1.5270419120788574, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2245, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 1.48665452003479, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2317, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 1.4966135025024414, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2785, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 1.2391105890274048, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3812, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 1.5869184732437134, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4248, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 1.3854241371154785, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3517, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 1.2200820446014404, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2956, |
|
"step": 277 |
|
}, |
|
{

"epoch": 5.56,
|
"grad_norm": 1.0377618074417114, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2488, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 1.1551989316940308, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2139, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.1790027618408203, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2154, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 1.4331631660461426, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2332, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 1.3983947038650513, |
|
"learning_rate": 0.0002, |
|
"loss": 0.235, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"grad_norm": 1.4608796834945679, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2068, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 1.3449772596359253, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2145, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 1.5349823236465454, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2537, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.5507280826568604, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3617, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 1.4181163311004639, |
|
"learning_rate": 0.0002, |
|
"loss": 0.428, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 1.1474294662475586, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3593, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 1.0448319911956787, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2488, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 1.2155684232711792, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2429, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 1.037227749824524, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1951, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 1.0879547595977783, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2003, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 1.2885050773620605, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2133, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 1.335684061050415, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2037, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 1.3747302293777466, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2276, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 1.3095324039459229, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2255, |
|
"step": 296 |
|
}, |
|
{

"epoch": 5.94,
|
"grad_norm": 1.3633639812469482, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2837, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 1.3497192859649658, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3738, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 1.4940162897109985, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3705, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.4202619791030884, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2504, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.7531368732452393, |
|
"eval_runtime": 566.1878, |
|
"eval_samples_per_second": 0.706, |
|
"eval_steps_per_second": 0.177, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1321894941576397e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |