{
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 0.03918367346938775, | |
"eval_steps": 50, | |
"global_step": 1200, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 3.265306122448979e-05, | |
"grad_norm": 12.553070068359375, | |
"learning_rate": 2.0000000000000002e-07, | |
"loss": 2.3591, | |
"step": 1 | |
}, | |
{ | |
"epoch": 6.530612244897959e-05, | |
"grad_norm": 12.028986930847168, | |
"learning_rate": 4.0000000000000003e-07, | |
"loss": 2.3144, | |
"step": 2 | |
}, | |
{ | |
"epoch": 9.79591836734694e-05, | |
"grad_norm": 12.335774421691895, | |
"learning_rate": 6.000000000000001e-07, | |
"loss": 2.3323, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.00013061224489795917, | |
"grad_norm": 12.695511817932129, | |
"learning_rate": 8.000000000000001e-07, | |
"loss": 2.3651, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.00016326530612244898, | |
"grad_norm": 12.702936172485352, | |
"learning_rate": 1.0000000000000002e-06, | |
"loss": 2.3563, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.0001959183673469388, | |
"grad_norm": 11.689526557922363, | |
"learning_rate": 1.2000000000000002e-06, | |
"loss": 2.3263, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.00022857142857142857, | |
"grad_norm": 11.332144737243652, | |
"learning_rate": 1.4000000000000001e-06, | |
"loss": 2.2861, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.00026122448979591835, | |
"grad_norm": 11.530237197875977, | |
"learning_rate": 1.6000000000000001e-06, | |
"loss": 2.2867, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.0002938775510204082, | |
"grad_norm": 10.193593978881836, | |
"learning_rate": 1.8000000000000001e-06, | |
"loss": 2.2521, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.00032653061224489796, | |
"grad_norm": 9.98508071899414, | |
"learning_rate": 2.0000000000000003e-06, | |
"loss": 2.249, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.00035918367346938774, | |
"grad_norm": 9.584476470947266, | |
"learning_rate": 2.2e-06, | |
"loss": 2.2116, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.0003918367346938776, | |
"grad_norm": 9.574864387512207, | |
"learning_rate": 2.4000000000000003e-06, | |
"loss": 2.2343, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.00042448979591836735, | |
"grad_norm": 8.929769515991211, | |
"learning_rate": 2.6e-06, | |
"loss": 2.1599, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.00045714285714285713, | |
"grad_norm": 8.262470245361328, | |
"learning_rate": 2.8000000000000003e-06, | |
"loss": 2.1152, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.0004897959183673469, | |
"grad_norm": 8.28957748413086, | |
"learning_rate": 3e-06, | |
"loss": 2.1048, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.0005224489795918367, | |
"grad_norm": 8.160731315612793, | |
"learning_rate": 3.2000000000000003e-06, | |
"loss": 2.054, | |
"step": 16 | |
}, | |
{ | |
"epoch": 0.0005551020408163266, | |
"grad_norm": 7.5022077560424805, | |
"learning_rate": 3.4000000000000005e-06, | |
"loss": 2.0559, | |
"step": 17 | |
}, | |
{ | |
"epoch": 0.0005877551020408164, | |
"grad_norm": 7.718921661376953, | |
"learning_rate": 3.6000000000000003e-06, | |
"loss": 2.003, | |
"step": 18 | |
}, | |
{ | |
"epoch": 0.0006204081632653061, | |
"grad_norm": 7.6420392990112305, | |
"learning_rate": 3.8000000000000005e-06, | |
"loss": 1.9646, | |
"step": 19 | |
}, | |
{ | |
"epoch": 0.0006530612244897959, | |
"grad_norm": 7.363818645477295, | |
"learning_rate": 4.000000000000001e-06, | |
"loss": 1.918, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.0006857142857142857, | |
"grad_norm": 7.367091178894043, | |
"learning_rate": 4.2000000000000004e-06, | |
"loss": 1.9065, | |
"step": 21 | |
}, | |
{ | |
"epoch": 0.0007183673469387755, | |
"grad_norm": 7.201758861541748, | |
"learning_rate": 4.4e-06, | |
"loss": 1.8989, | |
"step": 22 | |
}, | |
{ | |
"epoch": 0.0007510204081632653, | |
"grad_norm": 7.093530654907227, | |
"learning_rate": 4.600000000000001e-06, | |
"loss": 1.8381, | |
"step": 23 | |
}, | |
{ | |
"epoch": 0.0007836734693877551, | |
"grad_norm": 7.22179651260376, | |
"learning_rate": 4.800000000000001e-06, | |
"loss": 1.7959, | |
"step": 24 | |
}, | |
{ | |
"epoch": 0.0008163265306122449, | |
"grad_norm": 7.264500141143799, | |
"learning_rate": 5e-06, | |
"loss": 1.7654, | |
"step": 25 | |
}, | |
{ | |
"epoch": 0.0008489795918367347, | |
"grad_norm": 6.931406021118164, | |
"learning_rate": 5.2e-06, | |
"loss": 1.6971, | |
"step": 26 | |
}, | |
{ | |
"epoch": 0.0008816326530612245, | |
"grad_norm": 6.8040547370910645, | |
"learning_rate": 5.400000000000001e-06, | |
"loss": 1.6738, | |
"step": 27 | |
}, | |
{ | |
"epoch": 0.0009142857142857143, | |
"grad_norm": 7.414416313171387, | |
"learning_rate": 5.600000000000001e-06, | |
"loss": 1.6131, | |
"step": 28 | |
}, | |
{ | |
"epoch": 0.000946938775510204, | |
"grad_norm": 8.00181770324707, | |
"learning_rate": 5.8e-06, | |
"loss": 1.5743, | |
"step": 29 | |
}, | |
{ | |
"epoch": 0.0009795918367346938, | |
"grad_norm": 8.280529022216797, | |
"learning_rate": 6e-06, | |
"loss": 1.5139, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.0010122448979591836, | |
"grad_norm": 24.16852378845215, | |
"learning_rate": 6.200000000000001e-06, | |
"loss": 1.4998, | |
"step": 31 | |
}, | |
{ | |
"epoch": 0.0010448979591836734, | |
"grad_norm": 25.735715866088867, | |
"learning_rate": 6.4000000000000006e-06, | |
"loss": 1.4774, | |
"step": 32 | |
}, | |
{ | |
"epoch": 0.0010775510204081632, | |
"grad_norm": 18.589033126831055, | |
"learning_rate": 6.600000000000001e-06, | |
"loss": 1.3795, | |
"step": 33 | |
}, | |
{ | |
"epoch": 0.0011102040816326532, | |
"grad_norm": 7.6697096824646, | |
"learning_rate": 6.800000000000001e-06, | |
"loss": 1.3513, | |
"step": 34 | |
}, | |
{ | |
"epoch": 0.001142857142857143, | |
"grad_norm": 12.835890769958496, | |
"learning_rate": 7e-06, | |
"loss": 1.2908, | |
"step": 35 | |
}, | |
{ | |
"epoch": 0.0011755102040816327, | |
"grad_norm": 12.289154052734375, | |
"learning_rate": 7.2000000000000005e-06, | |
"loss": 1.2354, | |
"step": 36 | |
}, | |
{ | |
"epoch": 0.0012081632653061225, | |
"grad_norm": 5.951991558074951, | |
"learning_rate": 7.4e-06, | |
"loss": 1.2242, | |
"step": 37 | |
}, | |
{ | |
"epoch": 0.0012408163265306123, | |
"grad_norm": 5.6563825607299805, | |
"learning_rate": 7.600000000000001e-06, | |
"loss": 1.2448, | |
"step": 38 | |
}, | |
{ | |
"epoch": 0.001273469387755102, | |
"grad_norm": 5.884244441986084, | |
"learning_rate": 7.800000000000002e-06, | |
"loss": 1.2011, | |
"step": 39 | |
}, | |
{ | |
"epoch": 0.0013061224489795918, | |
"grad_norm": 4.279225826263428, | |
"learning_rate": 8.000000000000001e-06, | |
"loss": 1.1616, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.0013387755102040816, | |
"grad_norm": 4.273413181304932, | |
"learning_rate": 8.2e-06, | |
"loss": 1.1361, | |
"step": 41 | |
}, | |
{ | |
"epoch": 0.0013714285714285714, | |
"grad_norm": 4.514435768127441, | |
"learning_rate": 8.400000000000001e-06, | |
"loss": 1.1194, | |
"step": 42 | |
}, | |
{ | |
"epoch": 0.0014040816326530612, | |
"grad_norm": 3.1108927726745605, | |
"learning_rate": 8.6e-06, | |
"loss": 1.1359, | |
"step": 43 | |
}, | |
{ | |
"epoch": 0.001436734693877551, | |
"grad_norm": 3.187659740447998, | |
"learning_rate": 8.8e-06, | |
"loss": 1.1088, | |
"step": 44 | |
}, | |
{ | |
"epoch": 0.0014693877551020407, | |
"grad_norm": 3.0588572025299072, | |
"learning_rate": 9e-06, | |
"loss": 1.0367, | |
"step": 45 | |
}, | |
{ | |
"epoch": 0.0015020408163265305, | |
"grad_norm": 3.0041353702545166, | |
"learning_rate": 9.200000000000002e-06, | |
"loss": 1.065, | |
"step": 46 | |
}, | |
{ | |
"epoch": 0.0015346938775510203, | |
"grad_norm": 3.0067152976989746, | |
"learning_rate": 9.4e-06, | |
"loss": 1.106, | |
"step": 47 | |
}, | |
{ | |
"epoch": 0.0015673469387755103, | |
"grad_norm": 2.8196375370025635, | |
"learning_rate": 9.600000000000001e-06, | |
"loss": 1.0463, | |
"step": 48 | |
}, | |
{ | |
"epoch": 0.0016, | |
"grad_norm": 2.7588930130004883, | |
"learning_rate": 9.800000000000001e-06, | |
"loss": 1.0707, | |
"step": 49 | |
}, | |
{ | |
"epoch": 0.0016326530612244899, | |
"grad_norm": 2.776578903198242, | |
"learning_rate": 1e-05, | |
"loss": 1.057, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.0016326530612244899, | |
"eval_loss": 1.0555493831634521, | |
"eval_runtime": 80.9919, | |
"eval_samples_per_second": 1.235, | |
"eval_steps_per_second": 1.235, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.0016653061224489796, | |
"grad_norm": 3.0007307529449463, | |
"learning_rate": 9.999967341606794e-06, | |
"loss": 1.0359, | |
"step": 51 | |
}, | |
{ | |
"epoch": 0.0016979591836734694, | |
"grad_norm": 2.820908308029175, | |
"learning_rate": 9.999934683213587e-06, | |
"loss": 1.0659, | |
"step": 52 | |
}, | |
{ | |
"epoch": 0.0017306122448979592, | |
"grad_norm": 3.070078134536743, | |
"learning_rate": 9.999902024820379e-06, | |
"loss": 1.0676, | |
"step": 53 | |
}, | |
{ | |
"epoch": 0.001763265306122449, | |
"grad_norm": 2.826664686203003, | |
"learning_rate": 9.999869366427172e-06, | |
"loss": 1.0396, | |
"step": 54 | |
}, | |
{ | |
"epoch": 0.0017959183673469388, | |
"grad_norm": 2.8928892612457275, | |
"learning_rate": 9.999836708033965e-06, | |
"loss": 1.0043, | |
"step": 55 | |
}, | |
{ | |
"epoch": 0.0018285714285714285, | |
"grad_norm": 2.9964358806610107, | |
"learning_rate": 9.999804049640759e-06, | |
"loss": 1.0335, | |
"step": 56 | |
}, | |
{ | |
"epoch": 0.0018612244897959183, | |
"grad_norm": 2.796006202697754, | |
"learning_rate": 9.999771391247552e-06, | |
"loss": 0.9906, | |
"step": 57 | |
}, | |
{ | |
"epoch": 0.001893877551020408, | |
"grad_norm": 2.638707399368286, | |
"learning_rate": 9.999738732854345e-06, | |
"loss": 1.0347, | |
"step": 58 | |
}, | |
{ | |
"epoch": 0.0019265306122448979, | |
"grad_norm": 2.6315219402313232, | |
"learning_rate": 9.999706074461137e-06, | |
"loss": 1.0213, | |
"step": 59 | |
}, | |
{ | |
"epoch": 0.0019591836734693877, | |
"grad_norm": 2.963063955307007, | |
"learning_rate": 9.99967341606793e-06, | |
"loss": 0.9669, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.0019918367346938777, | |
"grad_norm": 2.913827657699585, | |
"learning_rate": 9.999640757674723e-06, | |
"loss": 1.013, | |
"step": 61 | |
}, | |
{ | |
"epoch": 0.002024489795918367, | |
"grad_norm": 2.830906391143799, | |
"learning_rate": 9.999608099281516e-06, | |
"loss": 1.0036, | |
"step": 62 | |
}, | |
{ | |
"epoch": 0.002057142857142857, | |
"grad_norm": 2.856300115585327, | |
"learning_rate": 9.99957544088831e-06, | |
"loss": 0.9806, | |
"step": 63 | |
}, | |
{ | |
"epoch": 0.0020897959183673468, | |
"grad_norm": 2.8459157943725586, | |
"learning_rate": 9.999542782495103e-06, | |
"loss": 1.0253, | |
"step": 64 | |
}, | |
{ | |
"epoch": 0.0021224489795918368, | |
"grad_norm": 2.748532772064209, | |
"learning_rate": 9.999510124101896e-06, | |
"loss": 1.0017, | |
"step": 65 | |
}, | |
{ | |
"epoch": 0.0021551020408163263, | |
"grad_norm": 2.910276412963867, | |
"learning_rate": 9.999477465708688e-06, | |
"loss": 0.978, | |
"step": 66 | |
}, | |
{ | |
"epoch": 0.0021877551020408163, | |
"grad_norm": 2.8665144443511963, | |
"learning_rate": 9.99944480731548e-06, | |
"loss": 0.9914, | |
"step": 67 | |
}, | |
{ | |
"epoch": 0.0022204081632653063, | |
"grad_norm": 2.6549315452575684, | |
"learning_rate": 9.999412148922274e-06, | |
"loss": 0.9735, | |
"step": 68 | |
}, | |
{ | |
"epoch": 0.002253061224489796, | |
"grad_norm": 2.7277936935424805, | |
"learning_rate": 9.999379490529067e-06, | |
"loss": 0.9498, | |
"step": 69 | |
}, | |
{ | |
"epoch": 0.002285714285714286, | |
"grad_norm": 2.849747657775879, | |
"learning_rate": 9.99934683213586e-06, | |
"loss": 0.9976, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.0023183673469387754, | |
"grad_norm": 2.8592312335968018, | |
"learning_rate": 9.999314173742652e-06, | |
"loss": 0.9523, | |
"step": 71 | |
}, | |
{ | |
"epoch": 0.0023510204081632654, | |
"grad_norm": 2.7041122913360596, | |
"learning_rate": 9.999281515349445e-06, | |
"loss": 0.9692, | |
"step": 72 | |
}, | |
{ | |
"epoch": 0.002383673469387755, | |
"grad_norm": 2.717122793197632, | |
"learning_rate": 9.999248856956238e-06, | |
"loss": 0.9491, | |
"step": 73 | |
}, | |
{ | |
"epoch": 0.002416326530612245, | |
"grad_norm": 2.516847610473633, | |
"learning_rate": 9.999216198563032e-06, | |
"loss": 0.9632, | |
"step": 74 | |
}, | |
{ | |
"epoch": 0.0024489795918367346, | |
"grad_norm": 2.766266107559204, | |
"learning_rate": 9.999183540169825e-06, | |
"loss": 0.932, | |
"step": 75 | |
}, | |
{ | |
"epoch": 0.0024816326530612246, | |
"grad_norm": 2.8596270084381104, | |
"learning_rate": 9.999150881776616e-06, | |
"loss": 0.9479, | |
"step": 76 | |
}, | |
{ | |
"epoch": 0.002514285714285714, | |
"grad_norm": 2.6034138202667236, | |
"learning_rate": 9.99911822338341e-06, | |
"loss": 0.9712, | |
"step": 77 | |
}, | |
{ | |
"epoch": 0.002546938775510204, | |
"grad_norm": 2.662513256072998, | |
"learning_rate": 9.999085564990203e-06, | |
"loss": 0.9882, | |
"step": 78 | |
}, | |
{ | |
"epoch": 0.0025795918367346937, | |
"grad_norm": 2.900061845779419, | |
"learning_rate": 9.999052906596996e-06, | |
"loss": 0.9232, | |
"step": 79 | |
}, | |
{ | |
"epoch": 0.0026122448979591837, | |
"grad_norm": 2.7503371238708496, | |
"learning_rate": 9.99902024820379e-06, | |
"loss": 0.9483, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.0026448979591836737, | |
"grad_norm": 2.61838960647583, | |
"learning_rate": 9.998987589810583e-06, | |
"loss": 0.9104, | |
"step": 81 | |
}, | |
{ | |
"epoch": 0.0026775510204081632, | |
"grad_norm": 2.775689125061035, | |
"learning_rate": 9.998954931417376e-06, | |
"loss": 0.9145, | |
"step": 82 | |
}, | |
{ | |
"epoch": 0.0027102040816326532, | |
"grad_norm": 2.695878744125366, | |
"learning_rate": 9.998922273024167e-06, | |
"loss": 0.9075, | |
"step": 83 | |
}, | |
{ | |
"epoch": 0.002742857142857143, | |
"grad_norm": 2.748655080795288, | |
"learning_rate": 9.99888961463096e-06, | |
"loss": 0.966, | |
"step": 84 | |
}, | |
{ | |
"epoch": 0.002775510204081633, | |
"grad_norm": 2.5372986793518066, | |
"learning_rate": 9.998856956237754e-06, | |
"loss": 0.9541, | |
"step": 85 | |
}, | |
{ | |
"epoch": 0.0028081632653061224, | |
"grad_norm": 2.6666107177734375, | |
"learning_rate": 9.998824297844547e-06, | |
"loss": 0.9678, | |
"step": 86 | |
}, | |
{ | |
"epoch": 0.0028408163265306124, | |
"grad_norm": 2.450654983520508, | |
"learning_rate": 9.99879163945134e-06, | |
"loss": 0.9404, | |
"step": 87 | |
}, | |
{ | |
"epoch": 0.002873469387755102, | |
"grad_norm": 2.6407573223114014, | |
"learning_rate": 9.998758981058134e-06, | |
"loss": 0.9661, | |
"step": 88 | |
}, | |
{ | |
"epoch": 0.002906122448979592, | |
"grad_norm": 2.589488983154297, | |
"learning_rate": 9.998726322664925e-06, | |
"loss": 0.9528, | |
"step": 89 | |
}, | |
{ | |
"epoch": 0.0029387755102040815, | |
"grad_norm": 3.260467767715454, | |
"learning_rate": 9.998693664271718e-06, | |
"loss": 0.9625, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.0029714285714285715, | |
"grad_norm": 2.8341681957244873, | |
"learning_rate": 9.998661005878512e-06, | |
"loss": 0.9088, | |
"step": 91 | |
}, | |
{ | |
"epoch": 0.003004081632653061, | |
"grad_norm": 3.00934100151062, | |
"learning_rate": 9.998628347485305e-06, | |
"loss": 0.9148, | |
"step": 92 | |
}, | |
{ | |
"epoch": 0.003036734693877551, | |
"grad_norm": 2.8187415599823, | |
"learning_rate": 9.998595689092098e-06, | |
"loss": 0.9028, | |
"step": 93 | |
}, | |
{ | |
"epoch": 0.0030693877551020406, | |
"grad_norm": 2.617946147918701, | |
"learning_rate": 9.99856303069889e-06, | |
"loss": 0.8886, | |
"step": 94 | |
}, | |
{ | |
"epoch": 0.0031020408163265306, | |
"grad_norm": 2.7720468044281006, | |
"learning_rate": 9.998530372305683e-06, | |
"loss": 0.9417, | |
"step": 95 | |
}, | |
{ | |
"epoch": 0.0031346938775510206, | |
"grad_norm": 2.8178319931030273, | |
"learning_rate": 9.998497713912476e-06, | |
"loss": 0.9012, | |
"step": 96 | |
}, | |
{ | |
"epoch": 0.00316734693877551, | |
"grad_norm": 2.9068961143493652, | |
"learning_rate": 9.99846505551927e-06, | |
"loss": 0.9318, | |
"step": 97 | |
}, | |
{ | |
"epoch": 0.0032, | |
"grad_norm": 3.227482557296753, | |
"learning_rate": 9.998432397126063e-06, | |
"loss": 0.8956, | |
"step": 98 | |
}, | |
{ | |
"epoch": 0.0032326530612244897, | |
"grad_norm": 2.8666388988494873, | |
"learning_rate": 9.998399738732854e-06, | |
"loss": 0.9288, | |
"step": 99 | |
}, | |
{ | |
"epoch": 0.0032653061224489797, | |
"grad_norm": 2.861967086791992, | |
"learning_rate": 9.998367080339647e-06, | |
"loss": 0.8923, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.0032653061224489797, | |
"eval_loss": 0.9108777046203613, | |
"eval_runtime": 73.7729, | |
"eval_samples_per_second": 1.356, | |
"eval_steps_per_second": 1.356, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.0032979591836734693, | |
"grad_norm": 3.026766777038574, | |
"learning_rate": 9.99833442194644e-06, | |
"loss": 0.9085, | |
"step": 101 | |
}, | |
{ | |
"epoch": 0.0033306122448979593, | |
"grad_norm": 3.0358951091766357, | |
"learning_rate": 9.998301763553234e-06, | |
"loss": 0.9019, | |
"step": 102 | |
}, | |
{ | |
"epoch": 0.003363265306122449, | |
"grad_norm": 2.7646968364715576, | |
"learning_rate": 9.998269105160027e-06, | |
"loss": 0.8954, | |
"step": 103 | |
}, | |
{ | |
"epoch": 0.003395918367346939, | |
"grad_norm": 2.880887746810913, | |
"learning_rate": 9.99823644676682e-06, | |
"loss": 0.9257, | |
"step": 104 | |
}, | |
{ | |
"epoch": 0.0034285714285714284, | |
"grad_norm": 3.1517140865325928, | |
"learning_rate": 9.998203788373614e-06, | |
"loss": 0.8951, | |
"step": 105 | |
}, | |
{ | |
"epoch": 0.0034612244897959184, | |
"grad_norm": 2.7021565437316895, | |
"learning_rate": 9.998171129980407e-06, | |
"loss": 0.9521, | |
"step": 106 | |
}, | |
{ | |
"epoch": 0.003493877551020408, | |
"grad_norm": 2.860952854156494, | |
"learning_rate": 9.998138471587198e-06, | |
"loss": 0.902, | |
"step": 107 | |
}, | |
{ | |
"epoch": 0.003526530612244898, | |
"grad_norm": 3.0253970623016357, | |
"learning_rate": 9.998105813193992e-06, | |
"loss": 0.8935, | |
"step": 108 | |
}, | |
{ | |
"epoch": 0.003559183673469388, | |
"grad_norm": 2.776489734649658, | |
"learning_rate": 9.998073154800785e-06, | |
"loss": 0.9323, | |
"step": 109 | |
}, | |
{ | |
"epoch": 0.0035918367346938775, | |
"grad_norm": 2.3988196849823, | |
"learning_rate": 9.998040496407578e-06, | |
"loss": 0.8951, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.0036244897959183675, | |
"grad_norm": 2.6600584983825684, | |
"learning_rate": 9.998007838014371e-06, | |
"loss": 0.8913, | |
"step": 111 | |
}, | |
{ | |
"epoch": 0.003657142857142857, | |
"grad_norm": 2.4915781021118164, | |
"learning_rate": 9.997975179621163e-06, | |
"loss": 0.8814, | |
"step": 112 | |
}, | |
{ | |
"epoch": 0.003689795918367347, | |
"grad_norm": 2.7426276206970215, | |
"learning_rate": 9.997942521227956e-06, | |
"loss": 0.9027, | |
"step": 113 | |
}, | |
{ | |
"epoch": 0.0037224489795918366, | |
"grad_norm": 2.80908465385437, | |
"learning_rate": 9.99790986283475e-06, | |
"loss": 0.9033, | |
"step": 114 | |
}, | |
{ | |
"epoch": 0.0037551020408163266, | |
"grad_norm": 2.5127768516540527, | |
"learning_rate": 9.997877204441543e-06, | |
"loss": 0.8886, | |
"step": 115 | |
}, | |
{ | |
"epoch": 0.003787755102040816, | |
"grad_norm": 2.7205052375793457, | |
"learning_rate": 9.997844546048336e-06, | |
"loss": 0.9199, | |
"step": 116 | |
}, | |
{ | |
"epoch": 0.003820408163265306, | |
"grad_norm": 2.6644845008850098, | |
"learning_rate": 9.997811887655127e-06, | |
"loss": 0.8722, | |
"step": 117 | |
}, | |
{ | |
"epoch": 0.0038530612244897957, | |
"grad_norm": 3.05825138092041, | |
"learning_rate": 9.99777922926192e-06, | |
"loss": 0.8926, | |
"step": 118 | |
}, | |
{ | |
"epoch": 0.0038857142857142857, | |
"grad_norm": 2.8610692024230957, | |
"learning_rate": 9.997746570868714e-06, | |
"loss": 0.886, | |
"step": 119 | |
}, | |
{ | |
"epoch": 0.003918367346938775, | |
"grad_norm": 2.8283677101135254, | |
"learning_rate": 9.997713912475507e-06, | |
"loss": 0.8838, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.003951020408163266, | |
"grad_norm": 3.36906361579895, | |
"learning_rate": 9.9976812540823e-06, | |
"loss": 0.8614, | |
"step": 121 | |
}, | |
{ | |
"epoch": 0.003983673469387755, | |
"grad_norm": 2.949343681335449, | |
"learning_rate": 9.997648595689093e-06, | |
"loss": 0.8915, | |
"step": 122 | |
}, | |
{ | |
"epoch": 0.004016326530612245, | |
"grad_norm": 2.986492156982422, | |
"learning_rate": 9.997615937295885e-06, | |
"loss": 0.9053, | |
"step": 123 | |
}, | |
{ | |
"epoch": 0.004048979591836734, | |
"grad_norm": 2.7720727920532227, | |
"learning_rate": 9.997583278902678e-06, | |
"loss": 0.8915, | |
"step": 124 | |
}, | |
{ | |
"epoch": 0.004081632653061225, | |
"grad_norm": 2.5769472122192383, | |
"learning_rate": 9.997550620509471e-06, | |
"loss": 0.8845, | |
"step": 125 | |
}, | |
{ | |
"epoch": 0.004114285714285714, | |
"grad_norm": 2.9634106159210205, | |
"learning_rate": 9.997517962116265e-06, | |
"loss": 0.9057, | |
"step": 126 | |
}, | |
{ | |
"epoch": 0.004146938775510204, | |
"grad_norm": 2.6103193759918213, | |
"learning_rate": 9.997485303723058e-06, | |
"loss": 0.8843, | |
"step": 127 | |
}, | |
{ | |
"epoch": 0.0041795918367346935, | |
"grad_norm": 2.892089366912842, | |
"learning_rate": 9.997452645329851e-06, | |
"loss": 0.8875, | |
"step": 128 | |
}, | |
{ | |
"epoch": 0.004212244897959184, | |
"grad_norm": 3.0502076148986816, | |
"learning_rate": 9.997419986936644e-06, | |
"loss": 0.8654, | |
"step": 129 | |
}, | |
{ | |
"epoch": 0.0042448979591836735, | |
"grad_norm": 2.968538522720337, | |
"learning_rate": 9.997387328543436e-06, | |
"loss": 0.9211, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.004277551020408163, | |
"grad_norm": 2.7077767848968506, | |
"learning_rate": 9.99735467015023e-06, | |
"loss": 0.849, | |
"step": 131 | |
}, | |
{ | |
"epoch": 0.004310204081632653, | |
"grad_norm": 2.8962769508361816, | |
"learning_rate": 9.997322011757022e-06, | |
"loss": 0.8844, | |
"step": 132 | |
}, | |
{ | |
"epoch": 0.004342857142857143, | |
"grad_norm": 2.5692780017852783, | |
"learning_rate": 9.997289353363816e-06, | |
"loss": 0.8494, | |
"step": 133 | |
}, | |
{ | |
"epoch": 0.004375510204081633, | |
"grad_norm": 2.603320837020874, | |
"learning_rate": 9.997256694970609e-06, | |
"loss": 0.8994, | |
"step": 134 | |
}, | |
{ | |
"epoch": 0.004408163265306122, | |
"grad_norm": 2.762920618057251, | |
"learning_rate": 9.9972240365774e-06, | |
"loss": 0.8894, | |
"step": 135 | |
}, | |
{ | |
"epoch": 0.004440816326530613, | |
"grad_norm": 2.7908272743225098, | |
"learning_rate": 9.997191378184194e-06, | |
"loss": 0.888, | |
"step": 136 | |
}, | |
{ | |
"epoch": 0.004473469387755102, | |
"grad_norm": 2.405191421508789, | |
"learning_rate": 9.997158719790987e-06, | |
"loss": 0.8953, | |
"step": 137 | |
}, | |
{ | |
"epoch": 0.004506122448979592, | |
"grad_norm": 3.086392879486084, | |
"learning_rate": 9.99712606139778e-06, | |
"loss": 0.9024, | |
"step": 138 | |
}, | |
{ | |
"epoch": 0.004538775510204081, | |
"grad_norm": 3.10508394241333, | |
"learning_rate": 9.997093403004573e-06, | |
"loss": 0.8636, | |
"step": 139 | |
}, | |
{ | |
"epoch": 0.004571428571428572, | |
"grad_norm": 2.7642314434051514, | |
"learning_rate": 9.997060744611365e-06, | |
"loss": 0.8589, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.004604081632653061, | |
"grad_norm": 2.9966800212860107, | |
"learning_rate": 9.997028086218158e-06, | |
"loss": 0.8564, | |
"step": 141 | |
}, | |
{ | |
"epoch": 0.004636734693877551, | |
"grad_norm": 2.998603582382202, | |
"learning_rate": 9.996995427824951e-06, | |
"loss": 0.8596, | |
"step": 142 | |
}, | |
{ | |
"epoch": 0.0046693877551020405, | |
"grad_norm": 2.815833330154419, | |
"learning_rate": 9.996962769431745e-06, | |
"loss": 0.9006, | |
"step": 143 | |
}, | |
{ | |
"epoch": 0.004702040816326531, | |
"grad_norm": 2.654766798019409, | |
"learning_rate": 9.996930111038538e-06, | |
"loss": 0.8683, | |
"step": 144 | |
}, | |
{ | |
"epoch": 0.0047346938775510205, | |
"grad_norm": 2.9405479431152344, | |
"learning_rate": 9.996897452645331e-06, | |
"loss": 0.8756, | |
"step": 145 | |
}, | |
{ | |
"epoch": 0.00476734693877551, | |
"grad_norm": 3.789085626602173, | |
"learning_rate": 9.996864794252124e-06, | |
"loss": 0.8991, | |
"step": 146 | |
}, | |
{ | |
"epoch": 0.0048, | |
"grad_norm": 3.062678813934326, | |
"learning_rate": 9.996832135858918e-06, | |
"loss": 0.8461, | |
"step": 147 | |
}, | |
{ | |
"epoch": 0.00483265306122449, | |
"grad_norm": 2.656879425048828, | |
"learning_rate": 9.996799477465709e-06, | |
"loss": 0.8755, | |
"step": 148 | |
}, | |
{ | |
"epoch": 0.00486530612244898, | |
"grad_norm": 2.66681170463562, | |
"learning_rate": 9.996766819072502e-06, | |
"loss": 0.8882, | |
"step": 149 | |
}, | |
{ | |
"epoch": 0.004897959183673469, | |
"grad_norm": 2.69744873046875, | |
"learning_rate": 9.996734160679296e-06, | |
"loss": 0.8904, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.004897959183673469, | |
"eval_loss": 0.8796091675758362, | |
"eval_runtime": 74.0605, | |
"eval_samples_per_second": 1.35, | |
"eval_steps_per_second": 1.35, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.0049306122448979596, | |
"grad_norm": 2.9235658645629883, | |
"learning_rate": 9.996701502286089e-06, | |
"loss": 0.819, | |
"step": 151 | |
}, | |
{ | |
"epoch": 0.004963265306122449, | |
"grad_norm": 3.2201597690582275, | |
"learning_rate": 9.996668843892882e-06, | |
"loss": 0.8964, | |
"step": 152 | |
}, | |
{ | |
"epoch": 0.004995918367346939, | |
"grad_norm": 2.7854557037353516, | |
"learning_rate": 9.996636185499674e-06, | |
"loss": 0.8886, | |
"step": 153 | |
}, | |
{ | |
"epoch": 0.005028571428571428, | |
"grad_norm": 2.4900546073913574, | |
"learning_rate": 9.996603527106467e-06, | |
"loss": 0.8496, | |
"step": 154 | |
}, | |
{ | |
"epoch": 0.005061224489795919, | |
"grad_norm": 2.7506489753723145, | |
"learning_rate": 9.99657086871326e-06, | |
"loss": 0.9044, | |
"step": 155 | |
}, | |
{ | |
"epoch": 0.005093877551020408, | |
"grad_norm": 2.8616607189178467, | |
"learning_rate": 9.996538210320053e-06, | |
"loss": 0.8685, | |
"step": 156 | |
}, | |
{ | |
"epoch": 0.005126530612244898, | |
"grad_norm": 2.814704656600952, | |
"learning_rate": 9.996505551926847e-06, | |
"loss": 0.8771, | |
"step": 157 | |
}, | |
{ | |
"epoch": 0.005159183673469387, | |
"grad_norm": 3.006065845489502, | |
"learning_rate": 9.996472893533638e-06, | |
"loss": 0.8571, | |
"step": 158 | |
}, | |
{ | |
"epoch": 0.005191836734693878, | |
"grad_norm": 2.821923017501831, | |
"learning_rate": 9.996440235140431e-06, | |
"loss": 0.8629, | |
"step": 159 | |
}, | |
{ | |
"epoch": 0.005224489795918367, | |
"grad_norm": 3.278881072998047, | |
"learning_rate": 9.996407576747225e-06, | |
"loss": 0.893, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.005257142857142857, | |
"grad_norm": 2.744616985321045, | |
"learning_rate": 9.996374918354018e-06, | |
"loss": 0.8682, | |
"step": 161 | |
}, | |
{ | |
"epoch": 0.005289795918367347, | |
"grad_norm": 2.697544574737549, | |
"learning_rate": 9.996342259960811e-06, | |
"loss": 0.8974, | |
"step": 162 | |
}, | |
{ | |
"epoch": 0.005322448979591837, | |
"grad_norm": 2.9768247604370117, | |
"learning_rate": 9.996309601567604e-06, | |
"loss": 0.8931, | |
"step": 163 | |
}, | |
{ | |
"epoch": 0.0053551020408163265, | |
"grad_norm": 3.027183771133423, | |
"learning_rate": 9.996276943174396e-06, | |
"loss": 0.8897, | |
"step": 164 | |
}, | |
{ | |
"epoch": 0.005387755102040816, | |
"grad_norm": 2.7222235202789307, | |
"learning_rate": 9.996244284781189e-06, | |
"loss": 0.8787, | |
"step": 165 | |
}, | |
{ | |
"epoch": 0.0054204081632653065, | |
"grad_norm": 2.7184181213378906, | |
"learning_rate": 9.996211626387982e-06, | |
"loss": 0.8832, | |
"step": 166 | |
}, | |
{ | |
"epoch": 0.005453061224489796, | |
"grad_norm": 2.8517744541168213, | |
"learning_rate": 9.996178967994776e-06, | |
"loss": 0.8463, | |
"step": 167 | |
}, | |
{ | |
"epoch": 0.005485714285714286, | |
"grad_norm": 3.104189395904541, | |
"learning_rate": 9.996146309601569e-06, | |
"loss": 0.8724, | |
"step": 168 | |
}, | |
{ | |
"epoch": 0.005518367346938775, | |
"grad_norm": 2.9883522987365723, | |
"learning_rate": 9.996113651208362e-06, | |
"loss": 0.8654, | |
"step": 169 | |
}, | |
{ | |
"epoch": 0.005551020408163266, | |
"grad_norm": 3.018421173095703, | |
"learning_rate": 9.996080992815155e-06, | |
"loss": 0.8836, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.005583673469387755, | |
"grad_norm": 2.795041561126709, | |
"learning_rate": 9.996048334421947e-06, | |
"loss": 0.8843, | |
"step": 171 | |
}, | |
{ | |
"epoch": 0.005616326530612245, | |
"grad_norm": 2.594553232192993, | |
"learning_rate": 9.99601567602874e-06, | |
"loss": 0.8874, | |
"step": 172 | |
}, | |
{ | |
"epoch": 0.005648979591836734, | |
"grad_norm": 2.946117877960205, | |
"learning_rate": 9.995983017635533e-06, | |
"loss": 0.8265, | |
"step": 173 | |
}, | |
{ | |
"epoch": 0.005681632653061225, | |
"grad_norm": 3.060215950012207, | |
"learning_rate": 9.995950359242327e-06, | |
"loss": 0.8872, | |
"step": 174 | |
}, | |
{ | |
"epoch": 0.005714285714285714, | |
"grad_norm": 2.5649561882019043, | |
"learning_rate": 9.99591770084912e-06, | |
"loss": 0.898, | |
"step": 175 | |
}, | |
{ | |
"epoch": 0.005746938775510204, | |
"grad_norm": 2.7028987407684326, | |
"learning_rate": 9.995885042455911e-06, | |
"loss": 0.8168, | |
"step": 176 | |
}, | |
{ | |
"epoch": 0.005779591836734694, | |
"grad_norm": 2.918105125427246, | |
"learning_rate": 9.995852384062705e-06, | |
"loss": 0.9006, | |
"step": 177 | |
}, | |
{ | |
"epoch": 0.005812244897959184, | |
"grad_norm": 3.0559136867523193, | |
"learning_rate": 9.995819725669498e-06, | |
"loss": 0.9106, | |
"step": 178 | |
}, | |
{ | |
"epoch": 0.005844897959183673, | |
"grad_norm": 2.7586793899536133, | |
"learning_rate": 9.995787067276291e-06, | |
"loss": 0.8643, | |
"step": 179 | |
}, | |
{ | |
"epoch": 0.005877551020408163, | |
"grad_norm": 2.6476991176605225, | |
"learning_rate": 9.995754408883084e-06, | |
"loss": 0.8952, | |
"step": 180 | |
}, | |
{ | |
"epoch": 0.005910204081632653, | |
"grad_norm": 2.624241352081299, | |
"learning_rate": 9.995721750489876e-06, | |
"loss": 0.8836, | |
"step": 181 | |
}, | |
{ | |
"epoch": 0.005942857142857143, | |
"grad_norm": 3.0197315216064453, | |
"learning_rate": 9.995689092096669e-06, | |
"loss": 0.87, | |
"step": 182 | |
}, | |
{ | |
"epoch": 0.0059755102040816325, | |
"grad_norm": 2.9282002449035645, | |
"learning_rate": 9.995656433703462e-06, | |
"loss": 0.8904, | |
"step": 183 | |
}, | |
{ | |
"epoch": 0.006008163265306122, | |
"grad_norm": 2.6132211685180664, | |
"learning_rate": 9.995623775310255e-06, | |
"loss": 0.8565, | |
"step": 184 | |
}, | |
{ | |
"epoch": 0.0060408163265306125, | |
"grad_norm": 2.6105284690856934, | |
"learning_rate": 9.995591116917049e-06, | |
"loss": 0.8842, | |
"step": 185 | |
}, | |
{ | |
"epoch": 0.006073469387755102, | |
"grad_norm": 2.868211030960083, | |
"learning_rate": 9.995558458523842e-06, | |
"loss": 0.8575, | |
"step": 186 | |
}, | |
{ | |
"epoch": 0.006106122448979592, | |
"grad_norm": 2.7017822265625, | |
"learning_rate": 9.995525800130633e-06, | |
"loss": 0.8429, | |
"step": 187 | |
}, | |
{ | |
"epoch": 0.006138775510204081, | |
"grad_norm": 2.7130422592163086, | |
"learning_rate": 9.995493141737427e-06, | |
"loss": 0.8451, | |
"step": 188 | |
}, | |
{ | |
"epoch": 0.006171428571428572, | |
"grad_norm": 2.681940793991089, | |
"learning_rate": 9.99546048334422e-06, | |
"loss": 0.866, | |
"step": 189 | |
}, | |
{ | |
"epoch": 0.006204081632653061, | |
"grad_norm": 2.6012067794799805, | |
"learning_rate": 9.995427824951013e-06, | |
"loss": 0.8575, | |
"step": 190 | |
}, | |
{ | |
"epoch": 0.006236734693877551, | |
"grad_norm": 2.7982876300811768, | |
"learning_rate": 9.995395166557806e-06, | |
"loss": 0.8818, | |
"step": 191 | |
}, | |
{ | |
"epoch": 0.006269387755102041, | |
"grad_norm": 2.7027719020843506, | |
"learning_rate": 9.9953625081646e-06, | |
"loss": 0.8638, | |
"step": 192 | |
}, | |
{ | |
"epoch": 0.006302040816326531, | |
"grad_norm": 2.6796092987060547, | |
"learning_rate": 9.995329849771393e-06, | |
"loss": 0.8473, | |
"step": 193 | |
}, | |
{ | |
"epoch": 0.00633469387755102, | |
"grad_norm": 2.882390022277832, | |
"learning_rate": 9.995297191378184e-06, | |
"loss": 0.8854, | |
"step": 194 | |
}, | |
{ | |
"epoch": 0.00636734693877551, | |
"grad_norm": 2.970914125442505, | |
"learning_rate": 9.995264532984978e-06, | |
"loss": 0.8499, | |
"step": 195 | |
}, | |
{ | |
"epoch": 0.0064, | |
"grad_norm": 2.7777442932128906, | |
"learning_rate": 9.995231874591771e-06, | |
"loss": 0.8761, | |
"step": 196 | |
}, | |
{ | |
"epoch": 0.00643265306122449, | |
"grad_norm": 2.6736974716186523, | |
"learning_rate": 9.995199216198564e-06, | |
"loss": 0.8645, | |
"step": 197 | |
}, | |
{ | |
"epoch": 0.006465306122448979, | |
"grad_norm": 2.9803805351257324, | |
"learning_rate": 9.995166557805357e-06, | |
"loss": 0.8257, | |
"step": 198 | |
}, | |
{ | |
"epoch": 0.006497959183673469, | |
"grad_norm": 2.7522194385528564, | |
"learning_rate": 9.995133899412149e-06, | |
"loss": 0.8467, | |
"step": 199 | |
}, | |
{ | |
"epoch": 0.006530612244897959, | |
"grad_norm": 2.6975879669189453, | |
"learning_rate": 9.995101241018942e-06, | |
"loss": 0.872, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.006530612244897959, | |
"eval_loss": 0.8632190823554993, | |
"eval_runtime": 74.7709, | |
"eval_samples_per_second": 1.337, | |
"eval_steps_per_second": 1.337, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.006563265306122449, | |
"grad_norm": 2.8385777473449707, | |
"learning_rate": 9.995068582625735e-06, | |
"loss": 0.8592, | |
"step": 201 | |
}, | |
{ | |
"epoch": 0.0065959183673469386, | |
"grad_norm": 2.8501343727111816, | |
"learning_rate": 9.995035924232529e-06, | |
"loss": 0.8516, | |
"step": 202 | |
}, | |
{ | |
"epoch": 0.006628571428571429, | |
"grad_norm": 2.841095447540283, | |
"learning_rate": 9.995003265839322e-06, | |
"loss": 0.8589, | |
"step": 203 | |
}, | |
{ | |
"epoch": 0.0066612244897959185, | |
"grad_norm": 2.7659661769866943, | |
"learning_rate": 9.994970607446113e-06, | |
"loss": 0.8434, | |
"step": 204 | |
}, | |
{ | |
"epoch": 0.006693877551020408, | |
"grad_norm": 2.892019748687744, | |
"learning_rate": 9.994937949052907e-06, | |
"loss": 0.8896, | |
"step": 205 | |
}, | |
{ | |
"epoch": 0.006726530612244898, | |
"grad_norm": 2.7645156383514404, | |
"learning_rate": 9.9949052906597e-06, | |
"loss": 0.8654, | |
"step": 206 | |
}, | |
{ | |
"epoch": 0.006759183673469388, | |
"grad_norm": 2.7840843200683594, | |
"learning_rate": 9.994872632266493e-06, | |
"loss": 0.8558, | |
"step": 207 | |
}, | |
{ | |
"epoch": 0.006791836734693878, | |
"grad_norm": 2.843766212463379, | |
"learning_rate": 9.994839973873286e-06, | |
"loss": 0.8482, | |
"step": 208 | |
}, | |
{ | |
"epoch": 0.006824489795918367, | |
"grad_norm": 2.7724156379699707, | |
"learning_rate": 9.99480731548008e-06, | |
"loss": 0.8664, | |
"step": 209 | |
}, | |
{ | |
"epoch": 0.006857142857142857, | |
"grad_norm": 2.8699095249176025, | |
"learning_rate": 9.994774657086873e-06, | |
"loss": 0.8506, | |
"step": 210 | |
}, | |
{ | |
"epoch": 0.006889795918367347, | |
"grad_norm": 2.988041877746582, | |
"learning_rate": 9.994741998693666e-06, | |
"loss": 0.8781, | |
"step": 211 | |
}, | |
{ | |
"epoch": 0.006922448979591837, | |
"grad_norm": 2.597806930541992, | |
"learning_rate": 9.994709340300458e-06, | |
"loss": 0.8586, | |
"step": 212 | |
}, | |
{ | |
"epoch": 0.006955102040816326, | |
"grad_norm": 2.7888684272766113, | |
"learning_rate": 9.994676681907251e-06, | |
"loss": 0.8431, | |
"step": 213 | |
}, | |
{ | |
"epoch": 0.006987755102040816, | |
"grad_norm": 2.80710768699646, | |
"learning_rate": 9.994644023514044e-06, | |
"loss": 0.8911, | |
"step": 214 | |
}, | |
{ | |
"epoch": 0.007020408163265306, | |
"grad_norm": 2.6957030296325684, | |
"learning_rate": 9.994611365120837e-06, | |
"loss": 0.8626, | |
"step": 215 | |
}, | |
{ | |
"epoch": 0.007053061224489796, | |
"grad_norm": 2.8306329250335693, | |
"learning_rate": 9.99457870672763e-06, | |
"loss": 0.8619, | |
"step": 216 | |
}, | |
{ | |
"epoch": 0.0070857142857142855, | |
"grad_norm": 2.735477924346924, | |
"learning_rate": 9.994546048334422e-06, | |
"loss": 0.8661, | |
"step": 217 | |
}, | |
{ | |
"epoch": 0.007118367346938776, | |
"grad_norm": 2.737856388092041, | |
"learning_rate": 9.994513389941215e-06, | |
"loss": 0.8718, | |
"step": 218 | |
}, | |
{ | |
"epoch": 0.0071510204081632655, | |
"grad_norm": 2.743589401245117, | |
"learning_rate": 9.994480731548009e-06, | |
"loss": 0.8781, | |
"step": 219 | |
}, | |
{ | |
"epoch": 0.007183673469387755, | |
"grad_norm": 2.8569047451019287, | |
"learning_rate": 9.994448073154802e-06, | |
"loss": 0.8512, | |
"step": 220 | |
}, | |
{ | |
"epoch": 0.007216326530612245, | |
"grad_norm": 3.129603385925293, | |
"learning_rate": 9.994415414761595e-06, | |
"loss": 0.8839, | |
"step": 221 | |
}, | |
{ | |
"epoch": 0.007248979591836735, | |
"grad_norm": 2.8460073471069336, | |
"learning_rate": 9.994382756368387e-06, | |
"loss": 0.8568, | |
"step": 222 | |
}, | |
{ | |
"epoch": 0.007281632653061225, | |
"grad_norm": 3.024534225463867, | |
"learning_rate": 9.99435009797518e-06, | |
"loss": 0.8672, | |
"step": 223 | |
}, | |
{ | |
"epoch": 0.007314285714285714, | |
"grad_norm": 3.0231759548187256, | |
"learning_rate": 9.994317439581973e-06, | |
"loss": 0.8899, | |
"step": 224 | |
}, | |
{ | |
"epoch": 0.007346938775510204, | |
"grad_norm": 2.8645284175872803, | |
"learning_rate": 9.994284781188766e-06, | |
"loss": 0.8319, | |
"step": 225 | |
}, | |
{ | |
"epoch": 0.007379591836734694, | |
"grad_norm": 2.741297483444214, | |
"learning_rate": 9.99425212279556e-06, | |
"loss": 0.8726, | |
"step": 226 | |
}, | |
{ | |
"epoch": 0.007412244897959184, | |
"grad_norm": 2.8565313816070557, | |
"learning_rate": 9.994219464402353e-06, | |
"loss": 0.8807, | |
"step": 227 | |
}, | |
{ | |
"epoch": 0.007444897959183673, | |
"grad_norm": 2.9202141761779785, | |
"learning_rate": 9.994186806009144e-06, | |
"loss": 0.8098, | |
"step": 228 | |
}, | |
{ | |
"epoch": 0.007477551020408164, | |
"grad_norm": 2.955820083618164, | |
"learning_rate": 9.994154147615938e-06, | |
"loss": 0.8638, | |
"step": 229 | |
}, | |
{ | |
"epoch": 0.007510204081632653, | |
"grad_norm": 2.7195394039154053, | |
"learning_rate": 9.99412148922273e-06, | |
"loss": 0.8591, | |
"step": 230 | |
}, | |
{ | |
"epoch": 0.007542857142857143, | |
"grad_norm": 2.91768479347229, | |
"learning_rate": 9.994088830829524e-06, | |
"loss": 0.8774, | |
"step": 231 | |
}, | |
{ | |
"epoch": 0.007575510204081632, | |
"grad_norm": 2.7152891159057617, | |
"learning_rate": 9.994056172436317e-06, | |
"loss": 0.8488, | |
"step": 232 | |
}, | |
{ | |
"epoch": 0.007608163265306123, | |
"grad_norm": 2.5756800174713135, | |
"learning_rate": 9.99402351404311e-06, | |
"loss": 0.8598, | |
"step": 233 | |
}, | |
{ | |
"epoch": 0.007640816326530612, | |
"grad_norm": 2.603003978729248, | |
"learning_rate": 9.993990855649904e-06, | |
"loss": 0.8473, | |
"step": 234 | |
}, | |
{ | |
"epoch": 0.007673469387755102, | |
"grad_norm": 2.758294105529785, | |
"learning_rate": 9.993958197256695e-06, | |
"loss": 0.8624, | |
"step": 235 | |
}, | |
{ | |
"epoch": 0.0077061224489795915, | |
"grad_norm": 2.8660902976989746, | |
"learning_rate": 9.993925538863489e-06, | |
"loss": 0.8564, | |
"step": 236 | |
}, | |
{ | |
"epoch": 0.007738775510204082, | |
"grad_norm": 2.8767125606536865, | |
"learning_rate": 9.993892880470282e-06, | |
"loss": 0.8578, | |
"step": 237 | |
}, | |
{ | |
"epoch": 0.0077714285714285715, | |
"grad_norm": 2.906562089920044, | |
"learning_rate": 9.993860222077075e-06, | |
"loss": 0.8383, | |
"step": 238 | |
}, | |
{ | |
"epoch": 0.007804081632653061, | |
"grad_norm": 2.74647855758667, | |
"learning_rate": 9.993827563683868e-06, | |
"loss": 0.8656, | |
"step": 239 | |
}, | |
{ | |
"epoch": 0.00783673469387755, | |
"grad_norm": 3.078082323074341, | |
"learning_rate": 9.99379490529066e-06, | |
"loss": 0.8244, | |
"step": 240 | |
}, | |
{ | |
"epoch": 0.00786938775510204, | |
"grad_norm": 2.7712790966033936, | |
"learning_rate": 9.993762246897453e-06, | |
"loss": 0.8536, | |
"step": 241 | |
}, | |
{ | |
"epoch": 0.007902040816326531, | |
"grad_norm": 2.852389335632324, | |
"learning_rate": 9.993729588504246e-06, | |
"loss": 0.8496, | |
"step": 242 | |
}, | |
{ | |
"epoch": 0.007934693877551021, | |
"grad_norm": 2.7907660007476807, | |
"learning_rate": 9.99369693011104e-06, | |
"loss": 0.8993, | |
"step": 243 | |
}, | |
{ | |
"epoch": 0.00796734693877551, | |
"grad_norm": 2.8219473361968994, | |
"learning_rate": 9.993664271717833e-06, | |
"loss": 0.8652, | |
"step": 244 | |
}, | |
{ | |
"epoch": 0.008, | |
"grad_norm": 2.6678013801574707, | |
"learning_rate": 9.993631613324624e-06, | |
"loss": 0.8636, | |
"step": 245 | |
}, | |
{ | |
"epoch": 0.00803265306122449, | |
"grad_norm": 2.724881887435913, | |
"learning_rate": 9.993598954931417e-06, | |
"loss": 0.8559, | |
"step": 246 | |
}, | |
{ | |
"epoch": 0.00806530612244898, | |
"grad_norm": 2.8275516033172607, | |
"learning_rate": 9.99356629653821e-06, | |
"loss": 0.8646, | |
"step": 247 | |
}, | |
{ | |
"epoch": 0.008097959183673469, | |
"grad_norm": 2.8110673427581787, | |
"learning_rate": 9.993533638145004e-06, | |
"loss": 0.8713, | |
"step": 248 | |
}, | |
{ | |
"epoch": 0.008130612244897958, | |
"grad_norm": 2.652221202850342, | |
"learning_rate": 9.993500979751797e-06, | |
"loss": 0.8373, | |
"step": 249 | |
}, | |
{ | |
"epoch": 0.00816326530612245, | |
"grad_norm": 2.6357979774475098, | |
"learning_rate": 9.99346832135859e-06, | |
"loss": 0.8537, | |
"step": 250 | |
}, | |
{ | |
"epoch": 0.00816326530612245, | |
"eval_loss": 0.8545005917549133, | |
"eval_runtime": 74.0963, | |
"eval_samples_per_second": 1.35, | |
"eval_steps_per_second": 1.35, | |
"step": 250 | |
}, | |
{ | |
"epoch": 0.00819591836734694, | |
"grad_norm": 2.6132781505584717, | |
"learning_rate": 9.993435662965384e-06, | |
"loss": 0.8592, | |
"step": 251 | |
}, | |
{ | |
"epoch": 0.008228571428571429, | |
"grad_norm": 2.942228078842163, | |
"learning_rate": 9.993403004572177e-06, | |
"loss": 0.7732, | |
"step": 252 | |
}, | |
{ | |
"epoch": 0.008261224489795918, | |
"grad_norm": 2.5452983379364014, | |
"learning_rate": 9.993370346178968e-06, | |
"loss": 0.8666, | |
"step": 253 | |
}, | |
{ | |
"epoch": 0.008293877551020408, | |
"grad_norm": 2.863976240158081, | |
"learning_rate": 9.993337687785762e-06, | |
"loss": 0.8232, | |
"step": 254 | |
}, | |
{ | |
"epoch": 0.008326530612244898, | |
"grad_norm": 2.640972137451172, | |
"learning_rate": 9.993305029392555e-06, | |
"loss": 0.8703, | |
"step": 255 | |
}, | |
{ | |
"epoch": 0.008359183673469387, | |
"grad_norm": 2.98362135887146, | |
"learning_rate": 9.993272370999348e-06, | |
"loss": 0.8165, | |
"step": 256 | |
}, | |
{ | |
"epoch": 0.008391836734693878, | |
"grad_norm": 2.848294258117676, | |
"learning_rate": 9.993239712606141e-06, | |
"loss": 0.9104, | |
"step": 257 | |
}, | |
{ | |
"epoch": 0.008424489795918368, | |
"grad_norm": 2.7992308139801025, | |
"learning_rate": 9.993207054212933e-06, | |
"loss": 0.8146, | |
"step": 258 | |
}, | |
{ | |
"epoch": 0.008457142857142858, | |
"grad_norm": 2.905052900314331, | |
"learning_rate": 9.993174395819726e-06, | |
"loss": 0.8606, | |
"step": 259 | |
}, | |
{ | |
"epoch": 0.008489795918367347, | |
"grad_norm": 2.97420334815979, | |
"learning_rate": 9.99314173742652e-06, | |
"loss": 0.7909, | |
"step": 260 | |
}, | |
{ | |
"epoch": 0.008522448979591837, | |
"grad_norm": 2.7139410972595215, | |
"learning_rate": 9.993109079033313e-06, | |
"loss": 0.8593, | |
"step": 261 | |
}, | |
{ | |
"epoch": 0.008555102040816326, | |
"grad_norm": 2.7178266048431396, | |
"learning_rate": 9.993076420640106e-06, | |
"loss": 0.8785, | |
"step": 262 | |
}, | |
{ | |
"epoch": 0.008587755102040816, | |
"grad_norm": 2.6858327388763428, | |
"learning_rate": 9.993043762246897e-06, | |
"loss": 0.846, | |
"step": 263 | |
}, | |
{ | |
"epoch": 0.008620408163265305, | |
"grad_norm": 2.813204288482666, | |
"learning_rate": 9.99301110385369e-06, | |
"loss": 0.8386, | |
"step": 264 | |
}, | |
{ | |
"epoch": 0.008653061224489797, | |
"grad_norm": 2.795591115951538, | |
"learning_rate": 9.992978445460484e-06, | |
"loss": 0.8417, | |
"step": 265 | |
}, | |
{ | |
"epoch": 0.008685714285714286, | |
"grad_norm": 2.8094005584716797, | |
"learning_rate": 9.992945787067277e-06, | |
"loss": 0.8396, | |
"step": 266 | |
}, | |
{ | |
"epoch": 0.008718367346938776, | |
"grad_norm": 2.723392963409424, | |
"learning_rate": 9.99291312867407e-06, | |
"loss": 0.8357, | |
"step": 267 | |
}, | |
{ | |
"epoch": 0.008751020408163265, | |
"grad_norm": 2.9623281955718994, | |
"learning_rate": 9.992880470280864e-06, | |
"loss": 0.8874, | |
"step": 268 | |
}, | |
{ | |
"epoch": 0.008783673469387755, | |
"grad_norm": 2.8696374893188477, | |
"learning_rate": 9.992847811887655e-06, | |
"loss": 0.8595, | |
"step": 269 | |
}, | |
{ | |
"epoch": 0.008816326530612244, | |
"grad_norm": 2.906726837158203, | |
"learning_rate": 9.992815153494448e-06, | |
"loss": 0.8736, | |
"step": 270 | |
}, | |
{ | |
"epoch": 0.008848979591836734, | |
"grad_norm": 2.814854383468628, | |
"learning_rate": 9.992782495101242e-06, | |
"loss": 0.8603, | |
"step": 271 | |
}, | |
{ | |
"epoch": 0.008881632653061225, | |
"grad_norm": 2.799457550048828, | |
"learning_rate": 9.992749836708035e-06, | |
"loss": 0.8469, | |
"step": 272 | |
}, | |
{ | |
"epoch": 0.008914285714285715, | |
"grad_norm": 2.755631923675537, | |
"learning_rate": 9.992717178314828e-06, | |
"loss": 0.8702, | |
"step": 273 | |
}, | |
{ | |
"epoch": 0.008946938775510204, | |
"grad_norm": 2.9200055599212646, | |
"learning_rate": 9.992684519921621e-06, | |
"loss": 0.8407, | |
"step": 274 | |
}, | |
{ | |
"epoch": 0.008979591836734694, | |
"grad_norm": 3.2251176834106445, | |
"learning_rate": 9.992651861528415e-06, | |
"loss": 0.8572, | |
"step": 275 | |
}, | |
{ | |
"epoch": 0.009012244897959184, | |
"grad_norm": 3.8850901126861572, | |
"learning_rate": 9.992619203135206e-06, | |
"loss": 0.8458, | |
"step": 276 | |
}, | |
{ | |
"epoch": 0.009044897959183673, | |
"grad_norm": 2.8082637786865234, | |
"learning_rate": 9.992586544742e-06, | |
"loss": 0.8415, | |
"step": 277 | |
}, | |
{ | |
"epoch": 0.009077551020408163, | |
"grad_norm": 2.8923559188842773, | |
"learning_rate": 9.992553886348793e-06, | |
"loss": 0.8116, | |
"step": 278 | |
}, | |
{ | |
"epoch": 0.009110204081632652, | |
"grad_norm": 2.849003791809082, | |
"learning_rate": 9.992521227955586e-06, | |
"loss": 0.834, | |
"step": 279 | |
}, | |
{ | |
"epoch": 0.009142857142857144, | |
"grad_norm": 2.7201125621795654, | |
"learning_rate": 9.992488569562379e-06, | |
"loss": 0.8518, | |
"step": 280 | |
}, | |
{ | |
"epoch": 0.009175510204081633, | |
"grad_norm": 2.9798922538757324, | |
"learning_rate": 9.99245591116917e-06, | |
"loss": 0.8691, | |
"step": 281 | |
}, | |
{ | |
"epoch": 0.009208163265306123, | |
"grad_norm": 2.805250406265259, | |
"learning_rate": 9.992423252775964e-06, | |
"loss": 0.8537, | |
"step": 282 | |
}, | |
{ | |
"epoch": 0.009240816326530612, | |
"grad_norm": 2.6364924907684326, | |
"learning_rate": 9.992390594382757e-06, | |
"loss": 0.8368, | |
"step": 283 | |
}, | |
{ | |
"epoch": 0.009273469387755102, | |
"grad_norm": 2.692296028137207, | |
"learning_rate": 9.99235793598955e-06, | |
"loss": 0.8373, | |
"step": 284 | |
}, | |
{ | |
"epoch": 0.009306122448979591, | |
"grad_norm": 2.876549005508423, | |
"learning_rate": 9.992325277596344e-06, | |
"loss": 0.8348, | |
"step": 285 | |
}, | |
{ | |
"epoch": 0.009338775510204081, | |
"grad_norm": 2.77553129196167, | |
"learning_rate": 9.992292619203135e-06, | |
"loss": 0.8377, | |
"step": 286 | |
}, | |
{ | |
"epoch": 0.009371428571428572, | |
"grad_norm": 2.900949478149414, | |
"learning_rate": 9.992259960809928e-06, | |
"loss": 0.8469, | |
"step": 287 | |
}, | |
{ | |
"epoch": 0.009404081632653062, | |
"grad_norm": 2.99700927734375, | |
"learning_rate": 9.992227302416722e-06, | |
"loss": 0.8926, | |
"step": 288 | |
}, | |
{ | |
"epoch": 0.009436734693877551, | |
"grad_norm": 2.738802194595337, | |
"learning_rate": 9.992194644023515e-06, | |
"loss": 0.8511, | |
"step": 289 | |
}, | |
{ | |
"epoch": 0.009469387755102041, | |
"grad_norm": 2.7471158504486084, | |
"learning_rate": 9.992161985630308e-06, | |
"loss": 0.8449, | |
"step": 290 | |
}, | |
{ | |
"epoch": 0.00950204081632653, | |
"grad_norm": 2.7274839878082275, | |
"learning_rate": 9.992129327237101e-06, | |
"loss": 0.8308, | |
"step": 291 | |
}, | |
{ | |
"epoch": 0.00953469387755102, | |
"grad_norm": 2.554326057434082, | |
"learning_rate": 9.992096668843893e-06, | |
"loss": 0.868, | |
"step": 292 | |
}, | |
{ | |
"epoch": 0.00956734693877551, | |
"grad_norm": 2.578237295150757, | |
"learning_rate": 9.992064010450688e-06, | |
"loss": 0.8557, | |
"step": 293 | |
}, | |
{ | |
"epoch": 0.0096, | |
"grad_norm": 3.0283281803131104, | |
"learning_rate": 9.99203135205748e-06, | |
"loss": 0.8289, | |
"step": 294 | |
}, | |
{ | |
"epoch": 0.00963265306122449, | |
"grad_norm": 2.785170316696167, | |
"learning_rate": 9.991998693664272e-06, | |
"loss": 0.878, | |
"step": 295 | |
}, | |
{ | |
"epoch": 0.00966530612244898, | |
"grad_norm": 2.8442277908325195, | |
"learning_rate": 9.991966035271066e-06, | |
"loss": 0.8282, | |
"step": 296 | |
}, | |
{ | |
"epoch": 0.00969795918367347, | |
"grad_norm": 2.847158908843994, | |
"learning_rate": 9.991933376877859e-06, | |
"loss": 0.844, | |
"step": 297 | |
}, | |
{ | |
"epoch": 0.00973061224489796, | |
"grad_norm": 2.6960904598236084, | |
"learning_rate": 9.991900718484652e-06, | |
"loss": 0.863, | |
"step": 298 | |
}, | |
{ | |
"epoch": 0.009763265306122449, | |
"grad_norm": 2.97739839553833, | |
"learning_rate": 9.991868060091444e-06, | |
"loss": 0.8448, | |
"step": 299 | |
}, | |
{ | |
"epoch": 0.009795918367346938, | |
"grad_norm": 2.9316720962524414, | |
"learning_rate": 9.991835401698237e-06, | |
"loss": 0.8512, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.009795918367346938, | |
"eval_loss": 0.8451775312423706, | |
"eval_runtime": 73.1659, | |
"eval_samples_per_second": 1.367, | |
"eval_steps_per_second": 1.367, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.009828571428571428, | |
"grad_norm": 3.2117464542388916, | |
"learning_rate": 9.99180274330503e-06, | |
"loss": 0.852, | |
"step": 301 | |
}, | |
{ | |
"epoch": 0.009861224489795919, | |
"grad_norm": 2.7478814125061035, | |
"learning_rate": 9.991770084911823e-06, | |
"loss": 0.8454, | |
"step": 302 | |
}, | |
{ | |
"epoch": 0.009893877551020409, | |
"grad_norm": 2.7774980068206787, | |
"learning_rate": 9.991737426518617e-06, | |
"loss": 0.8581, | |
"step": 303 | |
}, | |
{ | |
"epoch": 0.009926530612244898, | |
"grad_norm": 2.691710948944092, | |
"learning_rate": 9.991704768125408e-06, | |
"loss": 0.8483, | |
"step": 304 | |
}, | |
{ | |
"epoch": 0.009959183673469388, | |
"grad_norm": 2.803921937942505, | |
"learning_rate": 9.991672109732201e-06, | |
"loss": 0.8394, | |
"step": 305 | |
}, | |
{ | |
"epoch": 0.009991836734693877, | |
"grad_norm": 2.9755606651306152, | |
"learning_rate": 9.991639451338995e-06, | |
"loss": 0.8245, | |
"step": 306 | |
}, | |
{ | |
"epoch": 0.010024489795918367, | |
"grad_norm": 3.09531569480896, | |
"learning_rate": 9.991606792945788e-06, | |
"loss": 0.8701, | |
"step": 307 | |
}, | |
{ | |
"epoch": 0.010057142857142857, | |
"grad_norm": 2.8512625694274902, | |
"learning_rate": 9.991574134552581e-06, | |
"loss": 0.8597, | |
"step": 308 | |
}, | |
{ | |
"epoch": 0.010089795918367346, | |
"grad_norm": 2.7423555850982666, | |
"learning_rate": 9.991541476159374e-06, | |
"loss": 0.8116, | |
"step": 309 | |
}, | |
{ | |
"epoch": 0.010122448979591837, | |
"grad_norm": 2.8539059162139893, | |
"learning_rate": 9.991508817766166e-06, | |
"loss": 0.841, | |
"step": 310 | |
}, | |
{ | |
"epoch": 0.010155102040816327, | |
"grad_norm": 2.843325138092041, | |
"learning_rate": 9.99147615937296e-06, | |
"loss": 0.8491, | |
"step": 311 | |
}, | |
{ | |
"epoch": 0.010187755102040816, | |
"grad_norm": 2.6367223262786865, | |
"learning_rate": 9.991443500979752e-06, | |
"loss": 0.8452, | |
"step": 312 | |
}, | |
{ | |
"epoch": 0.010220408163265306, | |
"grad_norm": 2.6435811519622803, | |
"learning_rate": 9.991410842586546e-06, | |
"loss": 0.8415, | |
"step": 313 | |
}, | |
{ | |
"epoch": 0.010253061224489796, | |
"grad_norm": 2.727060317993164, | |
"learning_rate": 9.991378184193339e-06, | |
"loss": 0.7985, | |
"step": 314 | |
}, | |
{ | |
"epoch": 0.010285714285714285, | |
"grad_norm": 2.9816014766693115, | |
"learning_rate": 9.991345525800132e-06, | |
"loss": 0.8129, | |
"step": 315 | |
}, | |
{ | |
"epoch": 0.010318367346938775, | |
"grad_norm": 3.3140499591827393, | |
"learning_rate": 9.991312867406925e-06, | |
"loss": 0.8822, | |
"step": 316 | |
}, | |
{ | |
"epoch": 0.010351020408163266, | |
"grad_norm": 2.8612375259399414, | |
"learning_rate": 9.991280209013717e-06, | |
"loss": 0.7966, | |
"step": 317 | |
}, | |
{ | |
"epoch": 0.010383673469387756, | |
"grad_norm": 2.692972183227539, | |
"learning_rate": 9.99124755062051e-06, | |
"loss": 0.8288, | |
"step": 318 | |
}, | |
{ | |
"epoch": 0.010416326530612245, | |
"grad_norm": 2.980607271194458, | |
"learning_rate": 9.991214892227303e-06, | |
"loss": 0.8238, | |
"step": 319 | |
}, | |
{ | |
"epoch": 0.010448979591836735, | |
"grad_norm": 3.0936498641967773, | |
"learning_rate": 9.991182233834097e-06, | |
"loss": 0.8338, | |
"step": 320 | |
}, | |
{ | |
"epoch": 0.010481632653061224, | |
"grad_norm": 2.7482964992523193, | |
"learning_rate": 9.99114957544089e-06, | |
"loss": 0.8511, | |
"step": 321 | |
}, | |
{ | |
"epoch": 0.010514285714285714, | |
"grad_norm": 2.6563820838928223, | |
"learning_rate": 9.991116917047681e-06, | |
"loss": 0.8568, | |
"step": 322 | |
}, | |
{ | |
"epoch": 0.010546938775510203, | |
"grad_norm": 2.7284178733825684, | |
"learning_rate": 9.991084258654475e-06, | |
"loss": 0.8615, | |
"step": 323 | |
}, | |
{ | |
"epoch": 0.010579591836734695, | |
"grad_norm": 2.7720155715942383, | |
"learning_rate": 9.991051600261268e-06, | |
"loss": 0.8459, | |
"step": 324 | |
}, | |
{ | |
"epoch": 0.010612244897959184, | |
"grad_norm": 2.6759071350097656, | |
"learning_rate": 9.991018941868061e-06, | |
"loss": 0.8489, | |
"step": 325 | |
}, | |
{ | |
"epoch": 0.010644897959183674, | |
"grad_norm": 2.8001279830932617, | |
"learning_rate": 9.990986283474854e-06, | |
"loss": 0.83, | |
"step": 326 | |
}, | |
{ | |
"epoch": 0.010677551020408163, | |
"grad_norm": 2.7164299488067627, | |
"learning_rate": 9.990953625081646e-06, | |
"loss": 0.8059, | |
"step": 327 | |
}, | |
{ | |
"epoch": 0.010710204081632653, | |
"grad_norm": 2.833035707473755, | |
"learning_rate": 9.990920966688439e-06, | |
"loss": 0.8394, | |
"step": 328 | |
}, | |
{ | |
"epoch": 0.010742857142857143, | |
"grad_norm": 2.6928770542144775, | |
"learning_rate": 9.990888308295232e-06, | |
"loss": 0.8297, | |
"step": 329 | |
}, | |
{ | |
"epoch": 0.010775510204081632, | |
"grad_norm": 2.7575488090515137, | |
"learning_rate": 9.990855649902026e-06, | |
"loss": 0.8149, | |
"step": 330 | |
}, | |
{ | |
"epoch": 0.010808163265306122, | |
"grad_norm": 2.7353413105010986, | |
"learning_rate": 9.990822991508819e-06, | |
"loss": 0.839, | |
"step": 331 | |
}, | |
{ | |
"epoch": 0.010840816326530613, | |
"grad_norm": 2.7132623195648193, | |
"learning_rate": 9.990790333115612e-06, | |
"loss": 0.8452, | |
"step": 332 | |
}, | |
{ | |
"epoch": 0.010873469387755103, | |
"grad_norm": 2.8830623626708984, | |
"learning_rate": 9.990757674722404e-06, | |
"loss": 0.8156, | |
"step": 333 | |
}, | |
{ | |
"epoch": 0.010906122448979592, | |
"grad_norm": 2.7021992206573486, | |
"learning_rate": 9.990725016329197e-06, | |
"loss": 0.8368, | |
"step": 334 | |
}, | |
{ | |
"epoch": 0.010938775510204082, | |
"grad_norm": 2.6805949211120605, | |
"learning_rate": 9.99069235793599e-06, | |
"loss": 0.8402, | |
"step": 335 | |
}, | |
{ | |
"epoch": 0.010971428571428571, | |
"grad_norm": 3.0374648571014404, | |
"learning_rate": 9.990659699542783e-06, | |
"loss": 0.8332, | |
"step": 336 | |
}, | |
{ | |
"epoch": 0.01100408163265306, | |
"grad_norm": 2.9531807899475098, | |
"learning_rate": 9.990627041149577e-06, | |
"loss": 0.8451, | |
"step": 337 | |
}, | |
{ | |
"epoch": 0.01103673469387755, | |
"grad_norm": 2.8179969787597656, | |
"learning_rate": 9.99059438275637e-06, | |
"loss": 0.8336, | |
"step": 338 | |
}, | |
{ | |
"epoch": 0.011069387755102042, | |
"grad_norm": 3.139448642730713, | |
"learning_rate": 9.990561724363163e-06, | |
"loss": 0.865, | |
"step": 339 | |
}, | |
{ | |
"epoch": 0.011102040816326531, | |
"grad_norm": 3.040363073348999, | |
"learning_rate": 9.990529065969955e-06, | |
"loss": 0.8387, | |
"step": 340 | |
}, | |
{ | |
"epoch": 0.01113469387755102, | |
"grad_norm": 2.700171709060669, | |
"learning_rate": 9.990496407576748e-06, | |
"loss": 0.8219, | |
"step": 341 | |
}, | |
{ | |
"epoch": 0.01116734693877551, | |
"grad_norm": 2.7326748371124268, | |
"learning_rate": 9.990463749183541e-06, | |
"loss": 0.7956, | |
"step": 342 | |
}, | |
{ | |
"epoch": 0.0112, | |
"grad_norm": 2.7744908332824707, | |
"learning_rate": 9.990431090790334e-06, | |
"loss": 0.8133, | |
"step": 343 | |
}, | |
{ | |
"epoch": 0.01123265306122449, | |
"grad_norm": 2.8565139770507812, | |
"learning_rate": 9.990398432397128e-06, | |
"loss": 0.825, | |
"step": 344 | |
}, | |
{ | |
"epoch": 0.011265306122448979, | |
"grad_norm": 2.9909799098968506, | |
"learning_rate": 9.990365774003919e-06, | |
"loss": 0.8683, | |
"step": 345 | |
}, | |
{ | |
"epoch": 0.011297959183673469, | |
"grad_norm": 2.642660140991211, | |
"learning_rate": 9.990333115610712e-06, | |
"loss": 0.8598, | |
"step": 346 | |
}, | |
{ | |
"epoch": 0.01133061224489796, | |
"grad_norm": 2.7017788887023926, | |
"learning_rate": 9.990300457217506e-06, | |
"loss": 0.8164, | |
"step": 347 | |
}, | |
{ | |
"epoch": 0.01136326530612245, | |
"grad_norm": 2.7391109466552734, | |
"learning_rate": 9.990267798824299e-06, | |
"loss": 0.8498, | |
"step": 348 | |
}, | |
{ | |
"epoch": 0.011395918367346939, | |
"grad_norm": 2.7465527057647705, | |
"learning_rate": 9.990235140431092e-06, | |
"loss": 0.8225, | |
"step": 349 | |
}, | |
{ | |
"epoch": 0.011428571428571429, | |
"grad_norm": 2.6583974361419678, | |
"learning_rate": 9.990202482037884e-06, | |
"loss": 0.8505, | |
"step": 350 | |
}, | |
{ | |
"epoch": 0.011428571428571429, | |
"eval_loss": 0.8395382165908813, | |
"eval_runtime": 73.8611, | |
"eval_samples_per_second": 1.354, | |
"eval_steps_per_second": 1.354, | |
"step": 350 | |
}, | |
{ | |
"epoch": 0.011461224489795918, | |
"grad_norm": 2.7981696128845215, | |
"learning_rate": 9.990169823644677e-06, | |
"loss": 0.8312, | |
"step": 351 | |
}, | |
{ | |
"epoch": 0.011493877551020408, | |
"grad_norm": 2.924536943435669, | |
"learning_rate": 9.99013716525147e-06, | |
"loss": 0.8498, | |
"step": 352 | |
}, | |
{ | |
"epoch": 0.011526530612244897, | |
"grad_norm": 2.819368362426758, | |
"learning_rate": 9.990104506858263e-06, | |
"loss": 0.8022, | |
"step": 353 | |
}, | |
{ | |
"epoch": 0.011559183673469389, | |
"grad_norm": 2.9508090019226074, | |
"learning_rate": 9.990071848465056e-06, | |
"loss": 0.8363, | |
"step": 354 | |
}, | |
{ | |
"epoch": 0.011591836734693878, | |
"grad_norm": 2.643820285797119, | |
"learning_rate": 9.99003919007185e-06, | |
"loss": 0.8172, | |
"step": 355 | |
}, | |
{ | |
"epoch": 0.011624489795918368, | |
"grad_norm": 2.5882935523986816, | |
"learning_rate": 9.990006531678641e-06, | |
"loss": 0.8095, | |
"step": 356 | |
}, | |
{ | |
"epoch": 0.011657142857142857, | |
"grad_norm": 2.968686103820801, | |
"learning_rate": 9.989973873285436e-06, | |
"loss": 0.8673, | |
"step": 357 | |
}, | |
{ | |
"epoch": 0.011689795918367347, | |
"grad_norm": 2.8162810802459717, | |
"learning_rate": 9.989941214892228e-06, | |
"loss": 0.8693, | |
"step": 358 | |
}, | |
{ | |
"epoch": 0.011722448979591836, | |
"grad_norm": 2.8215367794036865, | |
"learning_rate": 9.989908556499021e-06, | |
"loss": 0.8053, | |
"step": 359 | |
}, | |
{ | |
"epoch": 0.011755102040816326, | |
"grad_norm": 2.7967352867126465, | |
"learning_rate": 9.989875898105814e-06, | |
"loss": 0.8094, | |
"step": 360 | |
}, | |
{ | |
"epoch": 0.011787755102040815, | |
"grad_norm": 2.873476505279541, | |
"learning_rate": 9.989843239712607e-06, | |
"loss": 0.8174, | |
"step": 361 | |
}, | |
{ | |
"epoch": 0.011820408163265307, | |
"grad_norm": 2.8655476570129395, | |
"learning_rate": 9.9898105813194e-06, | |
"loss": 0.7988, | |
"step": 362 | |
}, | |
{ | |
"epoch": 0.011853061224489796, | |
"grad_norm": 2.794279098510742, | |
"learning_rate": 9.989777922926192e-06, | |
"loss": 0.8404, | |
"step": 363 | |
}, | |
{ | |
"epoch": 0.011885714285714286, | |
"grad_norm": 2.8534135818481445, | |
"learning_rate": 9.989745264532985e-06, | |
"loss": 0.8474, | |
"step": 364 | |
}, | |
{ | |
"epoch": 0.011918367346938775, | |
"grad_norm": 2.716606855392456, | |
"learning_rate": 9.989712606139779e-06, | |
"loss": 0.8256, | |
"step": 365 | |
}, | |
{ | |
"epoch": 0.011951020408163265, | |
"grad_norm": 2.8670265674591064, | |
"learning_rate": 9.989679947746572e-06, | |
"loss": 0.834, | |
"step": 366 | |
}, | |
{ | |
"epoch": 0.011983673469387755, | |
"grad_norm": 2.9551408290863037, | |
"learning_rate": 9.989647289353365e-06, | |
"loss": 0.8295, | |
"step": 367 | |
}, | |
{ | |
"epoch": 0.012016326530612244, | |
"grad_norm": 3.209449529647827, | |
"learning_rate": 9.989614630960157e-06, | |
"loss": 0.8167, | |
"step": 368 | |
}, | |
{ | |
"epoch": 0.012048979591836735, | |
"grad_norm": 2.8796558380126953, | |
"learning_rate": 9.98958197256695e-06, | |
"loss": 0.8245, | |
"step": 369 | |
}, | |
{ | |
"epoch": 0.012081632653061225, | |
"grad_norm": 2.9785306453704834, | |
"learning_rate": 9.989549314173743e-06, | |
"loss": 0.7946, | |
"step": 370 | |
}, | |
{ | |
"epoch": 0.012114285714285715, | |
"grad_norm": 2.8382294178009033, | |
"learning_rate": 9.989516655780536e-06, | |
"loss": 0.8392, | |
"step": 371 | |
}, | |
{ | |
"epoch": 0.012146938775510204, | |
"grad_norm": 3.2182369232177734, | |
"learning_rate": 9.98948399738733e-06, | |
"loss": 0.8343, | |
"step": 372 | |
}, | |
{ | |
"epoch": 0.012179591836734694, | |
"grad_norm": 2.9380509853363037, | |
"learning_rate": 9.989451338994123e-06, | |
"loss": 0.8417, | |
"step": 373 | |
}, | |
{ | |
"epoch": 0.012212244897959183, | |
"grad_norm": 2.6060733795166016, | |
"learning_rate": 9.989418680600914e-06, | |
"loss": 0.8471, | |
"step": 374 | |
}, | |
{ | |
"epoch": 0.012244897959183673, | |
"grad_norm": 3.0596020221710205, | |
"learning_rate": 9.989386022207708e-06, | |
"loss": 0.8408, | |
"step": 375 | |
}, | |
{ | |
"epoch": 0.012277551020408162, | |
"grad_norm": 2.91416335105896, | |
"learning_rate": 9.989353363814501e-06, | |
"loss": 0.8648, | |
"step": 376 | |
}, | |
{ | |
"epoch": 0.012310204081632654, | |
"grad_norm": 2.8798558712005615, | |
"learning_rate": 9.989320705421294e-06, | |
"loss": 0.7871, | |
"step": 377 | |
}, | |
{ | |
"epoch": 0.012342857142857143, | |
"grad_norm": 2.9666454792022705, | |
"learning_rate": 9.989288047028087e-06, | |
"loss": 0.8461, | |
"step": 378 | |
}, | |
{ | |
"epoch": 0.012375510204081633, | |
"grad_norm": 2.9145092964172363, | |
"learning_rate": 9.98925538863488e-06, | |
"loss": 0.8288, | |
"step": 379 | |
}, | |
{ | |
"epoch": 0.012408163265306122, | |
"grad_norm": 2.844569444656372, | |
"learning_rate": 9.989222730241674e-06, | |
"loss": 0.8685, | |
"step": 380 | |
}, | |
{ | |
"epoch": 0.012440816326530612, | |
"grad_norm": 2.790707588195801, | |
"learning_rate": 9.989190071848465e-06, | |
"loss": 0.8467, | |
"step": 381 | |
}, | |
{ | |
"epoch": 0.012473469387755102, | |
"grad_norm": 2.7228894233703613, | |
"learning_rate": 9.989157413455259e-06, | |
"loss": 0.8149, | |
"step": 382 | |
}, | |
{ | |
"epoch": 0.012506122448979591, | |
"grad_norm": 2.8930270671844482, | |
"learning_rate": 9.989124755062052e-06, | |
"loss": 0.8521, | |
"step": 383 | |
}, | |
{ | |
"epoch": 0.012538775510204082, | |
"grad_norm": 2.7201409339904785, | |
"learning_rate": 9.989092096668845e-06, | |
"loss": 0.8303, | |
"step": 384 | |
}, | |
{ | |
"epoch": 0.012571428571428572, | |
"grad_norm": 2.558375597000122, | |
"learning_rate": 9.989059438275638e-06, | |
"loss": 0.8165, | |
"step": 385 | |
}, | |
{ | |
"epoch": 0.012604081632653062, | |
"grad_norm": 2.893265724182129, | |
"learning_rate": 9.98902677988243e-06, | |
"loss": 0.833, | |
"step": 386 | |
}, | |
{ | |
"epoch": 0.012636734693877551, | |
"grad_norm": 2.6960701942443848, | |
"learning_rate": 9.988994121489223e-06, | |
"loss": 0.7972, | |
"step": 387 | |
}, | |
{ | |
"epoch": 0.01266938775510204, | |
"grad_norm": 2.5734570026397705, | |
"learning_rate": 9.988961463096016e-06, | |
"loss": 0.8748, | |
"step": 388 | |
}, | |
{ | |
"epoch": 0.01270204081632653, | |
"grad_norm": 2.993227243423462, | |
"learning_rate": 9.98892880470281e-06, | |
"loss": 0.8405, | |
"step": 389 | |
}, | |
{ | |
"epoch": 0.01273469387755102, | |
"grad_norm": 3.2019479274749756, | |
"learning_rate": 9.988896146309603e-06, | |
"loss": 0.8423, | |
"step": 390 | |
}, | |
{ | |
"epoch": 0.012767346938775511, | |
"grad_norm": 3.2083091735839844, | |
"learning_rate": 9.988863487916394e-06, | |
"loss": 0.8088, | |
"step": 391 | |
}, | |
{ | |
"epoch": 0.0128, | |
"grad_norm": 3.1379964351654053, | |
"learning_rate": 9.988830829523188e-06, | |
"loss": 0.8681, | |
"step": 392 | |
}, | |
{ | |
"epoch": 0.01283265306122449, | |
"grad_norm": 2.7037453651428223, | |
"learning_rate": 9.98879817112998e-06, | |
"loss": 0.8379, | |
"step": 393 | |
}, | |
{ | |
"epoch": 0.01286530612244898, | |
"grad_norm": 2.828984022140503, | |
"learning_rate": 9.988765512736774e-06, | |
"loss": 0.8148, | |
"step": 394 | |
}, | |
{ | |
"epoch": 0.01289795918367347, | |
"grad_norm": 2.9447624683380127, | |
"learning_rate": 9.988732854343567e-06, | |
"loss": 0.8263, | |
"step": 395 | |
}, | |
{ | |
"epoch": 0.012930612244897959, | |
"grad_norm": 2.7229952812194824, | |
"learning_rate": 9.98870019595036e-06, | |
"loss": 0.8613, | |
"step": 396 | |
}, | |
{ | |
"epoch": 0.012963265306122448, | |
"grad_norm": 2.820530652999878, | |
"learning_rate": 9.988667537557152e-06, | |
"loss": 0.8174, | |
"step": 397 | |
}, | |
{ | |
"epoch": 0.012995918367346938, | |
"grad_norm": 2.897592067718506, | |
"learning_rate": 9.988634879163947e-06, | |
"loss": 0.8416, | |
"step": 398 | |
}, | |
{ | |
"epoch": 0.01302857142857143, | |
"grad_norm": 3.032390832901001, | |
"learning_rate": 9.988602220770739e-06, | |
"loss": 0.842, | |
"step": 399 | |
}, | |
{ | |
"epoch": 0.013061224489795919, | |
"grad_norm": 2.949514389038086, | |
"learning_rate": 9.988569562377532e-06, | |
"loss": 0.8283, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.013061224489795919, | |
"eval_loss": 0.8356085419654846, | |
"eval_runtime": 76.0573, | |
"eval_samples_per_second": 1.315, | |
"eval_steps_per_second": 1.315, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.013093877551020408, | |
"grad_norm": 2.811276912689209, | |
"learning_rate": 9.988536903984325e-06, | |
"loss": 0.7906, | |
"step": 401 | |
}, | |
{ | |
"epoch": 0.013126530612244898, | |
"grad_norm": 2.9961533546447754, | |
"learning_rate": 9.988504245591118e-06, | |
"loss": 0.839, | |
"step": 402 | |
}, | |
{ | |
"epoch": 0.013159183673469388, | |
"grad_norm": 3.0735530853271484, | |
"learning_rate": 9.988471587197911e-06, | |
"loss": 0.8053, | |
"step": 403 | |
}, | |
{ | |
"epoch": 0.013191836734693877, | |
"grad_norm": 2.793745279312134, | |
"learning_rate": 9.988438928804703e-06, | |
"loss": 0.8231, | |
"step": 404 | |
}, | |
{ | |
"epoch": 0.013224489795918367, | |
"grad_norm": 3.1552298069000244, | |
"learning_rate": 9.988406270411496e-06, | |
"loss": 0.8328, | |
"step": 405 | |
}, | |
{ | |
"epoch": 0.013257142857142858, | |
"grad_norm": 2.984865427017212, | |
"learning_rate": 9.98837361201829e-06, | |
"loss": 0.811, | |
"step": 406 | |
}, | |
{ | |
"epoch": 0.013289795918367348, | |
"grad_norm": 2.8718035221099854, | |
"learning_rate": 9.988340953625083e-06, | |
"loss": 0.853, | |
"step": 407 | |
}, | |
{ | |
"epoch": 0.013322448979591837, | |
"grad_norm": 3.0865938663482666, | |
"learning_rate": 9.988308295231876e-06, | |
"loss": 0.8003, | |
"step": 408 | |
}, | |
{ | |
"epoch": 0.013355102040816327, | |
"grad_norm": 2.7381820678710938, | |
"learning_rate": 9.988275636838668e-06, | |
"loss": 0.8184, | |
"step": 409 | |
}, | |
{ | |
"epoch": 0.013387755102040816, | |
"grad_norm": 2.8170037269592285, | |
"learning_rate": 9.98824297844546e-06, | |
"loss": 0.8311, | |
"step": 410 | |
}, | |
{ | |
"epoch": 0.013420408163265306, | |
"grad_norm": 2.9140095710754395, | |
"learning_rate": 9.988210320052254e-06, | |
"loss": 0.834, | |
"step": 411 | |
}, | |
{ | |
"epoch": 0.013453061224489795, | |
"grad_norm": 2.933525800704956, | |
"learning_rate": 9.988177661659047e-06, | |
"loss": 0.8294, | |
"step": 412 | |
}, | |
{ | |
"epoch": 0.013485714285714285, | |
"grad_norm": 2.8974623680114746, | |
"learning_rate": 9.98814500326584e-06, | |
"loss": 0.8704, | |
"step": 413 | |
}, | |
{ | |
"epoch": 0.013518367346938776, | |
"grad_norm": 2.794966697692871, | |
"learning_rate": 9.988112344872634e-06, | |
"loss": 0.8579, | |
"step": 414 | |
}, | |
{ | |
"epoch": 0.013551020408163266, | |
"grad_norm": 2.856208324432373, | |
"learning_rate": 9.988079686479425e-06, | |
"loss": 0.8239, | |
"step": 415 | |
}, | |
{ | |
"epoch": 0.013583673469387755, | |
"grad_norm": 2.999901533126831, | |
"learning_rate": 9.988047028086218e-06, | |
"loss": 0.8166, | |
"step": 416 | |
}, | |
{ | |
"epoch": 0.013616326530612245, | |
"grad_norm": 2.8142268657684326, | |
"learning_rate": 9.988014369693012e-06, | |
"loss": 0.8343, | |
"step": 417 | |
}, | |
{ | |
"epoch": 0.013648979591836734, | |
"grad_norm": 2.808567523956299, | |
"learning_rate": 9.987981711299805e-06, | |
"loss": 0.8414, | |
"step": 418 | |
}, | |
{ | |
"epoch": 0.013681632653061224, | |
"grad_norm": 2.6575918197631836, | |
"learning_rate": 9.987949052906598e-06, | |
"loss": 0.8256, | |
"step": 419 | |
}, | |
{ | |
"epoch": 0.013714285714285714, | |
"grad_norm": 2.9415652751922607, | |
"learning_rate": 9.987916394513391e-06, | |
"loss": 0.811, | |
"step": 420 | |
}, | |
{ | |
"epoch": 0.013746938775510205, | |
"grad_norm": 2.7710697650909424, | |
"learning_rate": 9.987883736120185e-06, | |
"loss": 0.8132, | |
"step": 421 | |
}, | |
{ | |
"epoch": 0.013779591836734694, | |
"grad_norm": 2.6907401084899902, | |
"learning_rate": 9.987851077726976e-06, | |
"loss": 0.8022, | |
"step": 422 | |
}, | |
{ | |
"epoch": 0.013812244897959184, | |
"grad_norm": 3.010432481765747, | |
"learning_rate": 9.98781841933377e-06, | |
"loss": 0.8196, | |
"step": 423 | |
}, | |
{ | |
"epoch": 0.013844897959183674, | |
"grad_norm": 2.868598461151123, | |
"learning_rate": 9.987785760940563e-06, | |
"loss": 0.8234, | |
"step": 424 | |
}, | |
{ | |
"epoch": 0.013877551020408163, | |
"grad_norm": 2.9146206378936768, | |
"learning_rate": 9.987753102547356e-06, | |
"loss": 0.8357, | |
"step": 425 | |
}, | |
{ | |
"epoch": 0.013910204081632653, | |
"grad_norm": 3.148287534713745, | |
"learning_rate": 9.987720444154149e-06, | |
"loss": 0.8168, | |
"step": 426 | |
}, | |
{ | |
"epoch": 0.013942857142857142, | |
"grad_norm": 3.0107545852661133, | |
"learning_rate": 9.98768778576094e-06, | |
"loss": 0.8313, | |
"step": 427 | |
}, | |
{ | |
"epoch": 0.013975510204081632, | |
"grad_norm": 2.7397043704986572, | |
"learning_rate": 9.987655127367734e-06, | |
"loss": 0.825, | |
"step": 428 | |
}, | |
{ | |
"epoch": 0.014008163265306123, | |
"grad_norm": 2.8118340969085693, | |
"learning_rate": 9.987622468974527e-06, | |
"loss": 0.8101, | |
"step": 429 | |
}, | |
{ | |
"epoch": 0.014040816326530613, | |
"grad_norm": 2.862879514694214, | |
"learning_rate": 9.98758981058132e-06, | |
"loss": 0.8504, | |
"step": 430 | |
}, | |
{ | |
"epoch": 0.014073469387755102, | |
"grad_norm": 3.0519521236419678, | |
"learning_rate": 9.987557152188114e-06, | |
"loss": 0.8337, | |
"step": 431 | |
}, | |
{ | |
"epoch": 0.014106122448979592, | |
"grad_norm": 2.8947372436523438, | |
"learning_rate": 9.987524493794905e-06, | |
"loss": 0.8041, | |
"step": 432 | |
}, | |
{ | |
"epoch": 0.014138775510204081, | |
"grad_norm": 2.7874197959899902, | |
"learning_rate": 9.987491835401698e-06, | |
"loss": 0.8143, | |
"step": 433 | |
}, | |
{ | |
"epoch": 0.014171428571428571, | |
"grad_norm": 3.011812686920166, | |
"learning_rate": 9.987459177008492e-06, | |
"loss": 0.7936, | |
"step": 434 | |
}, | |
{ | |
"epoch": 0.01420408163265306, | |
"grad_norm": 2.8212246894836426, | |
"learning_rate": 9.987426518615285e-06, | |
"loss": 0.8028, | |
"step": 435 | |
}, | |
{ | |
"epoch": 0.014236734693877552, | |
"grad_norm": 2.7385244369506836, | |
"learning_rate": 9.987393860222078e-06, | |
"loss": 0.8292, | |
"step": 436 | |
}, | |
{ | |
"epoch": 0.014269387755102041, | |
"grad_norm": 2.7048816680908203, | |
"learning_rate": 9.987361201828871e-06, | |
"loss": 0.7567, | |
"step": 437 | |
}, | |
{ | |
"epoch": 0.014302040816326531, | |
"grad_norm": 3.001534938812256, | |
"learning_rate": 9.987328543435663e-06, | |
"loss": 0.7897, | |
"step": 438 | |
}, | |
{ | |
"epoch": 0.01433469387755102, | |
"grad_norm": 3.1388514041900635, | |
"learning_rate": 9.987295885042456e-06, | |
"loss": 0.8373, | |
"step": 439 | |
}, | |
{ | |
"epoch": 0.01436734693877551, | |
"grad_norm": 2.902858018875122, | |
"learning_rate": 9.98726322664925e-06, | |
"loss": 0.8342, | |
"step": 440 | |
}, | |
{ | |
"epoch": 0.0144, | |
"grad_norm": 2.871511459350586, | |
"learning_rate": 9.987230568256043e-06, | |
"loss": 0.8107, | |
"step": 441 | |
}, | |
{ | |
"epoch": 0.01443265306122449, | |
"grad_norm": 3.0739245414733887, | |
"learning_rate": 9.987197909862836e-06, | |
"loss": 0.8171, | |
"step": 442 | |
}, | |
{ | |
"epoch": 0.014465306122448979, | |
"grad_norm": 2.9680142402648926, | |
"learning_rate": 9.987165251469629e-06, | |
"loss": 0.8492, | |
"step": 443 | |
}, | |
{ | |
"epoch": 0.01449795918367347, | |
"grad_norm": 2.763535737991333, | |
"learning_rate": 9.987132593076422e-06, | |
"loss": 0.8201, | |
"step": 444 | |
}, | |
{ | |
"epoch": 0.01453061224489796, | |
"grad_norm": 2.8160109519958496, | |
"learning_rate": 9.987099934683214e-06, | |
"loss": 0.8175, | |
"step": 445 | |
}, | |
{ | |
"epoch": 0.01456326530612245, | |
"grad_norm": 2.746919870376587, | |
"learning_rate": 9.987067276290007e-06, | |
"loss": 0.8121, | |
"step": 446 | |
}, | |
{ | |
"epoch": 0.014595918367346939, | |
"grad_norm": 2.9417343139648438, | |
"learning_rate": 9.9870346178968e-06, | |
"loss": 0.8211, | |
"step": 447 | |
}, | |
{ | |
"epoch": 0.014628571428571428, | |
"grad_norm": 2.886810302734375, | |
"learning_rate": 9.987001959503594e-06, | |
"loss": 0.7889, | |
"step": 448 | |
}, | |
{ | |
"epoch": 0.014661224489795918, | |
"grad_norm": 2.89780592918396, | |
"learning_rate": 9.986969301110387e-06, | |
"loss": 0.8386, | |
"step": 449 | |
}, | |
{ | |
"epoch": 0.014693877551020407, | |
"grad_norm": 2.8015475273132324, | |
"learning_rate": 9.986936642717178e-06, | |
"loss": 0.8023, | |
"step": 450 | |
}, | |
{ | |
"epoch": 0.014693877551020407, | |
"eval_loss": 0.83209228515625, | |
"eval_runtime": 76.371, | |
"eval_samples_per_second": 1.309, | |
"eval_steps_per_second": 1.309, | |
"step": 450 | |
}, | |
{ | |
"epoch": 0.014726530612244899, | |
"grad_norm": 3.0286951065063477, | |
"learning_rate": 9.986903984323972e-06, | |
"loss": 0.795, | |
"step": 451 | |
}, | |
{ | |
"epoch": 0.014759183673469388, | |
"grad_norm": 2.798675060272217, | |
"learning_rate": 9.986871325930765e-06, | |
"loss": 0.778, | |
"step": 452 | |
}, | |
{ | |
"epoch": 0.014791836734693878, | |
"grad_norm": 2.9744062423706055, | |
"learning_rate": 9.986838667537558e-06, | |
"loss": 0.8034, | |
"step": 453 | |
}, | |
{ | |
"epoch": 0.014824489795918367, | |
"grad_norm": 3.131807804107666, | |
"learning_rate": 9.986806009144351e-06, | |
"loss": 0.8147, | |
"step": 454 | |
}, | |
{ | |
"epoch": 0.014857142857142857, | |
"grad_norm": 2.759378671646118, | |
"learning_rate": 9.986773350751145e-06, | |
"loss": 0.7798, | |
"step": 455 | |
}, | |
{ | |
"epoch": 0.014889795918367347, | |
"grad_norm": 2.79740047454834, | |
"learning_rate": 9.986740692357936e-06, | |
"loss": 0.793, | |
"step": 456 | |
}, | |
{ | |
"epoch": 0.014922448979591836, | |
"grad_norm": 3.367506742477417, | |
"learning_rate": 9.98670803396473e-06, | |
"loss": 0.8295, | |
"step": 457 | |
}, | |
{ | |
"epoch": 0.014955102040816327, | |
"grad_norm": 2.965843677520752, | |
"learning_rate": 9.986675375571523e-06, | |
"loss": 0.7908, | |
"step": 458 | |
}, | |
{ | |
"epoch": 0.014987755102040817, | |
"grad_norm": 2.815945625305176, | |
"learning_rate": 9.986642717178316e-06, | |
"loss": 0.821, | |
"step": 459 | |
}, | |
{ | |
"epoch": 0.015020408163265307, | |
"grad_norm": 2.8080756664276123, | |
"learning_rate": 9.986610058785109e-06, | |
"loss": 0.7952, | |
"step": 460 | |
}, | |
{ | |
"epoch": 0.015053061224489796, | |
"grad_norm": 3.2081828117370605, | |
"learning_rate": 9.9865774003919e-06, | |
"loss": 0.8306, | |
"step": 461 | |
}, | |
{ | |
"epoch": 0.015085714285714286, | |
"grad_norm": 2.866701602935791, | |
"learning_rate": 9.986544741998695e-06, | |
"loss": 0.8438, | |
"step": 462 | |
}, | |
{ | |
"epoch": 0.015118367346938775, | |
"grad_norm": 2.806741952896118, | |
"learning_rate": 9.986512083605487e-06, | |
"loss": 0.8082, | |
"step": 463 | |
}, | |
{ | |
"epoch": 0.015151020408163265, | |
"grad_norm": 2.8352231979370117, | |
"learning_rate": 9.98647942521228e-06, | |
"loss": 0.7805, | |
"step": 464 | |
}, | |
{ | |
"epoch": 0.015183673469387754, | |
"grad_norm": 2.838469982147217, | |
"learning_rate": 9.986446766819073e-06, | |
"loss": 0.844, | |
"step": 465 | |
}, | |
{ | |
"epoch": 0.015216326530612246, | |
"grad_norm": 2.7868754863739014, | |
"learning_rate": 9.986414108425867e-06, | |
"loss": 0.8078, | |
"step": 466 | |
}, | |
{ | |
"epoch": 0.015248979591836735, | |
"grad_norm": 2.809109687805176, | |
"learning_rate": 9.98638145003266e-06, | |
"loss": 0.8425, | |
"step": 467 | |
}, | |
{ | |
"epoch": 0.015281632653061225, | |
"grad_norm": 2.7377090454101562, | |
"learning_rate": 9.986348791639451e-06, | |
"loss": 0.8332, | |
"step": 468 | |
}, | |
{ | |
"epoch": 0.015314285714285714, | |
"grad_norm": 2.863290786743164, | |
"learning_rate": 9.986316133246245e-06, | |
"loss": 0.8352, | |
"step": 469 | |
}, | |
{ | |
"epoch": 0.015346938775510204, | |
"grad_norm": 2.720506429672241, | |
"learning_rate": 9.986283474853038e-06, | |
"loss": 0.8114, | |
"step": 470 | |
}, | |
{ | |
"epoch": 0.015379591836734693, | |
"grad_norm": 2.7052462100982666, | |
"learning_rate": 9.986250816459831e-06, | |
"loss": 0.8093, | |
"step": 471 | |
}, | |
{ | |
"epoch": 0.015412244897959183, | |
"grad_norm": 3.0364644527435303, | |
"learning_rate": 9.986218158066624e-06, | |
"loss": 0.8156, | |
"step": 472 | |
}, | |
{ | |
"epoch": 0.015444897959183674, | |
"grad_norm": 2.9919137954711914, | |
"learning_rate": 9.986185499673416e-06, | |
"loss": 0.7906, | |
"step": 473 | |
}, | |
{ | |
"epoch": 0.015477551020408164, | |
"grad_norm": 3.102023124694824, | |
"learning_rate": 9.98615284128021e-06, | |
"loss": 0.8247, | |
"step": 474 | |
}, | |
{ | |
"epoch": 0.015510204081632653, | |
"grad_norm": 2.8093349933624268, | |
"learning_rate": 9.986120182887002e-06, | |
"loss": 0.8339, | |
"step": 475 | |
}, | |
{ | |
"epoch": 0.015542857142857143, | |
"grad_norm": 2.893361806869507, | |
"learning_rate": 9.986087524493796e-06, | |
"loss": 0.7698, | |
"step": 476 | |
}, | |
{ | |
"epoch": 0.015575510204081633, | |
"grad_norm": 2.813911199569702, | |
"learning_rate": 9.986054866100589e-06, | |
"loss": 0.8204, | |
"step": 477 | |
}, | |
{ | |
"epoch": 0.015608163265306122, | |
"grad_norm": 2.7959954738616943, | |
"learning_rate": 9.986022207707382e-06, | |
"loss": 0.8099, | |
"step": 478 | |
}, | |
{ | |
"epoch": 0.015640816326530613, | |
"grad_norm": 2.637242317199707, | |
"learning_rate": 9.985989549314174e-06, | |
"loss": 0.8218, | |
"step": 479 | |
}, | |
{ | |
"epoch": 0.0156734693877551, | |
"grad_norm": 2.694908380508423, | |
"learning_rate": 9.985956890920967e-06, | |
"loss": 0.843, | |
"step": 480 | |
}, | |
{ | |
"epoch": 0.015706122448979593, | |
"grad_norm": 2.65104603767395, | |
"learning_rate": 9.98592423252776e-06, | |
"loss": 0.8142, | |
"step": 481 | |
}, | |
{ | |
"epoch": 0.01573877551020408, | |
"grad_norm": 2.7440900802612305, | |
"learning_rate": 9.985891574134553e-06, | |
"loss": 0.8079, | |
"step": 482 | |
}, | |
{ | |
"epoch": 0.01577142857142857, | |
"grad_norm": 2.7440459728240967, | |
"learning_rate": 9.985858915741347e-06, | |
"loss": 0.8106, | |
"step": 483 | |
}, | |
{ | |
"epoch": 0.015804081632653063, | |
"grad_norm": 2.8990628719329834, | |
"learning_rate": 9.98582625734814e-06, | |
"loss": 0.7954, | |
"step": 484 | |
}, | |
{ | |
"epoch": 0.01583673469387755, | |
"grad_norm": 2.8682186603546143, | |
"learning_rate": 9.985793598954933e-06, | |
"loss": 0.7892, | |
"step": 485 | |
}, | |
{ | |
"epoch": 0.015869387755102042, | |
"grad_norm": 3.1332759857177734, | |
"learning_rate": 9.985760940561725e-06, | |
"loss": 0.8182, | |
"step": 486 | |
}, | |
{ | |
"epoch": 0.01590204081632653, | |
"grad_norm": 2.9516282081604004, | |
"learning_rate": 9.985728282168518e-06, | |
"loss": 0.7985, | |
"step": 487 | |
}, | |
{ | |
"epoch": 0.01593469387755102, | |
"grad_norm": 3.3294899463653564, | |
"learning_rate": 9.985695623775311e-06, | |
"loss": 0.8368, | |
"step": 488 | |
}, | |
{ | |
"epoch": 0.01596734693877551, | |
"grad_norm": 2.95438814163208, | |
"learning_rate": 9.985662965382104e-06, | |
"loss": 0.8151, | |
"step": 489 | |
}, | |
{ | |
"epoch": 0.016, | |
"grad_norm": 2.7308006286621094, | |
"learning_rate": 9.985630306988898e-06, | |
"loss": 0.827, | |
"step": 490 | |
}, | |
{ | |
"epoch": 0.016032653061224488, | |
"grad_norm": 2.695218801498413, | |
"learning_rate": 9.985597648595689e-06, | |
"loss": 0.7846, | |
"step": 491 | |
}, | |
{ | |
"epoch": 0.01606530612244898, | |
"grad_norm": 2.842834949493408, | |
"learning_rate": 9.985564990202482e-06, | |
"loss": 0.783, | |
"step": 492 | |
}, | |
{ | |
"epoch": 0.01609795918367347, | |
"grad_norm": 2.944671869277954, | |
"learning_rate": 9.985532331809276e-06, | |
"loss": 0.8273, | |
"step": 493 | |
}, | |
{ | |
"epoch": 0.01613061224489796, | |
"grad_norm": 2.8893423080444336, | |
"learning_rate": 9.985499673416069e-06, | |
"loss": 0.8131, | |
"step": 494 | |
}, | |
{ | |
"epoch": 0.01616326530612245, | |
"grad_norm": 2.917097568511963, | |
"learning_rate": 9.985467015022862e-06, | |
"loss": 0.8491, | |
"step": 495 | |
}, | |
{ | |
"epoch": 0.016195918367346938, | |
"grad_norm": 2.678619384765625, | |
"learning_rate": 9.985434356629654e-06, | |
"loss": 0.7789, | |
"step": 496 | |
}, | |
{ | |
"epoch": 0.01622857142857143, | |
"grad_norm": 2.86249041557312, | |
"learning_rate": 9.985401698236447e-06, | |
"loss": 0.8461, | |
"step": 497 | |
}, | |
{ | |
"epoch": 0.016261224489795917, | |
"grad_norm": 2.9272475242614746, | |
"learning_rate": 9.98536903984324e-06, | |
"loss": 0.8047, | |
"step": 498 | |
}, | |
{ | |
"epoch": 0.016293877551020408, | |
"grad_norm": 2.894817590713501, | |
"learning_rate": 9.985336381450033e-06, | |
"loss": 0.8157, | |
"step": 499 | |
}, | |
{ | |
"epoch": 0.0163265306122449, | |
"grad_norm": 2.9469430446624756, | |
"learning_rate": 9.985303723056827e-06, | |
"loss": 0.8409, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.0163265306122449, | |
"eval_loss": 0.8293061852455139, | |
"eval_runtime": 76.9824, | |
"eval_samples_per_second": 1.299, | |
"eval_steps_per_second": 1.299, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.016359183673469387, | |
"grad_norm": 2.7225022315979004, | |
"learning_rate": 9.98527106466362e-06, | |
"loss": 0.7978, | |
"step": 501 | |
}, | |
{ | |
"epoch": 0.01639183673469388, | |
"grad_norm": 2.6931450366973877, | |
"learning_rate": 9.985238406270411e-06, | |
"loss": 0.8089, | |
"step": 502 | |
}, | |
{ | |
"epoch": 0.016424489795918366, | |
"grad_norm": 2.697438955307007, | |
"learning_rate": 9.985205747877206e-06, | |
"loss": 0.8231, | |
"step": 503 | |
}, | |
{ | |
"epoch": 0.016457142857142858, | |
"grad_norm": 2.997002124786377, | |
"learning_rate": 9.985173089483998e-06, | |
"loss": 0.8211, | |
"step": 504 | |
}, | |
{ | |
"epoch": 0.016489795918367346, | |
"grad_norm": 2.952876567840576, | |
"learning_rate": 9.985140431090791e-06, | |
"loss": 0.786, | |
"step": 505 | |
}, | |
{ | |
"epoch": 0.016522448979591837, | |
"grad_norm": 3.085167407989502, | |
"learning_rate": 9.985107772697584e-06, | |
"loss": 0.8261, | |
"step": 506 | |
}, | |
{ | |
"epoch": 0.016555102040816328, | |
"grad_norm": 2.8937699794769287, | |
"learning_rate": 9.985075114304378e-06, | |
"loss": 0.8009, | |
"step": 507 | |
}, | |
{ | |
"epoch": 0.016587755102040816, | |
"grad_norm": 2.934492588043213, | |
"learning_rate": 9.98504245591117e-06, | |
"loss": 0.8059, | |
"step": 508 | |
}, | |
{ | |
"epoch": 0.016620408163265307, | |
"grad_norm": 2.9281036853790283, | |
"learning_rate": 9.985009797517962e-06, | |
"loss": 0.8488, | |
"step": 509 | |
}, | |
{ | |
"epoch": 0.016653061224489795, | |
"grad_norm": 2.8904941082000732, | |
"learning_rate": 9.984977139124756e-06, | |
"loss": 0.8431, | |
"step": 510 | |
}, | |
{ | |
"epoch": 0.016685714285714286, | |
"grad_norm": 2.7994799613952637, | |
"learning_rate": 9.984944480731549e-06, | |
"loss": 0.8339, | |
"step": 511 | |
}, | |
{ | |
"epoch": 0.016718367346938774, | |
"grad_norm": 3.0456082820892334, | |
"learning_rate": 9.984911822338342e-06, | |
"loss": 0.8077, | |
"step": 512 | |
}, | |
{ | |
"epoch": 0.016751020408163265, | |
"grad_norm": 2.969820261001587, | |
"learning_rate": 9.984879163945135e-06, | |
"loss": 0.8254, | |
"step": 513 | |
}, | |
{ | |
"epoch": 0.016783673469387757, | |
"grad_norm": 2.691143035888672, | |
"learning_rate": 9.984846505551927e-06, | |
"loss": 0.8208, | |
"step": 514 | |
}, | |
{ | |
"epoch": 0.016816326530612245, | |
"grad_norm": 3.0714621543884277, | |
"learning_rate": 9.98481384715872e-06, | |
"loss": 0.8094, | |
"step": 515 | |
}, | |
{ | |
"epoch": 0.016848979591836736, | |
"grad_norm": 3.146872043609619, | |
"learning_rate": 9.984781188765513e-06, | |
"loss": 0.8295, | |
"step": 516 | |
}, | |
{ | |
"epoch": 0.016881632653061224, | |
"grad_norm": 3.0401713848114014, | |
"learning_rate": 9.984748530372307e-06, | |
"loss": 0.8438, | |
"step": 517 | |
}, | |
{ | |
"epoch": 0.016914285714285715, | |
"grad_norm": 3.1565845012664795, | |
"learning_rate": 9.9847158719791e-06, | |
"loss": 0.8238, | |
"step": 518 | |
}, | |
{ | |
"epoch": 0.016946938775510203, | |
"grad_norm": 3.035167932510376, | |
"learning_rate": 9.984683213585893e-06, | |
"loss": 0.7907, | |
"step": 519 | |
}, | |
{ | |
"epoch": 0.016979591836734694, | |
"grad_norm": 2.9173972606658936, | |
"learning_rate": 9.984650555192685e-06, | |
"loss": 0.8298, | |
"step": 520 | |
}, | |
{ | |
"epoch": 0.017012244897959182, | |
"grad_norm": 3.0475914478302, | |
"learning_rate": 9.984617896799478e-06, | |
"loss": 0.7896, | |
"step": 521 | |
}, | |
{ | |
"epoch": 0.017044897959183673, | |
"grad_norm": 2.8846919536590576, | |
"learning_rate": 9.984585238406271e-06, | |
"loss": 0.8268, | |
"step": 522 | |
}, | |
{ | |
"epoch": 0.017077551020408165, | |
"grad_norm": 2.9851925373077393, | |
"learning_rate": 9.984552580013064e-06, | |
"loss": 0.8166, | |
"step": 523 | |
}, | |
{ | |
"epoch": 0.017110204081632652, | |
"grad_norm": 2.960608720779419, | |
"learning_rate": 9.984519921619857e-06, | |
"loss": 0.838, | |
"step": 524 | |
}, | |
{ | |
"epoch": 0.017142857142857144, | |
"grad_norm": 2.8663570880889893, | |
"learning_rate": 9.984487263226649e-06, | |
"loss": 0.8134, | |
"step": 525 | |
}, | |
{ | |
"epoch": 0.01717551020408163, | |
"grad_norm": 3.2871882915496826, | |
"learning_rate": 9.984454604833444e-06, | |
"loss": 0.842, | |
"step": 526 | |
}, | |
{ | |
"epoch": 0.017208163265306123, | |
"grad_norm": 2.9747695922851562, | |
"learning_rate": 9.984421946440235e-06, | |
"loss": 0.8371, | |
"step": 527 | |
}, | |
{ | |
"epoch": 0.01724081632653061, | |
"grad_norm": 3.065366744995117, | |
"learning_rate": 9.984389288047029e-06, | |
"loss": 0.799, | |
"step": 528 | |
}, | |
{ | |
"epoch": 0.017273469387755102, | |
"grad_norm": 3.3719398975372314, | |
"learning_rate": 9.984356629653822e-06, | |
"loss": 0.7875, | |
"step": 529 | |
}, | |
{ | |
"epoch": 0.017306122448979593, | |
"grad_norm": 3.0407257080078125, | |
"learning_rate": 9.984323971260615e-06, | |
"loss": 0.833, | |
"step": 530 | |
}, | |
{ | |
"epoch": 0.01733877551020408, | |
"grad_norm": 2.992464780807495, | |
"learning_rate": 9.984291312867408e-06, | |
"loss": 0.83, | |
"step": 531 | |
}, | |
{ | |
"epoch": 0.017371428571428572, | |
"grad_norm": 2.655125379562378, | |
"learning_rate": 9.9842586544742e-06, | |
"loss": 0.8243, | |
"step": 532 | |
}, | |
{ | |
"epoch": 0.01740408163265306, | |
"grad_norm": 2.7782280445098877, | |
"learning_rate": 9.984225996080993e-06, | |
"loss": 0.8279, | |
"step": 533 | |
}, | |
{ | |
"epoch": 0.01743673469387755, | |
"grad_norm": 2.6449782848358154, | |
"learning_rate": 9.984193337687786e-06, | |
"loss": 0.8517, | |
"step": 534 | |
}, | |
{ | |
"epoch": 0.01746938775510204, | |
"grad_norm": 3.011317491531372, | |
"learning_rate": 9.98416067929458e-06, | |
"loss": 0.7763, | |
"step": 535 | |
}, | |
{ | |
"epoch": 0.01750204081632653, | |
"grad_norm": 2.915745496749878, | |
"learning_rate": 9.984128020901373e-06, | |
"loss": 0.8283, | |
"step": 536 | |
}, | |
{ | |
"epoch": 0.017534693877551022, | |
"grad_norm": 2.7176997661590576, | |
"learning_rate": 9.984095362508164e-06, | |
"loss": 0.7846, | |
"step": 537 | |
}, | |
{ | |
"epoch": 0.01756734693877551, | |
"grad_norm": 2.9343252182006836, | |
"learning_rate": 9.984062704114958e-06, | |
"loss": 0.7733, | |
"step": 538 | |
}, | |
{ | |
"epoch": 0.0176, | |
"grad_norm": 3.041198492050171, | |
"learning_rate": 9.984030045721751e-06, | |
"loss": 0.8024, | |
"step": 539 | |
}, | |
{ | |
"epoch": 0.01763265306122449, | |
"grad_norm": 2.8848764896392822, | |
"learning_rate": 9.983997387328544e-06, | |
"loss": 0.8064, | |
"step": 540 | |
}, | |
{ | |
"epoch": 0.01766530612244898, | |
"grad_norm": 2.847885847091675, | |
"learning_rate": 9.983964728935337e-06, | |
"loss": 0.7638, | |
"step": 541 | |
}, | |
{ | |
"epoch": 0.017697959183673468, | |
"grad_norm": 2.7816402912139893, | |
"learning_rate": 9.98393207054213e-06, | |
"loss": 0.7839, | |
"step": 542 | |
}, | |
{ | |
"epoch": 0.01773061224489796, | |
"grad_norm": 2.8443193435668945, | |
"learning_rate": 9.983899412148922e-06, | |
"loss": 0.8005, | |
"step": 543 | |
}, | |
{ | |
"epoch": 0.01776326530612245, | |
"grad_norm": 2.923306941986084, | |
"learning_rate": 9.983866753755717e-06, | |
"loss": 0.8048, | |
"step": 544 | |
}, | |
{ | |
"epoch": 0.01779591836734694, | |
"grad_norm": 3.0685269832611084, | |
"learning_rate": 9.983834095362509e-06, | |
"loss": 0.7676, | |
"step": 545 | |
}, | |
{ | |
"epoch": 0.01782857142857143, | |
"grad_norm": 2.841459274291992, | |
"learning_rate": 9.983801436969302e-06, | |
"loss": 0.7828, | |
"step": 546 | |
}, | |
{ | |
"epoch": 0.017861224489795918, | |
"grad_norm": 3.1108107566833496, | |
"learning_rate": 9.983768778576095e-06, | |
"loss": 0.8287, | |
"step": 547 | |
}, | |
{ | |
"epoch": 0.01789387755102041, | |
"grad_norm": 2.8307478427886963, | |
"learning_rate": 9.983736120182888e-06, | |
"loss": 0.7956, | |
"step": 548 | |
}, | |
{ | |
"epoch": 0.017926530612244897, | |
"grad_norm": 2.83941650390625, | |
"learning_rate": 9.983703461789682e-06, | |
"loss": 0.7963, | |
"step": 549 | |
}, | |
{ | |
"epoch": 0.017959183673469388, | |
"grad_norm": 2.710853338241577, | |
"learning_rate": 9.983670803396473e-06, | |
"loss": 0.8023, | |
"step": 550 | |
}, | |
{ | |
"epoch": 0.017959183673469388, | |
"eval_loss": 0.8259984850883484, | |
"eval_runtime": 98.251, | |
"eval_samples_per_second": 1.018, | |
"eval_steps_per_second": 1.018, | |
"step": 550 | |
}, | |
{ | |
"epoch": 0.01799183673469388, | |
"grad_norm": 2.7674920558929443, | |
"learning_rate": 9.983638145003266e-06, | |
"loss": 0.8121, | |
"step": 551 | |
}, | |
{ | |
"epoch": 0.018024489795918367, | |
"grad_norm": 2.756206512451172, | |
"learning_rate": 9.98360548661006e-06, | |
"loss": 0.7886, | |
"step": 552 | |
}, | |
{ | |
"epoch": 0.01805714285714286, | |
"grad_norm": 3.000251054763794, | |
"learning_rate": 9.983572828216853e-06, | |
"loss": 0.8181, | |
"step": 553 | |
}, | |
{ | |
"epoch": 0.018089795918367346, | |
"grad_norm": 2.7608935832977295, | |
"learning_rate": 9.983540169823646e-06, | |
"loss": 0.8047, | |
"step": 554 | |
}, | |
{ | |
"epoch": 0.018122448979591838, | |
"grad_norm": 2.7886579036712646, | |
"learning_rate": 9.983507511430438e-06, | |
"loss": 0.8023, | |
"step": 555 | |
}, | |
{ | |
"epoch": 0.018155102040816325, | |
"grad_norm": 2.7562673091888428, | |
"learning_rate": 9.983474853037231e-06, | |
"loss": 0.7935, | |
"step": 556 | |
}, | |
{ | |
"epoch": 0.018187755102040817, | |
"grad_norm": 2.8150694370269775, | |
"learning_rate": 9.983442194644024e-06, | |
"loss": 0.8252, | |
"step": 557 | |
}, | |
{ | |
"epoch": 0.018220408163265304, | |
"grad_norm": 3.069070339202881, | |
"learning_rate": 9.983409536250817e-06, | |
"loss": 0.8352, | |
"step": 558 | |
}, | |
{ | |
"epoch": 0.018253061224489796, | |
"grad_norm": 3.065143585205078, | |
"learning_rate": 9.98337687785761e-06, | |
"loss": 0.8322, | |
"step": 559 | |
}, | |
{ | |
"epoch": 0.018285714285714287, | |
"grad_norm": 2.808361530303955, | |
"learning_rate": 9.983344219464404e-06, | |
"loss": 0.8316, | |
"step": 560 | |
}, | |
{ | |
"epoch": 0.018318367346938775, | |
"grad_norm": 3.086597204208374, | |
"learning_rate": 9.983311561071195e-06, | |
"loss": 0.8063, | |
"step": 561 | |
}, | |
{ | |
"epoch": 0.018351020408163266, | |
"grad_norm": 2.8510525226593018, | |
"learning_rate": 9.983278902677989e-06, | |
"loss": 0.7735, | |
"step": 562 | |
}, | |
{ | |
"epoch": 0.018383673469387754, | |
"grad_norm": 2.8289899826049805, | |
"learning_rate": 9.983246244284782e-06, | |
"loss": 0.831, | |
"step": 563 | |
}, | |
{ | |
"epoch": 0.018416326530612245, | |
"grad_norm": 2.6663060188293457, | |
"learning_rate": 9.983213585891575e-06, | |
"loss": 0.8109, | |
"step": 564 | |
}, | |
{ | |
"epoch": 0.018448979591836733, | |
"grad_norm": 2.8458051681518555, | |
"learning_rate": 9.983180927498368e-06, | |
"loss": 0.8069, | |
"step": 565 | |
}, | |
{ | |
"epoch": 0.018481632653061224, | |
"grad_norm": 3.2364461421966553, | |
"learning_rate": 9.98314826910516e-06, | |
"loss": 0.8214, | |
"step": 566 | |
}, | |
{ | |
"epoch": 0.018514285714285716, | |
"grad_norm": 3.0581226348876953, | |
"learning_rate": 9.983115610711955e-06, | |
"loss": 0.8115, | |
"step": 567 | |
}, | |
{ | |
"epoch": 0.018546938775510204, | |
"grad_norm": 2.7239277362823486, | |
"learning_rate": 9.983082952318746e-06, | |
"loss": 0.8248, | |
"step": 568 | |
}, | |
{ | |
"epoch": 0.018579591836734695, | |
"grad_norm": 2.827193260192871, | |
"learning_rate": 9.98305029392554e-06, | |
"loss": 0.8282, | |
"step": 569 | |
}, | |
{ | |
"epoch": 0.018612244897959183, | |
"grad_norm": 3.018669605255127, | |
"learning_rate": 9.983017635532333e-06, | |
"loss": 0.8196, | |
"step": 570 | |
}, | |
{ | |
"epoch": 0.018644897959183674, | |
"grad_norm": 2.9022018909454346, | |
"learning_rate": 9.982984977139126e-06, | |
"loss": 0.7748, | |
"step": 571 | |
}, | |
{ | |
"epoch": 0.018677551020408162, | |
"grad_norm": 2.7094552516937256, | |
"learning_rate": 9.98295231874592e-06, | |
"loss": 0.8381, | |
"step": 572 | |
}, | |
{ | |
"epoch": 0.018710204081632653, | |
"grad_norm": 3.020749568939209, | |
"learning_rate": 9.98291966035271e-06, | |
"loss": 0.7954, | |
"step": 573 | |
}, | |
{ | |
"epoch": 0.018742857142857144, | |
"grad_norm": 2.9567782878875732, | |
"learning_rate": 9.982887001959504e-06, | |
"loss": 0.8197, | |
"step": 574 | |
}, | |
{ | |
"epoch": 0.018775510204081632, | |
"grad_norm": 2.939793348312378, | |
"learning_rate": 9.982854343566297e-06, | |
"loss": 0.7891, | |
"step": 575 | |
}, | |
{ | |
"epoch": 0.018808163265306124, | |
"grad_norm": 2.851130485534668, | |
"learning_rate": 9.98282168517309e-06, | |
"loss": 0.8669, | |
"step": 576 | |
}, | |
{ | |
"epoch": 0.01884081632653061, | |
"grad_norm": 2.745495080947876, | |
"learning_rate": 9.982789026779884e-06, | |
"loss": 0.8028, | |
"step": 577 | |
}, | |
{ | |
"epoch": 0.018873469387755103, | |
"grad_norm": 2.83685302734375, | |
"learning_rate": 9.982756368386675e-06, | |
"loss": 0.7981, | |
"step": 578 | |
}, | |
{ | |
"epoch": 0.01890612244897959, | |
"grad_norm": 2.845992088317871, | |
"learning_rate": 9.982723709993469e-06, | |
"loss": 0.7752, | |
"step": 579 | |
}, | |
{ | |
"epoch": 0.018938775510204082, | |
"grad_norm": 3.1280617713928223, | |
"learning_rate": 9.982691051600262e-06, | |
"loss": 0.824, | |
"step": 580 | |
}, | |
{ | |
"epoch": 0.018971428571428573, | |
"grad_norm": 2.8821194171905518, | |
"learning_rate": 9.982658393207055e-06, | |
"loss": 0.7925, | |
"step": 581 | |
}, | |
{ | |
"epoch": 0.01900408163265306, | |
"grad_norm": 2.827406167984009, | |
"learning_rate": 9.982625734813848e-06, | |
"loss": 0.8311, | |
"step": 582 | |
}, | |
{ | |
"epoch": 0.019036734693877552, | |
"grad_norm": 2.7628026008605957, | |
"learning_rate": 9.982593076420641e-06, | |
"loss": 0.8228, | |
"step": 583 | |
}, | |
{ | |
"epoch": 0.01906938775510204, | |
"grad_norm": 2.8712172508239746, | |
"learning_rate": 9.982560418027433e-06, | |
"loss": 0.8014, | |
"step": 584 | |
}, | |
{ | |
"epoch": 0.01910204081632653, | |
"grad_norm": 2.85799241065979, | |
"learning_rate": 9.982527759634226e-06, | |
"loss": 0.8281, | |
"step": 585 | |
}, | |
{ | |
"epoch": 0.01913469387755102, | |
"grad_norm": 2.917358160018921, | |
"learning_rate": 9.98249510124102e-06, | |
"loss": 0.8049, | |
"step": 586 | |
}, | |
{ | |
"epoch": 0.01916734693877551, | |
"grad_norm": 2.8612101078033447, | |
"learning_rate": 9.982462442847813e-06, | |
"loss": 0.7645, | |
"step": 587 | |
}, | |
{ | |
"epoch": 0.0192, | |
"grad_norm": 2.8899152278900146, | |
"learning_rate": 9.982429784454606e-06, | |
"loss": 0.7655, | |
"step": 588 | |
}, | |
{ | |
"epoch": 0.01923265306122449, | |
"grad_norm": 2.761504888534546, | |
"learning_rate": 9.9823971260614e-06, | |
"loss": 0.784, | |
"step": 589 | |
}, | |
{ | |
"epoch": 0.01926530612244898, | |
"grad_norm": 3.1624319553375244, | |
"learning_rate": 9.982364467668192e-06, | |
"loss": 0.8538, | |
"step": 590 | |
}, | |
{ | |
"epoch": 0.01929795918367347, | |
"grad_norm": 2.840982675552368, | |
"learning_rate": 9.982331809274984e-06, | |
"loss": 0.7879, | |
"step": 591 | |
}, | |
{ | |
"epoch": 0.01933061224489796, | |
"grad_norm": 2.8063557147979736, | |
"learning_rate": 9.982299150881777e-06, | |
"loss": 0.7858, | |
"step": 592 | |
}, | |
{ | |
"epoch": 0.019363265306122448, | |
"grad_norm": 2.761817455291748, | |
"learning_rate": 9.98226649248857e-06, | |
"loss": 0.7932, | |
"step": 593 | |
}, | |
{ | |
"epoch": 0.01939591836734694, | |
"grad_norm": 2.845803737640381, | |
"learning_rate": 9.982233834095364e-06, | |
"loss": 0.8315, | |
"step": 594 | |
}, | |
{ | |
"epoch": 0.019428571428571427, | |
"grad_norm": 2.820237398147583, | |
"learning_rate": 9.982201175702157e-06, | |
"loss": 0.8115, | |
"step": 595 | |
}, | |
{ | |
"epoch": 0.01946122448979592, | |
"grad_norm": 2.8963451385498047, | |
"learning_rate": 9.982168517308948e-06, | |
"loss": 0.7965, | |
"step": 596 | |
}, | |
{ | |
"epoch": 0.01949387755102041, | |
"grad_norm": 2.8076157569885254, | |
"learning_rate": 9.982135858915742e-06, | |
"loss": 0.8205, | |
"step": 597 | |
}, | |
{ | |
"epoch": 0.019526530612244897, | |
"grad_norm": 2.9265244007110596, | |
"learning_rate": 9.982103200522535e-06, | |
"loss": 0.7975, | |
"step": 598 | |
}, | |
{ | |
"epoch": 0.01955918367346939, | |
"grad_norm": 3.044168472290039, | |
"learning_rate": 9.982070542129328e-06, | |
"loss": 0.7738, | |
"step": 599 | |
}, | |
{ | |
"epoch": 0.019591836734693877, | |
"grad_norm": 2.991462469100952, | |
"learning_rate": 9.982037883736121e-06, | |
"loss": 0.808, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.019591836734693877, | |
"eval_loss": 0.8216409087181091, | |
"eval_runtime": 96.6305, | |
"eval_samples_per_second": 1.035, | |
"eval_steps_per_second": 1.035, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.019624489795918368, | |
"grad_norm": 2.856177568435669, | |
"learning_rate": 9.982005225342915e-06, | |
"loss": 0.8002, | |
"step": 601 | |
}, | |
{ | |
"epoch": 0.019657142857142856, | |
"grad_norm": 2.881652355194092, | |
"learning_rate": 9.981972566949706e-06, | |
"loss": 0.8012, | |
"step": 602 | |
}, | |
{ | |
"epoch": 0.019689795918367347, | |
"grad_norm": 2.9178857803344727, | |
"learning_rate": 9.9819399085565e-06, | |
"loss": 0.7724, | |
"step": 603 | |
}, | |
{ | |
"epoch": 0.019722448979591838, | |
"grad_norm": 2.7211968898773193, | |
"learning_rate": 9.981907250163293e-06, | |
"loss": 0.798, | |
"step": 604 | |
}, | |
{ | |
"epoch": 0.019755102040816326, | |
"grad_norm": 3.050656795501709, | |
"learning_rate": 9.981874591770086e-06, | |
"loss": 0.8564, | |
"step": 605 | |
}, | |
{ | |
"epoch": 0.019787755102040817, | |
"grad_norm": 2.9224750995635986, | |
"learning_rate": 9.981841933376879e-06, | |
"loss": 0.8058, | |
"step": 606 | |
}, | |
{ | |
"epoch": 0.019820408163265305, | |
"grad_norm": 2.9750068187713623, | |
"learning_rate": 9.98180927498367e-06, | |
"loss": 0.7774, | |
"step": 607 | |
}, | |
{ | |
"epoch": 0.019853061224489797, | |
"grad_norm": 3.0843162536621094, | |
"learning_rate": 9.981776616590466e-06, | |
"loss": 0.8017, | |
"step": 608 | |
}, | |
{ | |
"epoch": 0.019885714285714284, | |
"grad_norm": 3.043372631072998, | |
"learning_rate": 9.981743958197257e-06, | |
"loss": 0.7763, | |
"step": 609 | |
}, | |
{ | |
"epoch": 0.019918367346938776, | |
"grad_norm": 2.8992819786071777, | |
"learning_rate": 9.98171129980405e-06, | |
"loss": 0.8483, | |
"step": 610 | |
}, | |
{ | |
"epoch": 0.019951020408163267, | |
"grad_norm": 2.5121912956237793, | |
"learning_rate": 9.981678641410844e-06, | |
"loss": 0.8014, | |
"step": 611 | |
}, | |
{ | |
"epoch": 0.019983673469387755, | |
"grad_norm": 2.8126091957092285, | |
"learning_rate": 9.981645983017637e-06, | |
"loss": 0.8, | |
"step": 612 | |
}, | |
{ | |
"epoch": 0.020016326530612246, | |
"grad_norm": 2.760281801223755, | |
"learning_rate": 9.98161332462443e-06, | |
"loss": 0.8089, | |
"step": 613 | |
}, | |
{ | |
"epoch": 0.020048979591836734, | |
"grad_norm": 2.75252103805542, | |
"learning_rate": 9.981580666231222e-06, | |
"loss": 0.7991, | |
"step": 614 | |
}, | |
{ | |
"epoch": 0.020081632653061225, | |
"grad_norm": 2.7450368404388428, | |
"learning_rate": 9.981548007838015e-06, | |
"loss": 0.7995, | |
"step": 615 | |
}, | |
{ | |
"epoch": 0.020114285714285713, | |
"grad_norm": 2.752061367034912, | |
"learning_rate": 9.981515349444808e-06, | |
"loss": 0.8167, | |
"step": 616 | |
}, | |
{ | |
"epoch": 0.020146938775510204, | |
"grad_norm": 2.882789134979248, | |
"learning_rate": 9.981482691051601e-06, | |
"loss": 0.8438, | |
"step": 617 | |
}, | |
{ | |
"epoch": 0.020179591836734692, | |
"grad_norm": 2.8994760513305664, | |
"learning_rate": 9.981450032658395e-06, | |
"loss": 0.7794, | |
"step": 618 | |
}, | |
{ | |
"epoch": 0.020212244897959183, | |
"grad_norm": 2.925473928451538, | |
"learning_rate": 9.981417374265186e-06, | |
"loss": 0.798, | |
"step": 619 | |
}, | |
{ | |
"epoch": 0.020244897959183675, | |
"grad_norm": 2.909290075302124, | |
"learning_rate": 9.98138471587198e-06, | |
"loss": 0.8487, | |
"step": 620 | |
}, | |
{ | |
"epoch": 0.020277551020408163, | |
"grad_norm": 3.004021644592285, | |
"learning_rate": 9.981352057478773e-06, | |
"loss": 0.8214, | |
"step": 621 | |
}, | |
{ | |
"epoch": 0.020310204081632654, | |
"grad_norm": 2.850106954574585, | |
"learning_rate": 9.981319399085566e-06, | |
"loss": 0.8158, | |
"step": 622 | |
}, | |
{ | |
"epoch": 0.02034285714285714, | |
"grad_norm": 2.903092622756958, | |
"learning_rate": 9.981286740692359e-06, | |
"loss": 0.8078, | |
"step": 623 | |
}, | |
{ | |
"epoch": 0.020375510204081633, | |
"grad_norm": 2.9145193099975586, | |
"learning_rate": 9.981254082299152e-06, | |
"loss": 0.8224, | |
"step": 624 | |
}, | |
{ | |
"epoch": 0.02040816326530612, | |
"grad_norm": 3.0473721027374268, | |
"learning_rate": 9.981221423905944e-06, | |
"loss": 0.7681, | |
"step": 625 | |
}, | |
{ | |
"epoch": 0.020440816326530612, | |
"grad_norm": 2.8615925312042236, | |
"learning_rate": 9.981188765512737e-06, | |
"loss": 0.815, | |
"step": 626 | |
}, | |
{ | |
"epoch": 0.020473469387755103, | |
"grad_norm": 2.8572824001312256, | |
"learning_rate": 9.98115610711953e-06, | |
"loss": 0.7672, | |
"step": 627 | |
}, | |
{ | |
"epoch": 0.02050612244897959, | |
"grad_norm": 2.9113521575927734, | |
"learning_rate": 9.981123448726324e-06, | |
"loss": 0.7952, | |
"step": 628 | |
}, | |
{ | |
"epoch": 0.020538775510204083, | |
"grad_norm": 3.0004873275756836, | |
"learning_rate": 9.981090790333117e-06, | |
"loss": 0.8181, | |
"step": 629 | |
}, | |
{ | |
"epoch": 0.02057142857142857, | |
"grad_norm": 2.9250986576080322, | |
"learning_rate": 9.981058131939908e-06, | |
"loss": 0.7681, | |
"step": 630 | |
}, | |
{ | |
"epoch": 0.02060408163265306, | |
"grad_norm": 3.1994357109069824, | |
"learning_rate": 9.981025473546703e-06, | |
"loss": 0.8292, | |
"step": 631 | |
}, | |
{ | |
"epoch": 0.02063673469387755, | |
"grad_norm": 3.061445713043213, | |
"learning_rate": 9.980992815153495e-06, | |
"loss": 0.8241, | |
"step": 632 | |
}, | |
{ | |
"epoch": 0.02066938775510204, | |
"grad_norm": 3.1536760330200195, | |
"learning_rate": 9.980960156760288e-06, | |
"loss": 0.7889, | |
"step": 633 | |
}, | |
{ | |
"epoch": 0.020702040816326532, | |
"grad_norm": 2.7358641624450684, | |
"learning_rate": 9.980927498367081e-06, | |
"loss": 0.8167, | |
"step": 634 | |
}, | |
{ | |
"epoch": 0.02073469387755102, | |
"grad_norm": 2.952697992324829, | |
"learning_rate": 9.980894839973874e-06, | |
"loss": 0.7741, | |
"step": 635 | |
}, | |
{ | |
"epoch": 0.02076734693877551, | |
"grad_norm": 3.003535747528076, | |
"learning_rate": 9.980862181580668e-06, | |
"loss": 0.8101, | |
"step": 636 | |
}, | |
{ | |
"epoch": 0.0208, | |
"grad_norm": 2.895209789276123, | |
"learning_rate": 9.98082952318746e-06, | |
"loss": 0.8025, | |
"step": 637 | |
}, | |
{ | |
"epoch": 0.02083265306122449, | |
"grad_norm": 2.974769115447998, | |
"learning_rate": 9.980796864794252e-06, | |
"loss": 0.8236, | |
"step": 638 | |
}, | |
{ | |
"epoch": 0.020865306122448978, | |
"grad_norm": 2.8859550952911377, | |
"learning_rate": 9.980764206401046e-06, | |
"loss": 0.8214, | |
"step": 639 | |
}, | |
{ | |
"epoch": 0.02089795918367347, | |
"grad_norm": 3.1916568279266357, | |
"learning_rate": 9.980731548007839e-06, | |
"loss": 0.7618, | |
"step": 640 | |
}, | |
{ | |
"epoch": 0.02093061224489796, | |
"grad_norm": 3.1790311336517334, | |
"learning_rate": 9.980698889614632e-06, | |
"loss": 0.779, | |
"step": 641 | |
}, | |
{ | |
"epoch": 0.02096326530612245, | |
"grad_norm": 9.082115173339844, | |
"learning_rate": 9.980666231221424e-06, | |
"loss": 0.7972, | |
"step": 642 | |
}, | |
{ | |
"epoch": 0.02099591836734694, | |
"grad_norm": 3.0865111351013184, | |
"learning_rate": 9.980633572828217e-06, | |
"loss": 0.7972, | |
"step": 643 | |
}, | |
{ | |
"epoch": 0.021028571428571428, | |
"grad_norm": 3.3228487968444824, | |
"learning_rate": 9.98060091443501e-06, | |
"loss": 0.8245, | |
"step": 644 | |
}, | |
{ | |
"epoch": 0.02106122448979592, | |
"grad_norm": 2.7116079330444336, | |
"learning_rate": 9.980568256041803e-06, | |
"loss": 0.8129, | |
"step": 645 | |
}, | |
{ | |
"epoch": 0.021093877551020407, | |
"grad_norm": 2.9123730659484863, | |
"learning_rate": 9.980535597648597e-06, | |
"loss": 0.832, | |
"step": 646 | |
}, | |
{ | |
"epoch": 0.021126530612244898, | |
"grad_norm": 3.1061103343963623, | |
"learning_rate": 9.98050293925539e-06, | |
"loss": 0.8067, | |
"step": 647 | |
}, | |
{ | |
"epoch": 0.02115918367346939, | |
"grad_norm": 3.2617485523223877, | |
"learning_rate": 9.980470280862181e-06, | |
"loss": 0.7897, | |
"step": 648 | |
}, | |
{ | |
"epoch": 0.021191836734693877, | |
"grad_norm": 3.0000083446502686, | |
"learning_rate": 9.980437622468976e-06, | |
"loss": 0.7663, | |
"step": 649 | |
}, | |
{ | |
"epoch": 0.02122448979591837, | |
"grad_norm": 3.029799461364746, | |
"learning_rate": 9.980404964075768e-06, | |
"loss": 0.7688, | |
"step": 650 | |
}, | |
{ | |
"epoch": 0.02122448979591837, | |
"eval_loss": 0.8194996118545532, | |
"eval_runtime": 74.2652, | |
"eval_samples_per_second": 1.347, | |
"eval_steps_per_second": 1.347, | |
"step": 650 | |
}, | |
{ | |
"epoch": 0.021257142857142856, | |
"grad_norm": 2.762091875076294, | |
"learning_rate": 9.980372305682561e-06, | |
"loss": 0.8051, | |
"step": 651 | |
}, | |
{ | |
"epoch": 0.021289795918367348, | |
"grad_norm": 3.030320167541504, | |
"learning_rate": 9.980339647289354e-06, | |
"loss": 0.8201, | |
"step": 652 | |
}, | |
{ | |
"epoch": 0.021322448979591836, | |
"grad_norm": 2.9875736236572266, | |
"learning_rate": 9.980306988896148e-06, | |
"loss": 0.7841, | |
"step": 653 | |
}, | |
{ | |
"epoch": 0.021355102040816327, | |
"grad_norm": 2.847587823867798, | |
"learning_rate": 9.980274330502941e-06, | |
"loss": 0.7807, | |
"step": 654 | |
}, | |
{ | |
"epoch": 0.021387755102040815, | |
"grad_norm": 3.0852084159851074, | |
"learning_rate": 9.980241672109732e-06, | |
"loss": 0.7606, | |
"step": 655 | |
}, | |
{ | |
"epoch": 0.021420408163265306, | |
"grad_norm": 3.159208297729492, | |
"learning_rate": 9.980209013716526e-06, | |
"loss": 0.8011, | |
"step": 656 | |
}, | |
{ | |
"epoch": 0.021453061224489797, | |
"grad_norm": 3.063185214996338, | |
"learning_rate": 9.980176355323319e-06, | |
"loss": 0.7839, | |
"step": 657 | |
}, | |
{ | |
"epoch": 0.021485714285714285, | |
"grad_norm": 2.9843311309814453, | |
"learning_rate": 9.980143696930112e-06, | |
"loss": 0.7961, | |
"step": 658 | |
}, | |
{ | |
"epoch": 0.021518367346938776, | |
"grad_norm": 3.0662293434143066, | |
"learning_rate": 9.980111038536905e-06, | |
"loss": 0.8065, | |
"step": 659 | |
}, | |
{ | |
"epoch": 0.021551020408163264, | |
"grad_norm": 2.9915835857391357, | |
"learning_rate": 9.980078380143697e-06, | |
"loss": 0.8022, | |
"step": 660 | |
}, | |
{ | |
"epoch": 0.021583673469387755, | |
"grad_norm": 2.7856531143188477, | |
"learning_rate": 9.98004572175049e-06, | |
"loss": 0.7853, | |
"step": 661 | |
}, | |
{ | |
"epoch": 0.021616326530612243, | |
"grad_norm": 2.8300893306732178, | |
"learning_rate": 9.980013063357283e-06, | |
"loss": 0.8097, | |
"step": 662 | |
}, | |
{ | |
"epoch": 0.021648979591836735, | |
"grad_norm": 2.7422966957092285, | |
"learning_rate": 9.979980404964077e-06, | |
"loss": 0.8307, | |
"step": 663 | |
}, | |
{ | |
"epoch": 0.021681632653061226, | |
"grad_norm": 2.80092453956604, | |
"learning_rate": 9.97994774657087e-06, | |
"loss": 0.7932, | |
"step": 664 | |
}, | |
{ | |
"epoch": 0.021714285714285714, | |
"grad_norm": 2.8894505500793457, | |
"learning_rate": 9.979915088177663e-06, | |
"loss": 0.8042, | |
"step": 665 | |
}, | |
{ | |
"epoch": 0.021746938775510205, | |
"grad_norm": 2.7492778301239014, | |
"learning_rate": 9.979882429784455e-06, | |
"loss": 0.7817, | |
"step": 666 | |
}, | |
{ | |
"epoch": 0.021779591836734693, | |
"grad_norm": 2.734226703643799, | |
"learning_rate": 9.979849771391248e-06, | |
"loss": 0.7984, | |
"step": 667 | |
}, | |
{ | |
"epoch": 0.021812244897959184, | |
"grad_norm": 2.7127978801727295, | |
"learning_rate": 9.979817112998041e-06, | |
"loss": 0.7955, | |
"step": 668 | |
}, | |
{ | |
"epoch": 0.021844897959183672, | |
"grad_norm": 2.881192207336426, | |
"learning_rate": 9.979784454604834e-06, | |
"loss": 0.8257, | |
"step": 669 | |
}, | |
{ | |
"epoch": 0.021877551020408163, | |
"grad_norm": 3.02278995513916, | |
"learning_rate": 9.979751796211628e-06, | |
"loss": 0.7892, | |
"step": 670 | |
}, | |
{ | |
"epoch": 0.021910204081632655, | |
"grad_norm": 3.157317876815796, | |
"learning_rate": 9.979719137818419e-06, | |
"loss": 0.8227, | |
"step": 671 | |
}, | |
{ | |
"epoch": 0.021942857142857142, | |
"grad_norm": 2.876741409301758, | |
"learning_rate": 9.979686479425214e-06, | |
"loss": 0.8116, | |
"step": 672 | |
}, | |
{ | |
"epoch": 0.021975510204081634, | |
"grad_norm": 3.1728057861328125, | |
"learning_rate": 9.979653821032006e-06, | |
"loss": 0.8207, | |
"step": 673 | |
}, | |
{ | |
"epoch": 0.02200816326530612, | |
"grad_norm": 3.2653119564056396, | |
"learning_rate": 9.979621162638799e-06, | |
"loss": 0.8087, | |
"step": 674 | |
}, | |
{ | |
"epoch": 0.022040816326530613, | |
"grad_norm": 3.0209734439849854, | |
"learning_rate": 9.979588504245592e-06, | |
"loss": 0.7958, | |
"step": 675 | |
}, | |
{ | |
"epoch": 0.0220734693877551, | |
"grad_norm": 2.9444637298583984, | |
"learning_rate": 9.979555845852385e-06, | |
"loss": 0.8031, | |
"step": 676 | |
}, | |
{ | |
"epoch": 0.022106122448979592, | |
"grad_norm": 2.8222014904022217, | |
"learning_rate": 9.979523187459179e-06, | |
"loss": 0.7842, | |
"step": 677 | |
}, | |
{ | |
"epoch": 0.022138775510204083, | |
"grad_norm": 3.034346342086792, | |
"learning_rate": 9.97949052906597e-06, | |
"loss": 0.7979, | |
"step": 678 | |
}, | |
{ | |
"epoch": 0.02217142857142857, | |
"grad_norm": 2.9454312324523926, | |
"learning_rate": 9.979457870672763e-06, | |
"loss": 0.8209, | |
"step": 679 | |
}, | |
{ | |
"epoch": 0.022204081632653062, | |
"grad_norm": 2.9699203968048096, | |
"learning_rate": 9.979425212279557e-06, | |
"loss": 0.7783, | |
"step": 680 | |
}, | |
{ | |
"epoch": 0.02223673469387755, | |
"grad_norm": 3.0586094856262207, | |
"learning_rate": 9.97939255388635e-06, | |
"loss": 0.8257, | |
"step": 681 | |
}, | |
{ | |
"epoch": 0.02226938775510204, | |
"grad_norm": 3.1194658279418945, | |
"learning_rate": 9.979359895493143e-06, | |
"loss": 0.7824, | |
"step": 682 | |
}, | |
{ | |
"epoch": 0.02230204081632653, | |
"grad_norm": 2.950078248977661, | |
"learning_rate": 9.979327237099935e-06, | |
"loss": 0.7866, | |
"step": 683 | |
}, | |
{ | |
"epoch": 0.02233469387755102, | |
"grad_norm": 3.135443925857544, | |
"learning_rate": 9.979294578706728e-06, | |
"loss": 0.786, | |
"step": 684 | |
}, | |
{ | |
"epoch": 0.02236734693877551, | |
"grad_norm": 3.1549785137176514, | |
"learning_rate": 9.979261920313521e-06, | |
"loss": 0.8063, | |
"step": 685 | |
}, | |
{ | |
"epoch": 0.0224, | |
"grad_norm": 3.064702033996582, | |
"learning_rate": 9.979229261920314e-06, | |
"loss": 0.7789, | |
"step": 686 | |
}, | |
{ | |
"epoch": 0.02243265306122449, | |
"grad_norm": 2.9797706604003906, | |
"learning_rate": 9.979196603527108e-06, | |
"loss": 0.8022, | |
"step": 687 | |
}, | |
{ | |
"epoch": 0.02246530612244898, | |
"grad_norm": 3.0248265266418457, | |
"learning_rate": 9.9791639451339e-06, | |
"loss": 0.7935, | |
"step": 688 | |
}, | |
{ | |
"epoch": 0.02249795918367347, | |
"grad_norm": 2.865000009536743, | |
"learning_rate": 9.979131286740692e-06, | |
"loss": 0.7539, | |
"step": 689 | |
}, | |
{ | |
"epoch": 0.022530612244897958, | |
"grad_norm": 2.7899985313415527, | |
"learning_rate": 9.979098628347487e-06, | |
"loss": 0.766, | |
"step": 690 | |
}, | |
{ | |
"epoch": 0.02256326530612245, | |
"grad_norm": 3.016523838043213, | |
"learning_rate": 9.979065969954279e-06, | |
"loss": 0.7904, | |
"step": 691 | |
}, | |
{ | |
"epoch": 0.022595918367346937, | |
"grad_norm": 2.954990863800049, | |
"learning_rate": 9.979033311561072e-06, | |
"loss": 0.8276, | |
"step": 692 | |
}, | |
{ | |
"epoch": 0.02262857142857143, | |
"grad_norm": 2.932527780532837, | |
"learning_rate": 9.979000653167865e-06, | |
"loss": 0.7614, | |
"step": 693 | |
}, | |
{ | |
"epoch": 0.02266122448979592, | |
"grad_norm": 2.892082691192627, | |
"learning_rate": 9.978967994774657e-06, | |
"loss": 0.8026, | |
"step": 694 | |
}, | |
{ | |
"epoch": 0.022693877551020408, | |
"grad_norm": 2.8288159370422363, | |
"learning_rate": 9.978935336381452e-06, | |
"loss": 0.7956, | |
"step": 695 | |
}, | |
{ | |
"epoch": 0.0227265306122449, | |
"grad_norm": 2.8160433769226074, | |
"learning_rate": 9.978902677988243e-06, | |
"loss": 0.7765, | |
"step": 696 | |
}, | |
{ | |
"epoch": 0.022759183673469387, | |
"grad_norm": 2.8383493423461914, | |
"learning_rate": 9.978870019595036e-06, | |
"loss": 0.7926, | |
"step": 697 | |
}, | |
{ | |
"epoch": 0.022791836734693878, | |
"grad_norm": 3.058093786239624, | |
"learning_rate": 9.97883736120183e-06, | |
"loss": 0.7878, | |
"step": 698 | |
}, | |
{ | |
"epoch": 0.022824489795918366, | |
"grad_norm": 3.0352556705474854, | |
"learning_rate": 9.978804702808623e-06, | |
"loss": 0.7885, | |
"step": 699 | |
}, | |
{ | |
"epoch": 0.022857142857142857, | |
"grad_norm": 2.9252421855926514, | |
"learning_rate": 9.978772044415416e-06, | |
"loss": 0.7759, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.022857142857142857, | |
"eval_loss": 0.8187767267227173, | |
"eval_runtime": 74.1789, | |
"eval_samples_per_second": 1.348, | |
"eval_steps_per_second": 1.348, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.02288979591836735, | |
"grad_norm": 2.860393762588501, | |
"learning_rate": 9.978739386022208e-06, | |
"loss": 0.8177, | |
"step": 701 | |
}, | |
{ | |
"epoch": 0.022922448979591836, | |
"grad_norm": 2.960848331451416, | |
"learning_rate": 9.978706727629001e-06, | |
"loss": 0.8298, | |
"step": 702 | |
}, | |
{ | |
"epoch": 0.022955102040816328, | |
"grad_norm": 2.8897740840911865, | |
"learning_rate": 9.978674069235794e-06, | |
"loss": 0.7625, | |
"step": 703 | |
}, | |
{ | |
"epoch": 0.022987755102040815, | |
"grad_norm": 3.0318431854248047, | |
"learning_rate": 9.978641410842587e-06, | |
"loss": 0.7759, | |
"step": 704 | |
}, | |
{ | |
"epoch": 0.023020408163265307, | |
"grad_norm": 2.959118366241455, | |
"learning_rate": 9.97860875244938e-06, | |
"loss": 0.7921, | |
"step": 705 | |
}, | |
{ | |
"epoch": 0.023053061224489795, | |
"grad_norm": 2.871290922164917, | |
"learning_rate": 9.978576094056174e-06, | |
"loss": 0.7943, | |
"step": 706 | |
}, | |
{ | |
"epoch": 0.023085714285714286, | |
"grad_norm": 2.699939489364624, | |
"learning_rate": 9.978543435662965e-06, | |
"loss": 0.7708, | |
"step": 707 | |
}, | |
{ | |
"epoch": 0.023118367346938777, | |
"grad_norm": 2.8150787353515625, | |
"learning_rate": 9.978510777269759e-06, | |
"loss": 0.7888, | |
"step": 708 | |
}, | |
{ | |
"epoch": 0.023151020408163265, | |
"grad_norm": 2.9636101722717285, | |
"learning_rate": 9.978478118876552e-06, | |
"loss": 0.7861, | |
"step": 709 | |
}, | |
{ | |
"epoch": 0.023183673469387756, | |
"grad_norm": 3.122624397277832, | |
"learning_rate": 9.978445460483345e-06, | |
"loss": 0.7967, | |
"step": 710 | |
}, | |
{ | |
"epoch": 0.023216326530612244, | |
"grad_norm": 3.070082902908325, | |
"learning_rate": 9.978412802090138e-06, | |
"loss": 0.793, | |
"step": 711 | |
}, | |
{ | |
"epoch": 0.023248979591836735, | |
"grad_norm": 3.063530206680298, | |
"learning_rate": 9.97838014369693e-06, | |
"loss": 0.8115, | |
"step": 712 | |
}, | |
{ | |
"epoch": 0.023281632653061223, | |
"grad_norm": 2.8320116996765137, | |
"learning_rate": 9.978347485303725e-06, | |
"loss": 0.831, | |
"step": 713 | |
}, | |
{ | |
"epoch": 0.023314285714285714, | |
"grad_norm": 2.9792158603668213, | |
"learning_rate": 9.978314826910516e-06, | |
"loss": 0.745, | |
"step": 714 | |
}, | |
{ | |
"epoch": 0.023346938775510206, | |
"grad_norm": 2.9203131198883057, | |
"learning_rate": 9.97828216851731e-06, | |
"loss": 0.7912, | |
"step": 715 | |
}, | |
{ | |
"epoch": 0.023379591836734694, | |
"grad_norm": 3.151167631149292, | |
"learning_rate": 9.978249510124103e-06, | |
"loss": 0.7687, | |
"step": 716 | |
}, | |
{ | |
"epoch": 0.023412244897959185, | |
"grad_norm": 2.8722281455993652, | |
"learning_rate": 9.978216851730896e-06, | |
"loss": 0.7975, | |
"step": 717 | |
}, | |
{ | |
"epoch": 0.023444897959183673, | |
"grad_norm": 2.989642858505249, | |
"learning_rate": 9.97818419333769e-06, | |
"loss": 0.8136, | |
"step": 718 | |
}, | |
{ | |
"epoch": 0.023477551020408164, | |
"grad_norm": 3.2293035984039307, | |
"learning_rate": 9.978151534944481e-06, | |
"loss": 0.8469, | |
"step": 719 | |
}, | |
{ | |
"epoch": 0.023510204081632652, | |
"grad_norm": 3.016899585723877, | |
"learning_rate": 9.978118876551274e-06, | |
"loss": 0.7901, | |
"step": 720 | |
}, | |
{ | |
"epoch": 0.023542857142857143, | |
"grad_norm": 3.194211006164551, | |
"learning_rate": 9.978086218158067e-06, | |
"loss": 0.8112, | |
"step": 721 | |
}, | |
{ | |
"epoch": 0.02357551020408163, | |
"grad_norm": 2.810654640197754, | |
"learning_rate": 9.97805355976486e-06, | |
"loss": 0.7878, | |
"step": 722 | |
}, | |
{ | |
"epoch": 0.023608163265306122, | |
"grad_norm": 2.788594961166382, | |
"learning_rate": 9.978020901371654e-06, | |
"loss": 0.8245, | |
"step": 723 | |
}, | |
{ | |
"epoch": 0.023640816326530614, | |
"grad_norm": 2.976698875427246, | |
"learning_rate": 9.977988242978445e-06, | |
"loss": 0.7869, | |
"step": 724 | |
}, | |
{ | |
"epoch": 0.0236734693877551, | |
"grad_norm": 2.7467331886291504, | |
"learning_rate": 9.977955584585239e-06, | |
"loss": 0.7837, | |
"step": 725 | |
}, | |
{ | |
"epoch": 0.023706122448979593, | |
"grad_norm": 2.8149194717407227, | |
"learning_rate": 9.977922926192032e-06, | |
"loss": 0.7713, | |
"step": 726 | |
}, | |
{ | |
"epoch": 0.02373877551020408, | |
"grad_norm": 3.07743501663208, | |
"learning_rate": 9.977890267798825e-06, | |
"loss": 0.7519, | |
"step": 727 | |
}, | |
{ | |
"epoch": 0.023771428571428572, | |
"grad_norm": 2.779167413711548, | |
"learning_rate": 9.977857609405618e-06, | |
"loss": 0.8015, | |
"step": 728 | |
}, | |
{ | |
"epoch": 0.02380408163265306, | |
"grad_norm": 2.861452579498291, | |
"learning_rate": 9.977824951012412e-06, | |
"loss": 0.8097, | |
"step": 729 | |
}, | |
{ | |
"epoch": 0.02383673469387755, | |
"grad_norm": 2.9489521980285645, | |
"learning_rate": 9.977792292619203e-06, | |
"loss": 0.7988, | |
"step": 730 | |
}, | |
{ | |
"epoch": 0.023869387755102042, | |
"grad_norm": 2.7786002159118652, | |
"learning_rate": 9.977759634225996e-06, | |
"loss": 0.7828, | |
"step": 731 | |
}, | |
{ | |
"epoch": 0.02390204081632653, | |
"grad_norm": 2.9412615299224854, | |
"learning_rate": 9.97772697583279e-06, | |
"loss": 0.7684, | |
"step": 732 | |
}, | |
{ | |
"epoch": 0.02393469387755102, | |
"grad_norm": 2.9088659286499023, | |
"learning_rate": 9.977694317439583e-06, | |
"loss": 0.7899, | |
"step": 733 | |
}, | |
{ | |
"epoch": 0.02396734693877551, | |
"grad_norm": 2.811553955078125, | |
"learning_rate": 9.977661659046376e-06, | |
"loss": 0.7705, | |
"step": 734 | |
}, | |
{ | |
"epoch": 0.024, | |
"grad_norm": 2.7078611850738525, | |
"learning_rate": 9.977629000653168e-06, | |
"loss": 0.7803, | |
"step": 735 | |
}, | |
{ | |
"epoch": 0.02403265306122449, | |
"grad_norm": 2.9198622703552246, | |
"learning_rate": 9.977596342259963e-06, | |
"loss": 0.794, | |
"step": 736 | |
}, | |
{ | |
"epoch": 0.02406530612244898, | |
"grad_norm": 3.1856372356414795, | |
"learning_rate": 9.977563683866754e-06, | |
"loss": 0.8146, | |
"step": 737 | |
}, | |
{ | |
"epoch": 0.02409795918367347, | |
"grad_norm": 2.914483070373535, | |
"learning_rate": 9.977531025473547e-06, | |
"loss": 0.813, | |
"step": 738 | |
}, | |
{ | |
"epoch": 0.02413061224489796, | |
"grad_norm": 2.837502956390381, | |
"learning_rate": 9.97749836708034e-06, | |
"loss": 0.7807, | |
"step": 739 | |
}, | |
{ | |
"epoch": 0.02416326530612245, | |
"grad_norm": 2.7452003955841064, | |
"learning_rate": 9.977465708687134e-06, | |
"loss": 0.8272, | |
"step": 740 | |
}, | |
{ | |
"epoch": 0.024195918367346938, | |
"grad_norm": 2.922658681869507, | |
"learning_rate": 9.977433050293927e-06, | |
"loss": 0.8096, | |
"step": 741 | |
}, | |
{ | |
"epoch": 0.02422857142857143, | |
"grad_norm": 2.9923369884490967, | |
"learning_rate": 9.977400391900719e-06, | |
"loss": 0.8232, | |
"step": 742 | |
}, | |
{ | |
"epoch": 0.024261224489795917, | |
"grad_norm": 2.879516363143921, | |
"learning_rate": 9.977367733507512e-06, | |
"loss": 0.8128, | |
"step": 743 | |
}, | |
{ | |
"epoch": 0.02429387755102041, | |
"grad_norm": 2.7809741497039795, | |
"learning_rate": 9.977335075114305e-06, | |
"loss": 0.7824, | |
"step": 744 | |
}, | |
{ | |
"epoch": 0.0243265306122449, | |
"grad_norm": 2.8293521404266357, | |
"learning_rate": 9.977302416721098e-06, | |
"loss": 0.7637, | |
"step": 745 | |
}, | |
{ | |
"epoch": 0.024359183673469387, | |
"grad_norm": 2.900975227355957, | |
"learning_rate": 9.977269758327891e-06, | |
"loss": 0.8065, | |
"step": 746 | |
}, | |
{ | |
"epoch": 0.02439183673469388, | |
"grad_norm": 2.8670005798339844, | |
"learning_rate": 9.977237099934683e-06, | |
"loss": 0.7747, | |
"step": 747 | |
}, | |
{ | |
"epoch": 0.024424489795918367, | |
"grad_norm": 2.987907886505127, | |
"learning_rate": 9.977204441541476e-06, | |
"loss": 0.7907, | |
"step": 748 | |
}, | |
{ | |
"epoch": 0.024457142857142858, | |
"grad_norm": 2.8523452281951904, | |
"learning_rate": 9.97717178314827e-06, | |
"loss": 0.781, | |
"step": 749 | |
}, | |
{ | |
"epoch": 0.024489795918367346, | |
"grad_norm": 2.9037506580352783, | |
"learning_rate": 9.977139124755063e-06, | |
"loss": 0.7505, | |
"step": 750 | |
}, | |
{ | |
"epoch": 0.024489795918367346, | |
"eval_loss": 0.8159348964691162, | |
"eval_runtime": 74.1956, | |
"eval_samples_per_second": 1.348, | |
"eval_steps_per_second": 1.348, | |
"step": 750 | |
}, | |
{ | |
"epoch": 0.024522448979591837, | |
"grad_norm": 2.8492870330810547, | |
"learning_rate": 9.977106466361856e-06, | |
"loss": 0.8186, | |
"step": 751 | |
}, | |
{ | |
"epoch": 0.024555102040816325, | |
"grad_norm": 3.0403406620025635, | |
"learning_rate": 9.97707380796865e-06, | |
"loss": 0.8152, | |
"step": 752 | |
}, | |
{ | |
"epoch": 0.024587755102040816, | |
"grad_norm": 2.8419723510742188, | |
"learning_rate": 9.97704114957544e-06, | |
"loss": 0.7908, | |
"step": 753 | |
}, | |
{ | |
"epoch": 0.024620408163265307, | |
"grad_norm": 3.0176303386688232, | |
"learning_rate": 9.977008491182236e-06, | |
"loss": 0.7921, | |
"step": 754 | |
}, | |
{ | |
"epoch": 0.024653061224489795, | |
"grad_norm": 2.928659439086914, | |
"learning_rate": 9.976975832789027e-06, | |
"loss": 0.7784, | |
"step": 755 | |
}, | |
{ | |
"epoch": 0.024685714285714287, | |
"grad_norm": 2.800809621810913, | |
"learning_rate": 9.97694317439582e-06, | |
"loss": 0.7362, | |
"step": 756 | |
}, | |
{ | |
"epoch": 0.024718367346938774, | |
"grad_norm": 2.698071002960205, | |
"learning_rate": 9.976910516002614e-06, | |
"loss": 0.8046, | |
"step": 757 | |
}, | |
{ | |
"epoch": 0.024751020408163266, | |
"grad_norm": 2.8206839561462402, | |
"learning_rate": 9.976877857609407e-06, | |
"loss": 0.8169, | |
"step": 758 | |
}, | |
{ | |
"epoch": 0.024783673469387753, | |
"grad_norm": 2.927217960357666, | |
"learning_rate": 9.9768451992162e-06, | |
"loss": 0.7816, | |
"step": 759 | |
}, | |
{ | |
"epoch": 0.024816326530612245, | |
"grad_norm": 2.9845449924468994, | |
"learning_rate": 9.976812540822992e-06, | |
"loss": 0.801, | |
"step": 760 | |
}, | |
{ | |
"epoch": 0.024848979591836736, | |
"grad_norm": 2.881765604019165, | |
"learning_rate": 9.976779882429785e-06, | |
"loss": 0.8052, | |
"step": 761 | |
}, | |
{ | |
"epoch": 0.024881632653061224, | |
"grad_norm": 2.9570679664611816, | |
"learning_rate": 9.976747224036578e-06, | |
"loss": 0.8044, | |
"step": 762 | |
}, | |
{ | |
"epoch": 0.024914285714285715, | |
"grad_norm": 3.069812774658203, | |
"learning_rate": 9.976714565643371e-06, | |
"loss": 0.8166, | |
"step": 763 | |
}, | |
{ | |
"epoch": 0.024946938775510203, | |
"grad_norm": 3.0275254249572754, | |
"learning_rate": 9.976681907250165e-06, | |
"loss": 0.7835, | |
"step": 764 | |
}, | |
{ | |
"epoch": 0.024979591836734694, | |
"grad_norm": 3.007643461227417, | |
"learning_rate": 9.976649248856956e-06, | |
"loss": 0.8088, | |
"step": 765 | |
}, | |
{ | |
"epoch": 0.025012244897959182, | |
"grad_norm": 2.788733720779419, | |
"learning_rate": 9.97661659046375e-06, | |
"loss": 0.7852, | |
"step": 766 | |
}, | |
{ | |
"epoch": 0.025044897959183673, | |
"grad_norm": 3.187893867492676, | |
"learning_rate": 9.976583932070543e-06, | |
"loss": 0.7887, | |
"step": 767 | |
}, | |
{ | |
"epoch": 0.025077551020408165, | |
"grad_norm": 3.0475502014160156, | |
"learning_rate": 9.976551273677336e-06, | |
"loss": 0.7837, | |
"step": 768 | |
}, | |
{ | |
"epoch": 0.025110204081632653, | |
"grad_norm": 2.9664242267608643, | |
"learning_rate": 9.976518615284129e-06, | |
"loss": 0.8021, | |
"step": 769 | |
}, | |
{ | |
"epoch": 0.025142857142857144, | |
"grad_norm": 3.1118979454040527, | |
"learning_rate": 9.976485956890922e-06, | |
"loss": 0.7992, | |
"step": 770 | |
}, | |
{ | |
"epoch": 0.02517551020408163, | |
"grad_norm": 2.909301280975342, | |
"learning_rate": 9.976453298497714e-06, | |
"loss": 0.743, | |
"step": 771 | |
}, | |
{ | |
"epoch": 0.025208163265306123, | |
"grad_norm": 2.9557557106018066, | |
"learning_rate": 9.976420640104507e-06, | |
"loss": 0.8007, | |
"step": 772 | |
}, | |
{ | |
"epoch": 0.02524081632653061, | |
"grad_norm": 3.108922243118286, | |
"learning_rate": 9.9763879817113e-06, | |
"loss": 0.8127, | |
"step": 773 | |
}, | |
{ | |
"epoch": 0.025273469387755102, | |
"grad_norm": 2.927797555923462, | |
"learning_rate": 9.976355323318094e-06, | |
"loss": 0.7577, | |
"step": 774 | |
}, | |
{ | |
"epoch": 0.025306122448979593, | |
"grad_norm": 2.929468870162964, | |
"learning_rate": 9.976322664924887e-06, | |
"loss": 0.7662, | |
"step": 775 | |
}, | |
{ | |
"epoch": 0.02533877551020408, | |
"grad_norm": 3.0343034267425537, | |
"learning_rate": 9.976290006531678e-06, | |
"loss": 0.8283, | |
"step": 776 | |
}, | |
{ | |
"epoch": 0.025371428571428573, | |
"grad_norm": 3.136800765991211, | |
"learning_rate": 9.976257348138473e-06, | |
"loss": 0.7935, | |
"step": 777 | |
}, | |
{ | |
"epoch": 0.02540408163265306, | |
"grad_norm": 2.916213035583496, | |
"learning_rate": 9.976224689745265e-06, | |
"loss": 0.7651, | |
"step": 778 | |
}, | |
{ | |
"epoch": 0.02543673469387755, | |
"grad_norm": 2.929622173309326, | |
"learning_rate": 9.976192031352058e-06, | |
"loss": 0.7677, | |
"step": 779 | |
}, | |
{ | |
"epoch": 0.02546938775510204, | |
"grad_norm": 3.2419509887695312, | |
"learning_rate": 9.976159372958851e-06, | |
"loss": 0.7789, | |
"step": 780 | |
}, | |
{ | |
"epoch": 0.02550204081632653, | |
"grad_norm": 2.8888614177703857, | |
"learning_rate": 9.976126714565645e-06, | |
"loss": 0.7735, | |
"step": 781 | |
}, | |
{ | |
"epoch": 0.025534693877551022, | |
"grad_norm": 3.1505699157714844, | |
"learning_rate": 9.976094056172438e-06, | |
"loss": 0.8128, | |
"step": 782 | |
}, | |
{ | |
"epoch": 0.02556734693877551, | |
"grad_norm": 3.030273675918579, | |
"learning_rate": 9.97606139777923e-06, | |
"loss": 0.7678, | |
"step": 783 | |
}, | |
{ | |
"epoch": 0.0256, | |
"grad_norm": 3.0003929138183594, | |
"learning_rate": 9.976028739386023e-06, | |
"loss": 0.8076, | |
"step": 784 | |
}, | |
{ | |
"epoch": 0.02563265306122449, | |
"grad_norm": 2.957667350769043, | |
"learning_rate": 9.975996080992816e-06, | |
"loss": 0.7886, | |
"step": 785 | |
}, | |
{ | |
"epoch": 0.02566530612244898, | |
"grad_norm": 3.0816612243652344, | |
"learning_rate": 9.975963422599609e-06, | |
"loss": 0.7871, | |
"step": 786 | |
}, | |
{ | |
"epoch": 0.025697959183673468, | |
"grad_norm": 2.947930097579956, | |
"learning_rate": 9.975930764206402e-06, | |
"loss": 0.8288, | |
"step": 787 | |
}, | |
{ | |
"epoch": 0.02573061224489796, | |
"grad_norm": 3.121185302734375, | |
"learning_rate": 9.975898105813194e-06, | |
"loss": 0.793, | |
"step": 788 | |
}, | |
{ | |
"epoch": 0.025763265306122447, | |
"grad_norm": 3.0452706813812256, | |
"learning_rate": 9.975865447419987e-06, | |
"loss": 0.7819, | |
"step": 789 | |
}, | |
{ | |
"epoch": 0.02579591836734694, | |
"grad_norm": 2.903444528579712, | |
"learning_rate": 9.97583278902678e-06, | |
"loss": 0.798, | |
"step": 790 | |
}, | |
{ | |
"epoch": 0.02582857142857143, | |
"grad_norm": 3.088996410369873, | |
"learning_rate": 9.975800130633574e-06, | |
"loss": 0.8171, | |
"step": 791 | |
}, | |
{ | |
"epoch": 0.025861224489795918, | |
"grad_norm": 2.957293748855591, | |
"learning_rate": 9.975767472240367e-06, | |
"loss": 0.7862, | |
"step": 792 | |
}, | |
{ | |
"epoch": 0.02589387755102041, | |
"grad_norm": 2.7569499015808105, | |
"learning_rate": 9.97573481384716e-06, | |
"loss": 0.7899, | |
"step": 793 | |
}, | |
{ | |
"epoch": 0.025926530612244897, | |
"grad_norm": 2.931257724761963, | |
"learning_rate": 9.975702155453952e-06, | |
"loss": 0.8051, | |
"step": 794 | |
}, | |
{ | |
"epoch": 0.025959183673469388, | |
"grad_norm": 2.9962334632873535, | |
"learning_rate": 9.975669497060746e-06, | |
"loss": 0.7914, | |
"step": 795 | |
}, | |
{ | |
"epoch": 0.025991836734693876, | |
"grad_norm": 3.016993761062622, | |
"learning_rate": 9.975636838667538e-06, | |
"loss": 0.8011, | |
"step": 796 | |
}, | |
{ | |
"epoch": 0.026024489795918367, | |
"grad_norm": 3.066710948944092, | |
"learning_rate": 9.975604180274331e-06, | |
"loss": 0.7885, | |
"step": 797 | |
}, | |
{ | |
"epoch": 0.02605714285714286, | |
"grad_norm": 2.9981632232666016, | |
"learning_rate": 9.975571521881125e-06, | |
"loss": 0.7968, | |
"step": 798 | |
}, | |
{ | |
"epoch": 0.026089795918367346, | |
"grad_norm": 2.971733331680298, | |
"learning_rate": 9.975538863487916e-06, | |
"loss": 0.7997, | |
"step": 799 | |
}, | |
{ | |
"epoch": 0.026122448979591838, | |
"grad_norm": 3.0647032260894775, | |
"learning_rate": 9.975506205094711e-06, | |
"loss": 0.8122, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.026122448979591838, | |
"eval_loss": 0.8141899108886719, | |
"eval_runtime": 73.6209, | |
"eval_samples_per_second": 1.358, | |
"eval_steps_per_second": 1.358, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.026155102040816326, | |
"grad_norm": 3.015854597091675, | |
"learning_rate": 9.975473546701503e-06, | |
"loss": 0.8062, | |
"step": 801 | |
}, | |
{ | |
"epoch": 0.026187755102040817, | |
"grad_norm": 3.065284013748169, | |
"learning_rate": 9.975440888308296e-06, | |
"loss": 0.7973, | |
"step": 802 | |
}, | |
{ | |
"epoch": 0.026220408163265305, | |
"grad_norm": 2.8390414714813232, | |
"learning_rate": 9.975408229915089e-06, | |
"loss": 0.7943, | |
"step": 803 | |
}, | |
{ | |
"epoch": 0.026253061224489796, | |
"grad_norm": 3.254967451095581, | |
"learning_rate": 9.975375571521882e-06, | |
"loss": 0.8059, | |
"step": 804 | |
}, | |
{ | |
"epoch": 0.026285714285714287, | |
"grad_norm": 3.0705697536468506, | |
"learning_rate": 9.975342913128675e-06, | |
"loss": 0.7871, | |
"step": 805 | |
}, | |
{ | |
"epoch": 0.026318367346938775, | |
"grad_norm": 2.900188446044922, | |
"learning_rate": 9.975310254735467e-06, | |
"loss": 0.7672, | |
"step": 806 | |
}, | |
{ | |
"epoch": 0.026351020408163266, | |
"grad_norm": 2.8207056522369385, | |
"learning_rate": 9.97527759634226e-06, | |
"loss": 0.7921, | |
"step": 807 | |
}, | |
{ | |
"epoch": 0.026383673469387754, | |
"grad_norm": 2.891328811645508, | |
"learning_rate": 9.975244937949053e-06, | |
"loss": 0.8177, | |
"step": 808 | |
}, | |
{ | |
"epoch": 0.026416326530612246, | |
"grad_norm": 2.974545955657959, | |
"learning_rate": 9.975212279555847e-06, | |
"loss": 0.7773, | |
"step": 809 | |
}, | |
{ | |
"epoch": 0.026448979591836733, | |
"grad_norm": 2.8445703983306885, | |
"learning_rate": 9.97517962116264e-06, | |
"loss": 0.8131, | |
"step": 810 | |
}, | |
{ | |
"epoch": 0.026481632653061225, | |
"grad_norm": 2.93383526802063, | |
"learning_rate": 9.975146962769433e-06, | |
"loss": 0.7693, | |
"step": 811 | |
}, | |
{ | |
"epoch": 0.026514285714285716, | |
"grad_norm": 3.0080511569976807, | |
"learning_rate": 9.975114304376225e-06, | |
"loss": 0.7593, | |
"step": 812 | |
}, | |
{ | |
"epoch": 0.026546938775510204, | |
"grad_norm": 2.967305898666382, | |
"learning_rate": 9.975081645983018e-06, | |
"loss": 0.7773, | |
"step": 813 | |
}, | |
{ | |
"epoch": 0.026579591836734695, | |
"grad_norm": 2.9201626777648926, | |
"learning_rate": 9.975048987589811e-06, | |
"loss": 0.7908, | |
"step": 814 | |
}, | |
{ | |
"epoch": 0.026612244897959183, | |
"grad_norm": 3.0869758129119873, | |
"learning_rate": 9.975016329196604e-06, | |
"loss": 0.8015, | |
"step": 815 | |
}, | |
{ | |
"epoch": 0.026644897959183674, | |
"grad_norm": 3.120110511779785, | |
"learning_rate": 9.974983670803398e-06, | |
"loss": 0.7993, | |
"step": 816 | |
}, | |
{ | |
"epoch": 0.026677551020408162, | |
"grad_norm": 3.1260945796966553, | |
"learning_rate": 9.97495101241019e-06, | |
"loss": 0.8407, | |
"step": 817 | |
}, | |
{ | |
"epoch": 0.026710204081632653, | |
"grad_norm": 2.858689785003662, | |
"learning_rate": 9.974918354016984e-06, | |
"loss": 0.8001, | |
"step": 818 | |
}, | |
{ | |
"epoch": 0.02674285714285714, | |
"grad_norm": 2.854126214981079, | |
"learning_rate": 9.974885695623776e-06, | |
"loss": 0.754, | |
"step": 819 | |
}, | |
{ | |
"epoch": 0.026775510204081632, | |
"grad_norm": 2.932650566101074, | |
"learning_rate": 9.974853037230569e-06, | |
"loss": 0.8051, | |
"step": 820 | |
}, | |
{ | |
"epoch": 0.026808163265306124, | |
"grad_norm": 3.063242197036743, | |
"learning_rate": 9.974820378837362e-06, | |
"loss": 0.8114, | |
"step": 821 | |
}, | |
{ | |
"epoch": 0.02684081632653061, | |
"grad_norm": 2.8537795543670654, | |
"learning_rate": 9.974787720444155e-06, | |
"loss": 0.7842, | |
"step": 822 | |
}, | |
{ | |
"epoch": 0.026873469387755103, | |
"grad_norm": 2.986083745956421, | |
"learning_rate": 9.974755062050949e-06, | |
"loss": 0.7949, | |
"step": 823 | |
}, | |
{ | |
"epoch": 0.02690612244897959, | |
"grad_norm": 3.0178754329681396, | |
"learning_rate": 9.97472240365774e-06, | |
"loss": 0.8005, | |
"step": 824 | |
}, | |
{ | |
"epoch": 0.026938775510204082, | |
"grad_norm": 3.1746835708618164, | |
"learning_rate": 9.974689745264533e-06, | |
"loss": 0.7994, | |
"step": 825 | |
}, | |
{ | |
"epoch": 0.02697142857142857, | |
"grad_norm": 2.8827731609344482, | |
"learning_rate": 9.974657086871327e-06, | |
"loss": 0.7764, | |
"step": 826 | |
}, | |
{ | |
"epoch": 0.02700408163265306, | |
"grad_norm": 2.7529618740081787, | |
"learning_rate": 9.97462442847812e-06, | |
"loss": 0.8092, | |
"step": 827 | |
}, | |
{ | |
"epoch": 0.027036734693877552, | |
"grad_norm": 2.8854501247406006, | |
"learning_rate": 9.974591770084913e-06, | |
"loss": 0.7713, | |
"step": 828 | |
}, | |
{ | |
"epoch": 0.02706938775510204, | |
"grad_norm": 2.8492770195007324, | |
"learning_rate": 9.974559111691705e-06, | |
"loss": 0.75, | |
"step": 829 | |
}, | |
{ | |
"epoch": 0.02710204081632653, | |
"grad_norm": 2.9942445755004883, | |
"learning_rate": 9.974526453298498e-06, | |
"loss": 0.7838, | |
"step": 830 | |
}, | |
{ | |
"epoch": 0.02713469387755102, | |
"grad_norm": 2.9380862712860107, | |
"learning_rate": 9.974493794905291e-06, | |
"loss": 0.8111, | |
"step": 831 | |
}, | |
{ | |
"epoch": 0.02716734693877551, | |
"grad_norm": 2.8200275897979736, | |
"learning_rate": 9.974461136512084e-06, | |
"loss": 0.7818, | |
"step": 832 | |
}, | |
{ | |
"epoch": 0.0272, | |
"grad_norm": 3.0098073482513428, | |
"learning_rate": 9.974428478118878e-06, | |
"loss": 0.7786, | |
"step": 833 | |
}, | |
{ | |
"epoch": 0.02723265306122449, | |
"grad_norm": 2.96885347366333, | |
"learning_rate": 9.97439581972567e-06, | |
"loss": 0.762, | |
"step": 834 | |
}, | |
{ | |
"epoch": 0.02726530612244898, | |
"grad_norm": 2.9910387992858887, | |
"learning_rate": 9.974363161332462e-06, | |
"loss": 0.8142, | |
"step": 835 | |
}, | |
{ | |
"epoch": 0.02729795918367347, | |
"grad_norm": 3.1752495765686035, | |
"learning_rate": 9.974330502939257e-06, | |
"loss": 0.8031, | |
"step": 836 | |
}, | |
{ | |
"epoch": 0.02733061224489796, | |
"grad_norm": 2.9318926334381104, | |
"learning_rate": 9.974297844546049e-06, | |
"loss": 0.7873, | |
"step": 837 | |
}, | |
{ | |
"epoch": 0.027363265306122448, | |
"grad_norm": 2.8536527156829834, | |
"learning_rate": 9.974265186152842e-06, | |
"loss": 0.7878, | |
"step": 838 | |
}, | |
{ | |
"epoch": 0.02739591836734694, | |
"grad_norm": 2.7921385765075684, | |
"learning_rate": 9.974232527759635e-06, | |
"loss": 0.8042, | |
"step": 839 | |
}, | |
{ | |
"epoch": 0.027428571428571427, | |
"grad_norm": 2.97587251663208, | |
"learning_rate": 9.974199869366427e-06, | |
"loss": 0.8044, | |
"step": 840 | |
}, | |
{ | |
"epoch": 0.02746122448979592, | |
"grad_norm": 2.879415273666382, | |
"learning_rate": 9.974167210973222e-06, | |
"loss": 0.7902, | |
"step": 841 | |
}, | |
{ | |
"epoch": 0.02749387755102041, | |
"grad_norm": 2.930534839630127, | |
"learning_rate": 9.974134552580013e-06, | |
"loss": 0.7809, | |
"step": 842 | |
}, | |
{ | |
"epoch": 0.027526530612244898, | |
"grad_norm": 2.8264150619506836, | |
"learning_rate": 9.974101894186807e-06, | |
"loss": 0.7565, | |
"step": 843 | |
}, | |
{ | |
"epoch": 0.02755918367346939, | |
"grad_norm": 2.879453182220459, | |
"learning_rate": 9.9740692357936e-06, | |
"loss": 0.8139, | |
"step": 844 | |
}, | |
{ | |
"epoch": 0.027591836734693877, | |
"grad_norm": 2.805103063583374, | |
"learning_rate": 9.974036577400393e-06, | |
"loss": 0.798, | |
"step": 845 | |
}, | |
{ | |
"epoch": 0.027624489795918368, | |
"grad_norm": 2.8710720539093018, | |
"learning_rate": 9.974003919007186e-06, | |
"loss": 0.7412, | |
"step": 846 | |
}, | |
{ | |
"epoch": 0.027657142857142856, | |
"grad_norm": 2.9521493911743164, | |
"learning_rate": 9.973971260613978e-06, | |
"loss": 0.7943, | |
"step": 847 | |
}, | |
{ | |
"epoch": 0.027689795918367347, | |
"grad_norm": 3.062432050704956, | |
"learning_rate": 9.973938602220771e-06, | |
"loss": 0.7884, | |
"step": 848 | |
}, | |
{ | |
"epoch": 0.02772244897959184, | |
"grad_norm": 2.98431658744812, | |
"learning_rate": 9.973905943827564e-06, | |
"loss": 0.7643, | |
"step": 849 | |
}, | |
{ | |
"epoch": 0.027755102040816326, | |
"grad_norm": 2.852004289627075, | |
"learning_rate": 9.973873285434358e-06, | |
"loss": 0.8126, | |
"step": 850 | |
}, | |
{ | |
"epoch": 0.027755102040816326, | |
"eval_loss": 0.8143028020858765, | |
"eval_runtime": 85.3794, | |
"eval_samples_per_second": 1.171, | |
"eval_steps_per_second": 1.171, | |
"step": 850 | |
}, | |
{ | |
"epoch": 0.027787755102040818, | |
"grad_norm": 3.2032933235168457, | |
"learning_rate": 9.97384062704115e-06, | |
"loss": 0.7847, | |
"step": 851 | |
}, | |
{ | |
"epoch": 0.027820408163265305, | |
"grad_norm": 3.0217716693878174, | |
"learning_rate": 9.973807968647944e-06, | |
"loss": 0.7812, | |
"step": 852 | |
}, | |
{ | |
"epoch": 0.027853061224489797, | |
"grad_norm": 2.961899995803833, | |
"learning_rate": 9.973775310254736e-06, | |
"loss": 0.7917, | |
"step": 853 | |
}, | |
{ | |
"epoch": 0.027885714285714285, | |
"grad_norm": 2.9481585025787354, | |
"learning_rate": 9.973742651861529e-06, | |
"loss": 0.7866, | |
"step": 854 | |
}, | |
{ | |
"epoch": 0.027918367346938776, | |
"grad_norm": 2.8054749965667725, | |
"learning_rate": 9.973709993468322e-06, | |
"loss": 0.7808, | |
"step": 855 | |
}, | |
{ | |
"epoch": 0.027951020408163264, | |
"grad_norm": 3.106367349624634, | |
"learning_rate": 9.973677335075115e-06, | |
"loss": 0.8281, | |
"step": 856 | |
}, | |
{ | |
"epoch": 0.027983673469387755, | |
"grad_norm": 3.7496819496154785, | |
"learning_rate": 9.973644676681908e-06, | |
"loss": 0.7851, | |
"step": 857 | |
}, | |
{ | |
"epoch": 0.028016326530612246, | |
"grad_norm": 2.7985293865203857, | |
"learning_rate": 9.9736120182887e-06, | |
"loss": 0.7843, | |
"step": 858 | |
}, | |
{ | |
"epoch": 0.028048979591836734, | |
"grad_norm": 2.8734278678894043, | |
"learning_rate": 9.973579359895495e-06, | |
"loss": 0.7482, | |
"step": 859 | |
}, | |
{ | |
"epoch": 0.028081632653061225, | |
"grad_norm": 3.0287320613861084, | |
"learning_rate": 9.973546701502287e-06, | |
"loss": 0.7901, | |
"step": 860 | |
}, | |
{ | |
"epoch": 0.028114285714285713, | |
"grad_norm": 3.2069811820983887, | |
"learning_rate": 9.97351404310908e-06, | |
"loss": 0.7762, | |
"step": 861 | |
}, | |
{ | |
"epoch": 0.028146938775510204, | |
"grad_norm": 3.193441152572632, | |
"learning_rate": 9.973481384715873e-06, | |
"loss": 0.7867, | |
"step": 862 | |
}, | |
{ | |
"epoch": 0.028179591836734692, | |
"grad_norm": 2.932454824447632, | |
"learning_rate": 9.973448726322665e-06, | |
"loss": 0.801, | |
"step": 863 | |
}, | |
{ | |
"epoch": 0.028212244897959184, | |
"grad_norm": 3.009593963623047, | |
"learning_rate": 9.97341606792946e-06, | |
"loss": 0.7612, | |
"step": 864 | |
}, | |
{ | |
"epoch": 0.028244897959183675, | |
"grad_norm": 3.2580459117889404, | |
"learning_rate": 9.973383409536251e-06, | |
"loss": 0.7295, | |
"step": 865 | |
}, | |
{ | |
"epoch": 0.028277551020408163, | |
"grad_norm": 2.934058904647827, | |
"learning_rate": 9.973350751143044e-06, | |
"loss": 0.7766, | |
"step": 866 | |
}, | |
{ | |
"epoch": 0.028310204081632654, | |
"grad_norm": 3.0753285884857178, | |
"learning_rate": 9.973318092749837e-06, | |
"loss": 0.7647, | |
"step": 867 | |
}, | |
{ | |
"epoch": 0.028342857142857142, | |
"grad_norm": 3.1178665161132812, | |
"learning_rate": 9.97328543435663e-06, | |
"loss": 0.8071, | |
"step": 868 | |
}, | |
{ | |
"epoch": 0.028375510204081633, | |
"grad_norm": 3.235008955001831, | |
"learning_rate": 9.973252775963424e-06, | |
"loss": 0.7899, | |
"step": 869 | |
}, | |
{ | |
"epoch": 0.02840816326530612, | |
"grad_norm": 3.198587417602539, | |
"learning_rate": 9.973220117570215e-06, | |
"loss": 0.7918, | |
"step": 870 | |
}, | |
{ | |
"epoch": 0.028440816326530612, | |
"grad_norm": 3.1364388465881348, | |
"learning_rate": 9.973187459177009e-06, | |
"loss": 0.8178, | |
"step": 871 | |
}, | |
{ | |
"epoch": 0.028473469387755104, | |
"grad_norm": 3.1181514263153076, | |
"learning_rate": 9.973154800783802e-06, | |
"loss": 0.7922, | |
"step": 872 | |
}, | |
{ | |
"epoch": 0.02850612244897959, | |
"grad_norm": 3.0627710819244385, | |
"learning_rate": 9.973122142390595e-06, | |
"loss": 0.8063, | |
"step": 873 | |
}, | |
{ | |
"epoch": 0.028538775510204083, | |
"grad_norm": 3.042802572250366, | |
"learning_rate": 9.973089483997388e-06, | |
"loss": 0.8304, | |
"step": 874 | |
}, | |
{ | |
"epoch": 0.02857142857142857, | |
"grad_norm": 2.935323715209961, | |
"learning_rate": 9.973056825604182e-06, | |
"loss": 0.7983, | |
"step": 875 | |
}, | |
{ | |
"epoch": 0.028604081632653062, | |
"grad_norm": 2.9549400806427, | |
"learning_rate": 9.973024167210973e-06, | |
"loss": 0.8063, | |
"step": 876 | |
}, | |
{ | |
"epoch": 0.02863673469387755, | |
"grad_norm": 2.7875542640686035, | |
"learning_rate": 9.972991508817766e-06, | |
"loss": 0.7981, | |
"step": 877 | |
}, | |
{ | |
"epoch": 0.02866938775510204, | |
"grad_norm": 2.9394519329071045, | |
"learning_rate": 9.97295885042456e-06, | |
"loss": 0.792, | |
"step": 878 | |
}, | |
{ | |
"epoch": 0.028702040816326532, | |
"grad_norm": 3.0494203567504883, | |
"learning_rate": 9.972926192031353e-06, | |
"loss": 0.8128, | |
"step": 879 | |
}, | |
{ | |
"epoch": 0.02873469387755102, | |
"grad_norm": 2.9686169624328613, | |
"learning_rate": 9.972893533638146e-06, | |
"loss": 0.7797, | |
"step": 880 | |
}, | |
{ | |
"epoch": 0.02876734693877551, | |
"grad_norm": 2.964944839477539, | |
"learning_rate": 9.972860875244938e-06, | |
"loss": 0.7494, | |
"step": 881 | |
}, | |
{ | |
"epoch": 0.0288, | |
"grad_norm": 2.9027297496795654, | |
"learning_rate": 9.972828216851733e-06, | |
"loss": 0.7526, | |
"step": 882 | |
}, | |
{ | |
"epoch": 0.02883265306122449, | |
"grad_norm": 3.0997474193573, | |
"learning_rate": 9.972795558458524e-06, | |
"loss": 0.7878, | |
"step": 883 | |
}, | |
{ | |
"epoch": 0.02886530612244898, | |
"grad_norm": 2.999011516571045, | |
"learning_rate": 9.972762900065317e-06, | |
"loss": 0.785, | |
"step": 884 | |
}, | |
{ | |
"epoch": 0.02889795918367347, | |
"grad_norm": 3.1621158123016357, | |
"learning_rate": 9.97273024167211e-06, | |
"loss": 0.8338, | |
"step": 885 | |
}, | |
{ | |
"epoch": 0.028930612244897957, | |
"grad_norm": 2.943216323852539, | |
"learning_rate": 9.972697583278904e-06, | |
"loss": 0.7844, | |
"step": 886 | |
}, | |
{ | |
"epoch": 0.02896326530612245, | |
"grad_norm": 3.0958337783813477, | |
"learning_rate": 9.972664924885697e-06, | |
"loss": 0.7895, | |
"step": 887 | |
}, | |
{ | |
"epoch": 0.02899591836734694, | |
"grad_norm": 2.982701539993286, | |
"learning_rate": 9.972632266492489e-06, | |
"loss": 0.7783, | |
"step": 888 | |
}, | |
{ | |
"epoch": 0.029028571428571428, | |
"grad_norm": 2.9064059257507324, | |
"learning_rate": 9.972599608099282e-06, | |
"loss": 0.7915, | |
"step": 889 | |
}, | |
{ | |
"epoch": 0.02906122448979592, | |
"grad_norm": 2.8960936069488525, | |
"learning_rate": 9.972566949706075e-06, | |
"loss": 0.7914, | |
"step": 890 | |
}, | |
{ | |
"epoch": 0.029093877551020407, | |
"grad_norm": 2.881122350692749, | |
"learning_rate": 9.972534291312868e-06, | |
"loss": 0.7673, | |
"step": 891 | |
}, | |
{ | |
"epoch": 0.0291265306122449, | |
"grad_norm": 2.9828569889068604, | |
"learning_rate": 9.972501632919662e-06, | |
"loss": 0.8089, | |
"step": 892 | |
}, | |
{ | |
"epoch": 0.029159183673469386, | |
"grad_norm": 3.0593910217285156, | |
"learning_rate": 9.972468974526453e-06, | |
"loss": 0.8079, | |
"step": 893 | |
}, | |
{ | |
"epoch": 0.029191836734693877, | |
"grad_norm": 2.993431568145752, | |
"learning_rate": 9.972436316133246e-06, | |
"loss": 0.7855, | |
"step": 894 | |
}, | |
{ | |
"epoch": 0.02922448979591837, | |
"grad_norm": 3.132528066635132, | |
"learning_rate": 9.97240365774004e-06, | |
"loss": 0.7948, | |
"step": 895 | |
}, | |
{ | |
"epoch": 0.029257142857142857, | |
"grad_norm": 3.0954887866973877, | |
"learning_rate": 9.972370999346833e-06, | |
"loss": 0.765, | |
"step": 896 | |
}, | |
{ | |
"epoch": 0.029289795918367348, | |
"grad_norm": 3.0066490173339844, | |
"learning_rate": 9.972338340953626e-06, | |
"loss": 0.7616, | |
"step": 897 | |
}, | |
{ | |
"epoch": 0.029322448979591836, | |
"grad_norm": 3.05830717086792, | |
"learning_rate": 9.97230568256042e-06, | |
"loss": 0.7585, | |
"step": 898 | |
}, | |
{ | |
"epoch": 0.029355102040816327, | |
"grad_norm": 3.2171273231506348, | |
"learning_rate": 9.972273024167211e-06, | |
"loss": 0.8138, | |
"step": 899 | |
}, | |
{ | |
"epoch": 0.029387755102040815, | |
"grad_norm": 2.9297590255737305, | |
"learning_rate": 9.972240365774006e-06, | |
"loss": 0.762, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.029387755102040815, | |
"eval_loss": 0.8132917284965515, | |
"eval_runtime": 76.0316, | |
"eval_samples_per_second": 1.315, | |
"eval_steps_per_second": 1.315, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.029420408163265306, | |
"grad_norm": 2.8318891525268555, | |
"learning_rate": 9.972207707380797e-06, | |
"loss": 0.7571, | |
"step": 901 | |
}, | |
{ | |
"epoch": 0.029453061224489797, | |
"grad_norm": 3.222458600997925, | |
"learning_rate": 9.97217504898759e-06, | |
"loss": 0.7869, | |
"step": 902 | |
}, | |
{ | |
"epoch": 0.029485714285714285, | |
"grad_norm": 3.2787888050079346, | |
"learning_rate": 9.972142390594384e-06, | |
"loss": 0.7922, | |
"step": 903 | |
}, | |
{ | |
"epoch": 0.029518367346938777, | |
"grad_norm": 2.9158124923706055, | |
"learning_rate": 9.972109732201175e-06, | |
"loss": 0.8008, | |
"step": 904 | |
}, | |
{ | |
"epoch": 0.029551020408163264, | |
"grad_norm": 2.931711435317993, | |
"learning_rate": 9.97207707380797e-06, | |
"loss": 0.7958, | |
"step": 905 | |
}, | |
{ | |
"epoch": 0.029583673469387756, | |
"grad_norm": 3.1747889518737793, | |
"learning_rate": 9.972044415414762e-06, | |
"loss": 0.7953, | |
"step": 906 | |
}, | |
{ | |
"epoch": 0.029616326530612243, | |
"grad_norm": 3.23126482963562, | |
"learning_rate": 9.972011757021555e-06, | |
"loss": 0.8133, | |
"step": 907 | |
}, | |
{ | |
"epoch": 0.029648979591836735, | |
"grad_norm": 2.8572208881378174, | |
"learning_rate": 9.971979098628348e-06, | |
"loss": 0.7832, | |
"step": 908 | |
}, | |
{ | |
"epoch": 0.029681632653061226, | |
"grad_norm": 3.004584312438965, | |
"learning_rate": 9.971946440235142e-06, | |
"loss": 0.8025, | |
"step": 909 | |
}, | |
{ | |
"epoch": 0.029714285714285714, | |
"grad_norm": 2.7122766971588135, | |
"learning_rate": 9.971913781841935e-06, | |
"loss": 0.7819, | |
"step": 910 | |
}, | |
{ | |
"epoch": 0.029746938775510205, | |
"grad_norm": 2.9016056060791016, | |
"learning_rate": 9.971881123448726e-06, | |
"loss": 0.7852, | |
"step": 911 | |
}, | |
{ | |
"epoch": 0.029779591836734693, | |
"grad_norm": 2.985293388366699, | |
"learning_rate": 9.97184846505552e-06, | |
"loss": 0.8294, | |
"step": 912 | |
}, | |
{ | |
"epoch": 0.029812244897959184, | |
"grad_norm": 2.9722225666046143, | |
"learning_rate": 9.971815806662313e-06, | |
"loss": 0.7924, | |
"step": 913 | |
}, | |
{ | |
"epoch": 0.029844897959183672, | |
"grad_norm": 2.8796117305755615, | |
"learning_rate": 9.971783148269106e-06, | |
"loss": 0.7549, | |
"step": 914 | |
}, | |
{ | |
"epoch": 0.029877551020408163, | |
"grad_norm": 3.0465636253356934, | |
"learning_rate": 9.9717504898759e-06, | |
"loss": 0.7875, | |
"step": 915 | |
}, | |
{ | |
"epoch": 0.029910204081632655, | |
"grad_norm": 3.179034948348999, | |
"learning_rate": 9.971717831482692e-06, | |
"loss": 0.774, | |
"step": 916 | |
}, | |
{ | |
"epoch": 0.029942857142857143, | |
"grad_norm": 3.2824554443359375, | |
"learning_rate": 9.971685173089484e-06, | |
"loss": 0.788, | |
"step": 917 | |
}, | |
{ | |
"epoch": 0.029975510204081634, | |
"grad_norm": 3.2578866481781006, | |
"learning_rate": 9.971652514696277e-06, | |
"loss": 0.7672, | |
"step": 918 | |
}, | |
{ | |
"epoch": 0.03000816326530612, | |
"grad_norm": 3.1106038093566895, | |
"learning_rate": 9.97161985630307e-06, | |
"loss": 0.8133, | |
"step": 919 | |
}, | |
{ | |
"epoch": 0.030040816326530613, | |
"grad_norm": 2.948033094406128, | |
"learning_rate": 9.971587197909864e-06, | |
"loss": 0.7819, | |
"step": 920 | |
}, | |
{ | |
"epoch": 0.0300734693877551, | |
"grad_norm": 3.094900131225586, | |
"learning_rate": 9.971554539516657e-06, | |
"loss": 0.7833, | |
"step": 921 | |
}, | |
{ | |
"epoch": 0.030106122448979592, | |
"grad_norm": 2.9197640419006348, | |
"learning_rate": 9.971521881123449e-06, | |
"loss": 0.7804, | |
"step": 922 | |
}, | |
{ | |
"epoch": 0.03013877551020408, | |
"grad_norm": 3.011596441268921, | |
"learning_rate": 9.971489222730243e-06, | |
"loss": 0.7706, | |
"step": 923 | |
}, | |
{ | |
"epoch": 0.03017142857142857, | |
"grad_norm": 2.9059317111968994, | |
"learning_rate": 9.971456564337035e-06, | |
"loss": 0.7594, | |
"step": 924 | |
}, | |
{ | |
"epoch": 0.030204081632653063, | |
"grad_norm": 3.198932409286499, | |
"learning_rate": 9.971423905943828e-06, | |
"loss": 0.7876, | |
"step": 925 | |
}, | |
{ | |
"epoch": 0.03023673469387755, | |
"grad_norm": 3.1033425331115723, | |
"learning_rate": 9.971391247550621e-06, | |
"loss": 0.7628, | |
"step": 926 | |
}, | |
{ | |
"epoch": 0.03026938775510204, | |
"grad_norm": 3.210116147994995, | |
"learning_rate": 9.971358589157413e-06, | |
"loss": 0.748, | |
"step": 927 | |
}, | |
{ | |
"epoch": 0.03030204081632653, | |
"grad_norm": 2.9255874156951904, | |
"learning_rate": 9.971325930764208e-06, | |
"loss": 0.7654, | |
"step": 928 | |
}, | |
{ | |
"epoch": 0.03033469387755102, | |
"grad_norm": 2.949495553970337, | |
"learning_rate": 9.971293272371e-06, | |
"loss": 0.7875, | |
"step": 929 | |
}, | |
{ | |
"epoch": 0.03036734693877551, | |
"grad_norm": 2.8590776920318604, | |
"learning_rate": 9.971260613977793e-06, | |
"loss": 0.7365, | |
"step": 930 | |
}, | |
{ | |
"epoch": 0.0304, | |
"grad_norm": 2.9772932529449463, | |
"learning_rate": 9.971227955584586e-06, | |
"loss": 0.7683, | |
"step": 931 | |
}, | |
{ | |
"epoch": 0.03043265306122449, | |
"grad_norm": 3.1256070137023926, | |
"learning_rate": 9.97119529719138e-06, | |
"loss": 0.8071, | |
"step": 932 | |
}, | |
{ | |
"epoch": 0.03046530612244898, | |
"grad_norm": 3.2088918685913086, | |
"learning_rate": 9.971162638798172e-06, | |
"loss": 0.7676, | |
"step": 933 | |
}, | |
{ | |
"epoch": 0.03049795918367347, | |
"grad_norm": 2.9030396938323975, | |
"learning_rate": 9.971129980404964e-06, | |
"loss": 0.8101, | |
"step": 934 | |
}, | |
{ | |
"epoch": 0.030530612244897958, | |
"grad_norm": 2.968740940093994, | |
"learning_rate": 9.971097322011757e-06, | |
"loss": 0.7716, | |
"step": 935 | |
}, | |
{ | |
"epoch": 0.03056326530612245, | |
"grad_norm": 2.748076915740967, | |
"learning_rate": 9.97106466361855e-06, | |
"loss": 0.7889, | |
"step": 936 | |
}, | |
{ | |
"epoch": 0.030595918367346937, | |
"grad_norm": 2.941471815109253, | |
"learning_rate": 9.971032005225344e-06, | |
"loss": 0.7453, | |
"step": 937 | |
}, | |
{ | |
"epoch": 0.03062857142857143, | |
"grad_norm": 2.9281535148620605, | |
"learning_rate": 9.970999346832137e-06, | |
"loss": 0.7891, | |
"step": 938 | |
}, | |
{ | |
"epoch": 0.03066122448979592, | |
"grad_norm": 2.85978627204895, | |
"learning_rate": 9.97096668843893e-06, | |
"loss": 0.7772, | |
"step": 939 | |
}, | |
{ | |
"epoch": 0.030693877551020408, | |
"grad_norm": 3.195918560028076, | |
"learning_rate": 9.970934030045722e-06, | |
"loss": 0.8068, | |
"step": 940 | |
}, | |
{ | |
"epoch": 0.0307265306122449, | |
"grad_norm": 3.1167869567871094, | |
"learning_rate": 9.970901371652517e-06, | |
"loss": 0.7549, | |
"step": 941 | |
}, | |
{ | |
"epoch": 0.030759183673469387, | |
"grad_norm": 2.9762868881225586, | |
"learning_rate": 9.970868713259308e-06, | |
"loss": 0.7923, | |
"step": 942 | |
}, | |
{ | |
"epoch": 0.030791836734693878, | |
"grad_norm": 3.1077170372009277, | |
"learning_rate": 9.970836054866101e-06, | |
"loss": 0.7636, | |
"step": 943 | |
}, | |
{ | |
"epoch": 0.030824489795918366, | |
"grad_norm": 3.062537431716919, | |
"learning_rate": 9.970803396472895e-06, | |
"loss": 0.7796, | |
"step": 944 | |
}, | |
{ | |
"epoch": 0.030857142857142857, | |
"grad_norm": 3.2072839736938477, | |
"learning_rate": 9.970770738079686e-06, | |
"loss": 0.793, | |
"step": 945 | |
}, | |
{ | |
"epoch": 0.03088979591836735, | |
"grad_norm": 3.26112699508667, | |
"learning_rate": 9.970738079686481e-06, | |
"loss": 0.8107, | |
"step": 946 | |
}, | |
{ | |
"epoch": 0.030922448979591836, | |
"grad_norm": 2.9786770343780518, | |
"learning_rate": 9.970705421293273e-06, | |
"loss": 0.8025, | |
"step": 947 | |
}, | |
{ | |
"epoch": 0.030955102040816328, | |
"grad_norm": 3.0765345096588135, | |
"learning_rate": 9.970672762900066e-06, | |
"loss": 0.8047, | |
"step": 948 | |
}, | |
{ | |
"epoch": 0.030987755102040816, | |
"grad_norm": 2.958984375, | |
"learning_rate": 9.970640104506859e-06, | |
"loss": 0.762, | |
"step": 949 | |
}, | |
{ | |
"epoch": 0.031020408163265307, | |
"grad_norm": 3.08040452003479, | |
"learning_rate": 9.970607446113652e-06, | |
"loss": 0.7928, | |
"step": 950 | |
}, | |
{ | |
"epoch": 0.031020408163265307, | |
"eval_loss": 0.8111075758934021, | |
"eval_runtime": 74.5279, | |
"eval_samples_per_second": 1.342, | |
"eval_steps_per_second": 1.342, | |
"step": 950 | |
}, | |
{ | |
"epoch": 0.031053061224489795, | |
"grad_norm": 2.965144395828247, | |
"learning_rate": 9.970574787720446e-06, | |
"loss": 0.8011, | |
"step": 951 | |
}, | |
{ | |
"epoch": 0.031085714285714286, | |
"grad_norm": 2.9179527759552, | |
"learning_rate": 9.970542129327237e-06, | |
"loss": 0.7703, | |
"step": 952 | |
}, | |
{ | |
"epoch": 0.031118367346938774, | |
"grad_norm": 2.9587607383728027, | |
"learning_rate": 9.97050947093403e-06, | |
"loss": 0.7536, | |
"step": 953 | |
}, | |
{ | |
"epoch": 0.031151020408163265, | |
"grad_norm": 2.9393138885498047, | |
"learning_rate": 9.970476812540824e-06, | |
"loss": 0.7922, | |
"step": 954 | |
}, | |
{ | |
"epoch": 0.031183673469387756, | |
"grad_norm": 3.013162612915039, | |
"learning_rate": 9.970444154147617e-06, | |
"loss": 0.7668, | |
"step": 955 | |
}, | |
{ | |
"epoch": 0.031216326530612244, | |
"grad_norm": 2.8993349075317383, | |
"learning_rate": 9.97041149575441e-06, | |
"loss": 0.7559, | |
"step": 956 | |
}, | |
{ | |
"epoch": 0.031248979591836736, | |
"grad_norm": 3.0861432552337646, | |
"learning_rate": 9.970378837361203e-06, | |
"loss": 0.7632, | |
"step": 957 | |
}, | |
{ | |
"epoch": 0.03128163265306123, | |
"grad_norm": 2.9803247451782227, | |
"learning_rate": 9.970346178967995e-06, | |
"loss": 0.7933, | |
"step": 958 | |
}, | |
{ | |
"epoch": 0.03131428571428571, | |
"grad_norm": 3.0155863761901855, | |
"learning_rate": 9.970313520574788e-06, | |
"loss": 0.8215, | |
"step": 959 | |
}, | |
{ | |
"epoch": 0.0313469387755102, | |
"grad_norm": 3.0787782669067383, | |
"learning_rate": 9.970280862181581e-06, | |
"loss": 0.7936, | |
"step": 960 | |
}, | |
{ | |
"epoch": 0.031379591836734694, | |
"grad_norm": 3.0619027614593506, | |
"learning_rate": 9.970248203788375e-06, | |
"loss": 0.7958, | |
"step": 961 | |
}, | |
{ | |
"epoch": 0.031412244897959185, | |
"grad_norm": 2.7557358741760254, | |
"learning_rate": 9.970215545395168e-06, | |
"loss": 0.761, | |
"step": 962 | |
}, | |
{ | |
"epoch": 0.031444897959183676, | |
"grad_norm": 2.8766896724700928, | |
"learning_rate": 9.97018288700196e-06, | |
"loss": 0.7563, | |
"step": 963 | |
}, | |
{ | |
"epoch": 0.03147755102040816, | |
"grad_norm": 2.976086139678955, | |
"learning_rate": 9.970150228608754e-06, | |
"loss": 0.7839, | |
"step": 964 | |
}, | |
{ | |
"epoch": 0.03151020408163265, | |
"grad_norm": 2.788193941116333, | |
"learning_rate": 9.970117570215546e-06, | |
"loss": 0.789, | |
"step": 965 | |
}, | |
{ | |
"epoch": 0.03154285714285714, | |
"grad_norm": 3.0085878372192383, | |
"learning_rate": 9.970084911822339e-06, | |
"loss": 0.771, | |
"step": 966 | |
}, | |
{ | |
"epoch": 0.031575510204081635, | |
"grad_norm": 3.1194570064544678, | |
"learning_rate": 9.970052253429132e-06, | |
"loss": 0.7889, | |
"step": 967 | |
}, | |
{ | |
"epoch": 0.031608163265306126, | |
"grad_norm": 2.8725526332855225, | |
"learning_rate": 9.970019595035924e-06, | |
"loss": 0.7882, | |
"step": 968 | |
}, | |
{ | |
"epoch": 0.03164081632653061, | |
"grad_norm": 2.982837677001953, | |
"learning_rate": 9.969986936642719e-06, | |
"loss": 0.7674, | |
"step": 969 | |
}, | |
{ | |
"epoch": 0.0316734693877551, | |
"grad_norm": 2.9641151428222656, | |
"learning_rate": 9.96995427824951e-06, | |
"loss": 0.7871, | |
"step": 970 | |
}, | |
{ | |
"epoch": 0.03170612244897959, | |
"grad_norm": 2.973365068435669, | |
"learning_rate": 9.969921619856304e-06, | |
"loss": 0.7939, | |
"step": 971 | |
}, | |
{ | |
"epoch": 0.031738775510204084, | |
"grad_norm": 3.0161428451538086, | |
"learning_rate": 9.969888961463097e-06, | |
"loss": 0.7865, | |
"step": 972 | |
}, | |
{ | |
"epoch": 0.03177142857142857, | |
"grad_norm": 3.369993209838867, | |
"learning_rate": 9.96985630306989e-06, | |
"loss": 0.8231, | |
"step": 973 | |
}, | |
{ | |
"epoch": 0.03180408163265306, | |
"grad_norm": 3.0183799266815186, | |
"learning_rate": 9.969823644676683e-06, | |
"loss": 0.7853, | |
"step": 974 | |
}, | |
{ | |
"epoch": 0.03183673469387755, | |
"grad_norm": 3.244966983795166, | |
"learning_rate": 9.969790986283475e-06, | |
"loss": 0.7607, | |
"step": 975 | |
}, | |
{ | |
"epoch": 0.03186938775510204, | |
"grad_norm": 3.0273585319519043, | |
"learning_rate": 9.969758327890268e-06, | |
"loss": 0.77, | |
"step": 976 | |
}, | |
{ | |
"epoch": 0.031902040816326534, | |
"grad_norm": 3.168429136276245, | |
"learning_rate": 9.969725669497061e-06, | |
"loss": 0.7537, | |
"step": 977 | |
}, | |
{ | |
"epoch": 0.03193469387755102, | |
"grad_norm": 3.006373882293701, | |
"learning_rate": 9.969693011103854e-06, | |
"loss": 0.7927, | |
"step": 978 | |
}, | |
{ | |
"epoch": 0.03196734693877551, | |
"grad_norm": 3.1100571155548096, | |
"learning_rate": 9.969660352710648e-06, | |
"loss": 0.7846, | |
"step": 979 | |
}, | |
{ | |
"epoch": 0.032, | |
"grad_norm": 2.941803216934204, | |
"learning_rate": 9.969627694317441e-06, | |
"loss": 0.7821, | |
"step": 980 | |
}, | |
{ | |
"epoch": 0.03203265306122449, | |
"grad_norm": 3.077153444290161, | |
"learning_rate": 9.969595035924232e-06, | |
"loss": 0.8004, | |
"step": 981 | |
}, | |
{ | |
"epoch": 0.032065306122448976, | |
"grad_norm": 2.9901161193847656, | |
"learning_rate": 9.969562377531026e-06, | |
"loss": 0.7848, | |
"step": 982 | |
}, | |
{ | |
"epoch": 0.03209795918367347, | |
"grad_norm": 3.0473811626434326, | |
"learning_rate": 9.969529719137819e-06, | |
"loss": 0.7823, | |
"step": 983 | |
}, | |
{ | |
"epoch": 0.03213061224489796, | |
"grad_norm": 2.894256353378296, | |
"learning_rate": 9.969497060744612e-06, | |
"loss": 0.7757, | |
"step": 984 | |
}, | |
{ | |
"epoch": 0.03216326530612245, | |
"grad_norm": 3.0119597911834717, | |
"learning_rate": 9.969464402351405e-06, | |
"loss": 0.7799, | |
"step": 985 | |
}, | |
{ | |
"epoch": 0.03219591836734694, | |
"grad_norm": 3.020019292831421, | |
"learning_rate": 9.969431743958197e-06, | |
"loss": 0.7817, | |
"step": 986 | |
}, | |
{ | |
"epoch": 0.032228571428571426, | |
"grad_norm": 3.0470404624938965, | |
"learning_rate": 9.969399085564992e-06, | |
"loss": 0.7873, | |
"step": 987 | |
}, | |
{ | |
"epoch": 0.03226122448979592, | |
"grad_norm": 2.9759464263916016, | |
"learning_rate": 9.969366427171783e-06, | |
"loss": 0.7857, | |
"step": 988 | |
}, | |
{ | |
"epoch": 0.03229387755102041, | |
"grad_norm": 3.1986935138702393, | |
"learning_rate": 9.969333768778577e-06, | |
"loss": 0.786, | |
"step": 989 | |
}, | |
{ | |
"epoch": 0.0323265306122449, | |
"grad_norm": 3.0360336303710938, | |
"learning_rate": 9.96930111038537e-06, | |
"loss": 0.8174, | |
"step": 990 | |
}, | |
{ | |
"epoch": 0.03235918367346939, | |
"grad_norm": 3.093979597091675, | |
"learning_rate": 9.969268451992163e-06, | |
"loss": 0.7619, | |
"step": 991 | |
}, | |
{ | |
"epoch": 0.032391836734693875, | |
"grad_norm": 2.935920238494873, | |
"learning_rate": 9.969235793598956e-06, | |
"loss": 0.7661, | |
"step": 992 | |
}, | |
{ | |
"epoch": 0.03242448979591837, | |
"grad_norm": 3.1429708003997803, | |
"learning_rate": 9.969203135205748e-06, | |
"loss": 0.7981, | |
"step": 993 | |
}, | |
{ | |
"epoch": 0.03245714285714286, | |
"grad_norm": 2.8831772804260254, | |
"learning_rate": 9.969170476812541e-06, | |
"loss": 0.7969, | |
"step": 994 | |
}, | |
{ | |
"epoch": 0.03248979591836735, | |
"grad_norm": 2.8683125972747803, | |
"learning_rate": 9.969137818419334e-06, | |
"loss": 0.7688, | |
"step": 995 | |
}, | |
{ | |
"epoch": 0.032522448979591834, | |
"grad_norm": 3.026094436645508, | |
"learning_rate": 9.969105160026128e-06, | |
"loss": 0.782, | |
"step": 996 | |
}, | |
{ | |
"epoch": 0.032555102040816325, | |
"grad_norm": 3.065110445022583, | |
"learning_rate": 9.969072501632921e-06, | |
"loss": 0.782, | |
"step": 997 | |
}, | |
{ | |
"epoch": 0.032587755102040816, | |
"grad_norm": 2.9125099182128906, | |
"learning_rate": 9.969039843239714e-06, | |
"loss": 0.8056, | |
"step": 998 | |
}, | |
{ | |
"epoch": 0.03262040816326531, | |
"grad_norm": 2.978609800338745, | |
"learning_rate": 9.969007184846506e-06, | |
"loss": 0.7839, | |
"step": 999 | |
}, | |
{ | |
"epoch": 0.0326530612244898, | |
"grad_norm": 3.037384510040283, | |
"learning_rate": 9.968974526453299e-06, | |
"loss": 0.7877, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.0326530612244898, | |
"eval_loss": 0.8111925721168518, | |
"eval_runtime": 72.9629, | |
"eval_samples_per_second": 1.371, | |
"eval_steps_per_second": 1.371, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.03268571428571428, | |
"grad_norm": 2.9627959728240967, | |
"learning_rate": 9.968941868060092e-06, | |
"loss": 0.7855, | |
"step": 1001 | |
}, | |
{ | |
"epoch": 0.032718367346938775, | |
"grad_norm": 3.2459042072296143, | |
"learning_rate": 9.968909209666885e-06, | |
"loss": 0.7897, | |
"step": 1002 | |
}, | |
{ | |
"epoch": 0.032751020408163266, | |
"grad_norm": 3.0746281147003174, | |
"learning_rate": 9.968876551273679e-06, | |
"loss": 0.7415, | |
"step": 1003 | |
}, | |
{ | |
"epoch": 0.03278367346938776, | |
"grad_norm": 3.005556583404541, | |
"learning_rate": 9.96884389288047e-06, | |
"loss": 0.8075, | |
"step": 1004 | |
}, | |
{ | |
"epoch": 0.03281632653061224, | |
"grad_norm": 2.9875948429107666, | |
"learning_rate": 9.968811234487265e-06, | |
"loss": 0.7982, | |
"step": 1005 | |
}, | |
{ | |
"epoch": 0.03284897959183673, | |
"grad_norm": 2.8231279850006104, | |
"learning_rate": 9.968778576094057e-06, | |
"loss": 0.758, | |
"step": 1006 | |
}, | |
{ | |
"epoch": 0.032881632653061224, | |
"grad_norm": 2.8995821475982666, | |
"learning_rate": 9.96874591770085e-06, | |
"loss": 0.7653, | |
"step": 1007 | |
}, | |
{ | |
"epoch": 0.032914285714285715, | |
"grad_norm": 3.0024149417877197, | |
"learning_rate": 9.968713259307643e-06, | |
"loss": 0.7777, | |
"step": 1008 | |
}, | |
{ | |
"epoch": 0.03294693877551021, | |
"grad_norm": 3.019351005554199, | |
"learning_rate": 9.968680600914435e-06, | |
"loss": 0.7921, | |
"step": 1009 | |
}, | |
{ | |
"epoch": 0.03297959183673469, | |
"grad_norm": 2.916240930557251, | |
"learning_rate": 9.96864794252123e-06, | |
"loss": 0.75, | |
"step": 1010 | |
}, | |
{ | |
"epoch": 0.03301224489795918, | |
"grad_norm": 3.005613088607788, | |
"learning_rate": 9.968615284128021e-06, | |
"loss": 0.7943, | |
"step": 1011 | |
}, | |
{ | |
"epoch": 0.033044897959183674, | |
"grad_norm": 2.9690959453582764, | |
"learning_rate": 9.968582625734814e-06, | |
"loss": 0.7855, | |
"step": 1012 | |
}, | |
{ | |
"epoch": 0.033077551020408165, | |
"grad_norm": 3.0370657444000244, | |
"learning_rate": 9.968549967341608e-06, | |
"loss": 0.7567, | |
"step": 1013 | |
}, | |
{ | |
"epoch": 0.033110204081632656, | |
"grad_norm": 3.010715961456299, | |
"learning_rate": 9.9685173089484e-06, | |
"loss": 0.7545, | |
"step": 1014 | |
}, | |
{ | |
"epoch": 0.03314285714285714, | |
"grad_norm": 3.1019158363342285, | |
"learning_rate": 9.968484650555194e-06, | |
"loss": 0.7881, | |
"step": 1015 | |
}, | |
{ | |
"epoch": 0.03317551020408163, | |
"grad_norm": 2.9990556240081787, | |
"learning_rate": 9.968451992161986e-06, | |
"loss": 0.7892, | |
"step": 1016 | |
}, | |
{ | |
"epoch": 0.03320816326530612, | |
"grad_norm": 2.9631762504577637, | |
"learning_rate": 9.968419333768779e-06, | |
"loss": 0.7459, | |
"step": 1017 | |
}, | |
{ | |
"epoch": 0.033240816326530614, | |
"grad_norm": 3.1492321491241455, | |
"learning_rate": 9.968386675375572e-06, | |
"loss": 0.796, | |
"step": 1018 | |
}, | |
{ | |
"epoch": 0.0332734693877551, | |
"grad_norm": 3.0348122119903564, | |
"learning_rate": 9.968354016982365e-06, | |
"loss": 0.795, | |
"step": 1019 | |
}, | |
{ | |
"epoch": 0.03330612244897959, | |
"grad_norm": 3.2612578868865967, | |
"learning_rate": 9.968321358589159e-06, | |
"loss": 0.7781, | |
"step": 1020 | |
}, | |
{ | |
"epoch": 0.03333877551020408, | |
"grad_norm": 3.0605742931365967, | |
"learning_rate": 9.968288700195952e-06, | |
"loss": 0.7329, | |
"step": 1021 | |
}, | |
{ | |
"epoch": 0.03337142857142857, | |
"grad_norm": 3.0548489093780518, | |
"learning_rate": 9.968256041802743e-06, | |
"loss": 0.7601, | |
"step": 1022 | |
}, | |
{ | |
"epoch": 0.033404081632653064, | |
"grad_norm": 2.946451187133789, | |
"learning_rate": 9.968223383409537e-06, | |
"loss": 0.7756, | |
"step": 1023 | |
}, | |
{ | |
"epoch": 0.03343673469387755, | |
"grad_norm": 3.0444273948669434, | |
"learning_rate": 9.96819072501633e-06, | |
"loss": 0.762, | |
"step": 1024 | |
}, | |
{ | |
"epoch": 0.03346938775510204, | |
"grad_norm": 3.207759380340576, | |
"learning_rate": 9.968158066623123e-06, | |
"loss": 0.7755, | |
"step": 1025 | |
}, | |
{ | |
"epoch": 0.03350204081632653, | |
"grad_norm": 3.0307605266571045, | |
"learning_rate": 9.968125408229916e-06, | |
"loss": 0.7604, | |
"step": 1026 | |
}, | |
{ | |
"epoch": 0.03353469387755102, | |
"grad_norm": 2.975473165512085, | |
"learning_rate": 9.968092749836708e-06, | |
"loss": 0.7929, | |
"step": 1027 | |
}, | |
{ | |
"epoch": 0.033567346938775514, | |
"grad_norm": 3.00290846824646, | |
"learning_rate": 9.968060091443503e-06, | |
"loss": 0.8294, | |
"step": 1028 | |
}, | |
{ | |
"epoch": 0.0336, | |
"grad_norm": 3.235747814178467, | |
"learning_rate": 9.968027433050294e-06, | |
"loss": 0.7807, | |
"step": 1029 | |
}, | |
{ | |
"epoch": 0.03363265306122449, | |
"grad_norm": 3.031163215637207, | |
"learning_rate": 9.967994774657088e-06, | |
"loss": 0.7549, | |
"step": 1030 | |
}, | |
{ | |
"epoch": 0.03366530612244898, | |
"grad_norm": 2.9528584480285645, | |
"learning_rate": 9.96796211626388e-06, | |
"loss": 0.7542, | |
"step": 1031 | |
}, | |
{ | |
"epoch": 0.03369795918367347, | |
"grad_norm": 3.0693178176879883, | |
"learning_rate": 9.967929457870672e-06, | |
"loss": 0.7773, | |
"step": 1032 | |
}, | |
{ | |
"epoch": 0.033730612244897956, | |
"grad_norm": 3.159449338912964, | |
"learning_rate": 9.967896799477467e-06, | |
"loss": 0.7998, | |
"step": 1033 | |
}, | |
{ | |
"epoch": 0.03376326530612245, | |
"grad_norm": 3.0627989768981934, | |
"learning_rate": 9.967864141084259e-06, | |
"loss": 0.7624, | |
"step": 1034 | |
}, | |
{ | |
"epoch": 0.03379591836734694, | |
"grad_norm": 3.088207721710205, | |
"learning_rate": 9.967831482691052e-06, | |
"loss": 0.7757, | |
"step": 1035 | |
}, | |
{ | |
"epoch": 0.03382857142857143, | |
"grad_norm": 3.1921637058258057, | |
"learning_rate": 9.967798824297845e-06, | |
"loss": 0.7923, | |
"step": 1036 | |
}, | |
{ | |
"epoch": 0.03386122448979592, | |
"grad_norm": 3.039994955062866, | |
"learning_rate": 9.967766165904638e-06, | |
"loss": 0.7628, | |
"step": 1037 | |
}, | |
{ | |
"epoch": 0.033893877551020406, | |
"grad_norm": 3.221714973449707, | |
"learning_rate": 9.967733507511432e-06, | |
"loss": 0.7684, | |
"step": 1038 | |
}, | |
{ | |
"epoch": 0.0339265306122449, | |
"grad_norm": 2.9829065799713135, | |
"learning_rate": 9.967700849118223e-06, | |
"loss": 0.7956, | |
"step": 1039 | |
}, | |
{ | |
"epoch": 0.03395918367346939, | |
"grad_norm": 2.9304864406585693, | |
"learning_rate": 9.967668190725016e-06, | |
"loss": 0.7865, | |
"step": 1040 | |
}, | |
{ | |
"epoch": 0.03399183673469388, | |
"grad_norm": 3.0041542053222656, | |
"learning_rate": 9.96763553233181e-06, | |
"loss": 0.8113, | |
"step": 1041 | |
}, | |
{ | |
"epoch": 0.034024489795918364, | |
"grad_norm": 3.1308107376098633, | |
"learning_rate": 9.967602873938603e-06, | |
"loss": 0.7622, | |
"step": 1042 | |
}, | |
{ | |
"epoch": 0.034057142857142855, | |
"grad_norm": 3.0761616230010986, | |
"learning_rate": 9.967570215545396e-06, | |
"loss": 0.7822, | |
"step": 1043 | |
}, | |
{ | |
"epoch": 0.03408979591836735, | |
"grad_norm": 3.073542594909668, | |
"learning_rate": 9.96753755715219e-06, | |
"loss": 0.782, | |
"step": 1044 | |
}, | |
{ | |
"epoch": 0.03412244897959184, | |
"grad_norm": 2.954620361328125, | |
"learning_rate": 9.967504898758981e-06, | |
"loss": 0.7686, | |
"step": 1045 | |
}, | |
{ | |
"epoch": 0.03415510204081633, | |
"grad_norm": 3.1652603149414062, | |
"learning_rate": 9.967472240365776e-06, | |
"loss": 0.7743, | |
"step": 1046 | |
}, | |
{ | |
"epoch": 0.034187755102040814, | |
"grad_norm": 2.8833162784576416, | |
"learning_rate": 9.967439581972567e-06, | |
"loss": 0.7741, | |
"step": 1047 | |
}, | |
{ | |
"epoch": 0.034220408163265305, | |
"grad_norm": 2.910767078399658, | |
"learning_rate": 9.96740692357936e-06, | |
"loss": 0.7768, | |
"step": 1048 | |
}, | |
{ | |
"epoch": 0.034253061224489796, | |
"grad_norm": 3.0124878883361816, | |
"learning_rate": 9.967374265186154e-06, | |
"loss": 0.7656, | |
"step": 1049 | |
}, | |
{ | |
"epoch": 0.03428571428571429, | |
"grad_norm": 3.200578451156616, | |
"learning_rate": 9.967341606792945e-06, | |
"loss": 0.7979, | |
"step": 1050 | |
}, | |
{ | |
"epoch": 0.03428571428571429, | |
"eval_loss": 0.809950053691864, | |
"eval_runtime": 78.3268, | |
"eval_samples_per_second": 1.277, | |
"eval_steps_per_second": 1.277, | |
"step": 1050 | |
}, | |
{ | |
"epoch": 0.03431836734693878, | |
"grad_norm": 3.1058106422424316, | |
"learning_rate": 9.96730894839974e-06, | |
"loss": 0.7929, | |
"step": 1051 | |
}, | |
{ | |
"epoch": 0.03435102040816326, | |
"grad_norm": 2.981287956237793, | |
"learning_rate": 9.967276290006532e-06, | |
"loss": 0.7922, | |
"step": 1052 | |
}, | |
{ | |
"epoch": 0.034383673469387754, | |
"grad_norm": 3.0732994079589844, | |
"learning_rate": 9.967243631613325e-06, | |
"loss": 0.7598, | |
"step": 1053 | |
}, | |
{ | |
"epoch": 0.034416326530612246, | |
"grad_norm": 2.965428352355957, | |
"learning_rate": 9.967210973220118e-06, | |
"loss": 0.7796, | |
"step": 1054 | |
}, | |
{ | |
"epoch": 0.03444897959183674, | |
"grad_norm": 3.047433853149414, | |
"learning_rate": 9.967178314826912e-06, | |
"loss": 0.7806, | |
"step": 1055 | |
}, | |
{ | |
"epoch": 0.03448163265306122, | |
"grad_norm": 2.9787747859954834, | |
"learning_rate": 9.967145656433705e-06, | |
"loss": 0.7548, | |
"step": 1056 | |
}, | |
{ | |
"epoch": 0.03451428571428571, | |
"grad_norm": 3.3675661087036133, | |
"learning_rate": 9.967112998040496e-06, | |
"loss": 0.7764, | |
"step": 1057 | |
}, | |
{ | |
"epoch": 0.034546938775510204, | |
"grad_norm": 2.90116024017334, | |
"learning_rate": 9.96708033964729e-06, | |
"loss": 0.7819, | |
"step": 1058 | |
}, | |
{ | |
"epoch": 0.034579591836734695, | |
"grad_norm": 2.965744972229004, | |
"learning_rate": 9.967047681254083e-06, | |
"loss": 0.7603, | |
"step": 1059 | |
}, | |
{ | |
"epoch": 0.034612244897959187, | |
"grad_norm": 3.116170644760132, | |
"learning_rate": 9.967015022860876e-06, | |
"loss": 0.7885, | |
"step": 1060 | |
}, | |
{ | |
"epoch": 0.03464489795918367, | |
"grad_norm": 3.1526293754577637, | |
"learning_rate": 9.96698236446767e-06, | |
"loss": 0.7752, | |
"step": 1061 | |
}, | |
{ | |
"epoch": 0.03467755102040816, | |
"grad_norm": 3.0937249660491943, | |
"learning_rate": 9.966949706074463e-06, | |
"loss": 0.7845, | |
"step": 1062 | |
}, | |
{ | |
"epoch": 0.034710204081632653, | |
"grad_norm": 3.069850444793701, | |
"learning_rate": 9.966917047681254e-06, | |
"loss": 0.7645, | |
"step": 1063 | |
}, | |
{ | |
"epoch": 0.034742857142857145, | |
"grad_norm": 2.979881763458252, | |
"learning_rate": 9.966884389288047e-06, | |
"loss": 0.7687, | |
"step": 1064 | |
}, | |
{ | |
"epoch": 0.034775510204081636, | |
"grad_norm": 3.1711268424987793, | |
"learning_rate": 9.96685173089484e-06, | |
"loss": 0.8136, | |
"step": 1065 | |
}, | |
{ | |
"epoch": 0.03480816326530612, | |
"grad_norm": 3.197852373123169, | |
"learning_rate": 9.966819072501634e-06, | |
"loss": 0.8152, | |
"step": 1066 | |
}, | |
{ | |
"epoch": 0.03484081632653061, | |
"grad_norm": 3.098540782928467, | |
"learning_rate": 9.966786414108427e-06, | |
"loss": 0.7645, | |
"step": 1067 | |
}, | |
{ | |
"epoch": 0.0348734693877551, | |
"grad_norm": 3.1754651069641113, | |
"learning_rate": 9.966753755715219e-06, | |
"loss": 0.7663, | |
"step": 1068 | |
}, | |
{ | |
"epoch": 0.034906122448979594, | |
"grad_norm": 3.0712950229644775, | |
"learning_rate": 9.966721097322014e-06, | |
"loss": 0.7851, | |
"step": 1069 | |
}, | |
{ | |
"epoch": 0.03493877551020408, | |
"grad_norm": 3.285234212875366, | |
"learning_rate": 9.966688438928805e-06, | |
"loss": 0.774, | |
"step": 1070 | |
}, | |
{ | |
"epoch": 0.03497142857142857, | |
"grad_norm": 3.2550666332244873, | |
"learning_rate": 9.966655780535598e-06, | |
"loss": 0.751, | |
"step": 1071 | |
}, | |
{ | |
"epoch": 0.03500408163265306, | |
"grad_norm": 3.238384246826172, | |
"learning_rate": 9.966623122142392e-06, | |
"loss": 0.7666, | |
"step": 1072 | |
}, | |
{ | |
"epoch": 0.03503673469387755, | |
"grad_norm": 3.0862374305725098, | |
"learning_rate": 9.966590463749183e-06, | |
"loss": 0.7981, | |
"step": 1073 | |
}, | |
{ | |
"epoch": 0.035069387755102044, | |
"grad_norm": 3.172941207885742, | |
"learning_rate": 9.966557805355978e-06, | |
"loss": 0.7549, | |
"step": 1074 | |
}, | |
{ | |
"epoch": 0.03510204081632653, | |
"grad_norm": 3.060302495956421, | |
"learning_rate": 9.96652514696277e-06, | |
"loss": 0.7835, | |
"step": 1075 | |
}, | |
{ | |
"epoch": 0.03513469387755102, | |
"grad_norm": 3.122617244720459, | |
"learning_rate": 9.966492488569563e-06, | |
"loss": 0.7952, | |
"step": 1076 | |
}, | |
{ | |
"epoch": 0.03516734693877551, | |
"grad_norm": 2.904637336730957, | |
"learning_rate": 9.966459830176356e-06, | |
"loss": 0.7595, | |
"step": 1077 | |
}, | |
{ | |
"epoch": 0.0352, | |
"grad_norm": 3.1672565937042236, | |
"learning_rate": 9.96642717178315e-06, | |
"loss": 0.7926, | |
"step": 1078 | |
}, | |
{ | |
"epoch": 0.035232653061224486, | |
"grad_norm": 3.0276424884796143, | |
"learning_rate": 9.966394513389943e-06, | |
"loss": 0.7801, | |
"step": 1079 | |
}, | |
{ | |
"epoch": 0.03526530612244898, | |
"grad_norm": 3.0199339389801025, | |
"learning_rate": 9.966361854996734e-06, | |
"loss": 0.7977, | |
"step": 1080 | |
}, | |
{ | |
"epoch": 0.03529795918367347, | |
"grad_norm": 3.1580443382263184, | |
"learning_rate": 9.966329196603527e-06, | |
"loss": 0.7638, | |
"step": 1081 | |
}, | |
{ | |
"epoch": 0.03533061224489796, | |
"grad_norm": 2.9399259090423584, | |
"learning_rate": 9.96629653821032e-06, | |
"loss": 0.7594, | |
"step": 1082 | |
}, | |
{ | |
"epoch": 0.03536326530612245, | |
"grad_norm": 3.1555962562561035, | |
"learning_rate": 9.966263879817114e-06, | |
"loss": 0.7688, | |
"step": 1083 | |
}, | |
{ | |
"epoch": 0.035395918367346936, | |
"grad_norm": 3.03648042678833, | |
"learning_rate": 9.966231221423907e-06, | |
"loss": 0.8, | |
"step": 1084 | |
}, | |
{ | |
"epoch": 0.03542857142857143, | |
"grad_norm": 3.142136573791504, | |
"learning_rate": 9.9661985630307e-06, | |
"loss": 0.7502, | |
"step": 1085 | |
}, | |
{ | |
"epoch": 0.03546122448979592, | |
"grad_norm": 3.103031635284424, | |
"learning_rate": 9.966165904637492e-06, | |
"loss": 0.7979, | |
"step": 1086 | |
}, | |
{ | |
"epoch": 0.03549387755102041, | |
"grad_norm": 3.0145888328552246, | |
"learning_rate": 9.966133246244287e-06, | |
"loss": 0.7669, | |
"step": 1087 | |
}, | |
{ | |
"epoch": 0.0355265306122449, | |
"grad_norm": 3.1782283782958984, | |
"learning_rate": 9.966100587851078e-06, | |
"loss": 0.7677, | |
"step": 1088 | |
}, | |
{ | |
"epoch": 0.035559183673469386, | |
"grad_norm": 3.102893829345703, | |
"learning_rate": 9.966067929457871e-06, | |
"loss": 0.7651, | |
"step": 1089 | |
}, | |
{ | |
"epoch": 0.03559183673469388, | |
"grad_norm": 3.0069074630737305, | |
"learning_rate": 9.966035271064665e-06, | |
"loss": 0.7729, | |
"step": 1090 | |
}, | |
{ | |
"epoch": 0.03562448979591837, | |
"grad_norm": 3.022388458251953, | |
"learning_rate": 9.966002612671456e-06, | |
"loss": 0.7537, | |
"step": 1091 | |
}, | |
{ | |
"epoch": 0.03565714285714286, | |
"grad_norm": 3.0949289798736572, | |
"learning_rate": 9.965969954278251e-06, | |
"loss": 0.77, | |
"step": 1092 | |
}, | |
{ | |
"epoch": 0.035689795918367344, | |
"grad_norm": 3.48496413230896, | |
"learning_rate": 9.965937295885043e-06, | |
"loss": 0.7671, | |
"step": 1093 | |
}, | |
{ | |
"epoch": 0.035722448979591835, | |
"grad_norm": 3.1143581867218018, | |
"learning_rate": 9.965904637491836e-06, | |
"loss": 0.7451, | |
"step": 1094 | |
}, | |
{ | |
"epoch": 0.035755102040816326, | |
"grad_norm": 2.93046498298645, | |
"learning_rate": 9.96587197909863e-06, | |
"loss": 0.7878, | |
"step": 1095 | |
}, | |
{ | |
"epoch": 0.03578775510204082, | |
"grad_norm": 3.0415232181549072, | |
"learning_rate": 9.96583932070542e-06, | |
"loss": 0.7465, | |
"step": 1096 | |
}, | |
{ | |
"epoch": 0.03582040816326531, | |
"grad_norm": 3.1707770824432373, | |
"learning_rate": 9.965806662312216e-06, | |
"loss": 0.7548, | |
"step": 1097 | |
}, | |
{ | |
"epoch": 0.03585306122448979, | |
"grad_norm": 3.0290682315826416, | |
"learning_rate": 9.965774003919007e-06, | |
"loss": 0.7703, | |
"step": 1098 | |
}, | |
{ | |
"epoch": 0.035885714285714285, | |
"grad_norm": 2.925924777984619, | |
"learning_rate": 9.9657413455258e-06, | |
"loss": 0.7653, | |
"step": 1099 | |
}, | |
{ | |
"epoch": 0.035918367346938776, | |
"grad_norm": 3.2360856533050537, | |
"learning_rate": 9.965708687132594e-06, | |
"loss": 0.7705, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.035918367346938776, | |
"eval_loss": 0.8095739483833313, | |
"eval_runtime": 95.0564, | |
"eval_samples_per_second": 1.052, | |
"eval_steps_per_second": 1.052, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.03595102040816327, | |
"grad_norm": 3.276266574859619, | |
"learning_rate": 9.965676028739387e-06, | |
"loss": 0.7926, | |
"step": 1101 | |
}, | |
{ | |
"epoch": 0.03598367346938776, | |
"grad_norm": 3.069849967956543, | |
"learning_rate": 9.96564337034618e-06, | |
"loss": 0.7966, | |
"step": 1102 | |
}, | |
{ | |
"epoch": 0.03601632653061224, | |
"grad_norm": 3.484065532684326, | |
"learning_rate": 9.965610711952973e-06, | |
"loss": 0.7744, | |
"step": 1103 | |
}, | |
{ | |
"epoch": 0.036048979591836734, | |
"grad_norm": 3.468071699142456, | |
"learning_rate": 9.965578053559765e-06, | |
"loss": 0.7575, | |
"step": 1104 | |
}, | |
{ | |
"epoch": 0.036081632653061226, | |
"grad_norm": 3.8356101512908936, | |
"learning_rate": 9.965545395166558e-06, | |
"loss": 0.7817, | |
"step": 1105 | |
}, | |
{ | |
"epoch": 0.03611428571428572, | |
"grad_norm": 3.2512879371643066, | |
"learning_rate": 9.965512736773351e-06, | |
"loss": 0.7356, | |
"step": 1106 | |
}, | |
{ | |
"epoch": 0.0361469387755102, | |
"grad_norm": 3.004352569580078, | |
"learning_rate": 9.965480078380145e-06, | |
"loss": 0.7286, | |
"step": 1107 | |
}, | |
{ | |
"epoch": 0.03617959183673469, | |
"grad_norm": 3.194286823272705, | |
"learning_rate": 9.965447419986938e-06, | |
"loss": 0.7474, | |
"step": 1108 | |
}, | |
{ | |
"epoch": 0.036212244897959184, | |
"grad_norm": 3.509319543838501, | |
"learning_rate": 9.96541476159373e-06, | |
"loss": 0.7888, | |
"step": 1109 | |
}, | |
{ | |
"epoch": 0.036244897959183675, | |
"grad_norm": 3.528698205947876, | |
"learning_rate": 9.965382103200524e-06, | |
"loss": 0.794, | |
"step": 1110 | |
}, | |
{ | |
"epoch": 0.036277551020408166, | |
"grad_norm": 3.2880067825317383, | |
"learning_rate": 9.965349444807316e-06, | |
"loss": 0.7408, | |
"step": 1111 | |
}, | |
{ | |
"epoch": 0.03631020408163265, | |
"grad_norm": 3.031257152557373, | |
"learning_rate": 9.965316786414109e-06, | |
"loss": 0.7847, | |
"step": 1112 | |
}, | |
{ | |
"epoch": 0.03634285714285714, | |
"grad_norm": 3.2594382762908936, | |
"learning_rate": 9.965284128020902e-06, | |
"loss": 0.7911, | |
"step": 1113 | |
}, | |
{ | |
"epoch": 0.03637551020408163, | |
"grad_norm": 3.2341842651367188, | |
"learning_rate": 9.965251469627694e-06, | |
"loss": 0.7412, | |
"step": 1114 | |
}, | |
{ | |
"epoch": 0.036408163265306125, | |
"grad_norm": 3.251246213912964, | |
"learning_rate": 9.965218811234489e-06, | |
"loss": 0.7549, | |
"step": 1115 | |
}, | |
{ | |
"epoch": 0.03644081632653061, | |
"grad_norm": 3.0365469455718994, | |
"learning_rate": 9.96518615284128e-06, | |
"loss": 0.78, | |
"step": 1116 | |
}, | |
{ | |
"epoch": 0.0364734693877551, | |
"grad_norm": 3.059936285018921, | |
"learning_rate": 9.965153494448074e-06, | |
"loss": 0.805, | |
"step": 1117 | |
}, | |
{ | |
"epoch": 0.03650612244897959, | |
"grad_norm": 3.0952272415161133, | |
"learning_rate": 9.965120836054867e-06, | |
"loss": 0.7563, | |
"step": 1118 | |
}, | |
{ | |
"epoch": 0.03653877551020408, | |
"grad_norm": 3.136528491973877, | |
"learning_rate": 9.96508817766166e-06, | |
"loss": 0.7767, | |
"step": 1119 | |
}, | |
{ | |
"epoch": 0.036571428571428574, | |
"grad_norm": 2.979304313659668, | |
"learning_rate": 9.965055519268453e-06, | |
"loss": 0.8002, | |
"step": 1120 | |
}, | |
{ | |
"epoch": 0.03660408163265306, | |
"grad_norm": 2.8801610469818115, | |
"learning_rate": 9.965022860875245e-06, | |
"loss": 0.7793, | |
"step": 1121 | |
}, | |
{ | |
"epoch": 0.03663673469387755, | |
"grad_norm": 2.9416167736053467, | |
"learning_rate": 9.964990202482038e-06, | |
"loss": 0.7703, | |
"step": 1122 | |
}, | |
{ | |
"epoch": 0.03666938775510204, | |
"grad_norm": 3.1201093196868896, | |
"learning_rate": 9.964957544088831e-06, | |
"loss": 0.7699, | |
"step": 1123 | |
}, | |
{ | |
"epoch": 0.03670204081632653, | |
"grad_norm": 3.1061949729919434, | |
"learning_rate": 9.964924885695625e-06, | |
"loss": 0.7974, | |
"step": 1124 | |
}, | |
{ | |
"epoch": 0.036734693877551024, | |
"grad_norm": 3.0530447959899902, | |
"learning_rate": 9.964892227302418e-06, | |
"loss": 0.7576, | |
"step": 1125 | |
}, | |
{ | |
"epoch": 0.03676734693877551, | |
"grad_norm": 3.002209186553955, | |
"learning_rate": 9.964859568909211e-06, | |
"loss": 0.7699, | |
"step": 1126 | |
}, | |
{ | |
"epoch": 0.0368, | |
"grad_norm": 3.1953022480010986, | |
"learning_rate": 9.964826910516003e-06, | |
"loss": 0.7153, | |
"step": 1127 | |
}, | |
{ | |
"epoch": 0.03683265306122449, | |
"grad_norm": 3.091996192932129, | |
"learning_rate": 9.964794252122796e-06, | |
"loss": 0.7368, | |
"step": 1128 | |
}, | |
{ | |
"epoch": 0.03686530612244898, | |
"grad_norm": 3.173081398010254, | |
"learning_rate": 9.964761593729589e-06, | |
"loss": 0.7839, | |
"step": 1129 | |
}, | |
{ | |
"epoch": 0.036897959183673466, | |
"grad_norm": 3.0133304595947266, | |
"learning_rate": 9.964728935336382e-06, | |
"loss": 0.7827, | |
"step": 1130 | |
}, | |
{ | |
"epoch": 0.03693061224489796, | |
"grad_norm": 3.1596643924713135, | |
"learning_rate": 9.964696276943176e-06, | |
"loss": 0.7649, | |
"step": 1131 | |
}, | |
{ | |
"epoch": 0.03696326530612245, | |
"grad_norm": 3.2854256629943848, | |
"learning_rate": 9.964663618549967e-06, | |
"loss": 0.751, | |
"step": 1132 | |
}, | |
{ | |
"epoch": 0.03699591836734694, | |
"grad_norm": 2.8992016315460205, | |
"learning_rate": 9.964630960156762e-06, | |
"loss": 0.7535, | |
"step": 1133 | |
}, | |
{ | |
"epoch": 0.03702857142857143, | |
"grad_norm": 2.9541709423065186, | |
"learning_rate": 9.964598301763554e-06, | |
"loss": 0.7433, | |
"step": 1134 | |
}, | |
{ | |
"epoch": 0.037061224489795916, | |
"grad_norm": 2.9385082721710205, | |
"learning_rate": 9.964565643370347e-06, | |
"loss": 0.8005, | |
"step": 1135 | |
}, | |
{ | |
"epoch": 0.03709387755102041, | |
"grad_norm": 3.042072057723999, | |
"learning_rate": 9.96453298497714e-06, | |
"loss": 0.7819, | |
"step": 1136 | |
}, | |
{ | |
"epoch": 0.0371265306122449, | |
"grad_norm": 2.9819841384887695, | |
"learning_rate": 9.964500326583932e-06, | |
"loss": 0.7911, | |
"step": 1137 | |
}, | |
{ | |
"epoch": 0.03715918367346939, | |
"grad_norm": 3.242607831954956, | |
"learning_rate": 9.964467668190726e-06, | |
"loss": 0.7926, | |
"step": 1138 | |
}, | |
{ | |
"epoch": 0.037191836734693874, | |
"grad_norm": 2.9996449947357178, | |
"learning_rate": 9.964435009797518e-06, | |
"loss": 0.7904, | |
"step": 1139 | |
}, | |
{ | |
"epoch": 0.037224489795918365, | |
"grad_norm": 3.1378376483917236, | |
"learning_rate": 9.964402351404311e-06, | |
"loss": 0.8, | |
"step": 1140 | |
}, | |
{ | |
"epoch": 0.03725714285714286, | |
"grad_norm": 2.9573757648468018, | |
"learning_rate": 9.964369693011105e-06, | |
"loss": 0.7589, | |
"step": 1141 | |
}, | |
{ | |
"epoch": 0.03728979591836735, | |
"grad_norm": 3.122229814529419, | |
"learning_rate": 9.964337034617898e-06, | |
"loss": 0.7852, | |
"step": 1142 | |
}, | |
{ | |
"epoch": 0.03732244897959184, | |
"grad_norm": 3.3379294872283936, | |
"learning_rate": 9.964304376224691e-06, | |
"loss": 0.7868, | |
"step": 1143 | |
}, | |
{ | |
"epoch": 0.037355102040816324, | |
"grad_norm": 3.2145421504974365, | |
"learning_rate": 9.964271717831484e-06, | |
"loss": 0.7516, | |
"step": 1144 | |
}, | |
{ | |
"epoch": 0.037387755102040815, | |
"grad_norm": 3.068854808807373, | |
"learning_rate": 9.964239059438276e-06, | |
"loss": 0.7539, | |
"step": 1145 | |
}, | |
{ | |
"epoch": 0.037420408163265306, | |
"grad_norm": 3.1995043754577637, | |
"learning_rate": 9.964206401045069e-06, | |
"loss": 0.7927, | |
"step": 1146 | |
}, | |
{ | |
"epoch": 0.0374530612244898, | |
"grad_norm": 3.0854032039642334, | |
"learning_rate": 9.964173742651862e-06, | |
"loss": 0.7608, | |
"step": 1147 | |
}, | |
{ | |
"epoch": 0.03748571428571429, | |
"grad_norm": 3.3977510929107666, | |
"learning_rate": 9.964141084258655e-06, | |
"loss": 0.779, | |
"step": 1148 | |
}, | |
{ | |
"epoch": 0.03751836734693877, | |
"grad_norm": 3.0493321418762207, | |
"learning_rate": 9.964108425865449e-06, | |
"loss": 0.7329, | |
"step": 1149 | |
}, | |
{ | |
"epoch": 0.037551020408163265, | |
"grad_norm": 2.8519468307495117, | |
"learning_rate": 9.96407576747224e-06, | |
"loss": 0.7954, | |
"step": 1150 | |
}, | |
{ | |
"epoch": 0.037551020408163265, | |
"eval_loss": 0.8091681003570557, | |
"eval_runtime": 85.0018, | |
"eval_samples_per_second": 1.176, | |
"eval_steps_per_second": 1.176, | |
"step": 1150 | |
}, | |
{ | |
"epoch": 0.037583673469387756, | |
"grad_norm": 2.9106438159942627, | |
"learning_rate": 9.964043109079035e-06, | |
"loss": 0.7796, | |
"step": 1151 | |
}, | |
{ | |
"epoch": 0.03761632653061225, | |
"grad_norm": 3.1851396560668945, | |
"learning_rate": 9.964010450685827e-06, | |
"loss": 0.7718, | |
"step": 1152 | |
}, | |
{ | |
"epoch": 0.03764897959183673, | |
"grad_norm": 3.101126194000244, | |
"learning_rate": 9.96397779229262e-06, | |
"loss": 0.8017, | |
"step": 1153 | |
}, | |
{ | |
"epoch": 0.03768163265306122, | |
"grad_norm": 3.1867284774780273, | |
"learning_rate": 9.963945133899413e-06, | |
"loss": 0.7973, | |
"step": 1154 | |
}, | |
{ | |
"epoch": 0.037714285714285714, | |
"grad_norm": 2.98366379737854, | |
"learning_rate": 9.963912475506205e-06, | |
"loss": 0.8105, | |
"step": 1155 | |
}, | |
{ | |
"epoch": 0.037746938775510205, | |
"grad_norm": 2.998530387878418, | |
"learning_rate": 9.963879817113e-06, | |
"loss": 0.7738, | |
"step": 1156 | |
}, | |
{ | |
"epoch": 0.0377795918367347, | |
"grad_norm": 3.0798680782318115, | |
"learning_rate": 9.963847158719791e-06, | |
"loss": 0.7693, | |
"step": 1157 | |
}, | |
{ | |
"epoch": 0.03781224489795918, | |
"grad_norm": 3.2727184295654297, | |
"learning_rate": 9.963814500326584e-06, | |
"loss": 0.7271, | |
"step": 1158 | |
}, | |
{ | |
"epoch": 0.03784489795918367, | |
"grad_norm": 3.05658221244812, | |
"learning_rate": 9.963781841933378e-06, | |
"loss": 0.7759, | |
"step": 1159 | |
}, | |
{ | |
"epoch": 0.037877551020408164, | |
"grad_norm": 3.0974197387695312, | |
"learning_rate": 9.963749183540171e-06, | |
"loss": 0.7844, | |
"step": 1160 | |
}, | |
{ | |
"epoch": 0.037910204081632655, | |
"grad_norm": 3.1191930770874023, | |
"learning_rate": 9.963716525146964e-06, | |
"loss": 0.8066, | |
"step": 1161 | |
}, | |
{ | |
"epoch": 0.037942857142857146, | |
"grad_norm": 3.0258705615997314, | |
"learning_rate": 9.963683866753756e-06, | |
"loss": 0.7373, | |
"step": 1162 | |
}, | |
{ | |
"epoch": 0.03797551020408163, | |
"grad_norm": 3.1126925945281982, | |
"learning_rate": 9.963651208360549e-06, | |
"loss": 0.7577, | |
"step": 1163 | |
}, | |
{ | |
"epoch": 0.03800816326530612, | |
"grad_norm": 3.1630795001983643, | |
"learning_rate": 9.963618549967342e-06, | |
"loss": 0.7333, | |
"step": 1164 | |
}, | |
{ | |
"epoch": 0.03804081632653061, | |
"grad_norm": 3.100018262863159, | |
"learning_rate": 9.963585891574135e-06, | |
"loss": 0.7697, | |
"step": 1165 | |
}, | |
{ | |
"epoch": 0.038073469387755104, | |
"grad_norm": 3.369335889816284, | |
"learning_rate": 9.963553233180929e-06, | |
"loss": 0.7788, | |
"step": 1166 | |
}, | |
{ | |
"epoch": 0.03810612244897959, | |
"grad_norm": 3.357944965362549, | |
"learning_rate": 9.963520574787722e-06, | |
"loss": 0.7744, | |
"step": 1167 | |
}, | |
{ | |
"epoch": 0.03813877551020408, | |
"grad_norm": 3.253232717514038, | |
"learning_rate": 9.963487916394513e-06, | |
"loss": 0.752, | |
"step": 1168 | |
}, | |
{ | |
"epoch": 0.03817142857142857, | |
"grad_norm": 2.9491517543792725, | |
"learning_rate": 9.963455258001307e-06, | |
"loss": 0.7657, | |
"step": 1169 | |
}, | |
{ | |
"epoch": 0.03820408163265306, | |
"grad_norm": 3.1153948307037354, | |
"learning_rate": 9.9634225996081e-06, | |
"loss": 0.7545, | |
"step": 1170 | |
}, | |
{ | |
"epoch": 0.038236734693877554, | |
"grad_norm": 3.0334460735321045, | |
"learning_rate": 9.963389941214893e-06, | |
"loss": 0.7999, | |
"step": 1171 | |
}, | |
{ | |
"epoch": 0.03826938775510204, | |
"grad_norm": 3.0638813972473145, | |
"learning_rate": 9.963357282821686e-06, | |
"loss": 0.7922, | |
"step": 1172 | |
}, | |
{ | |
"epoch": 0.03830204081632653, | |
"grad_norm": 3.0759074687957764, | |
"learning_rate": 9.963324624428478e-06, | |
"loss": 0.7521, | |
"step": 1173 | |
}, | |
{ | |
"epoch": 0.03833469387755102, | |
"grad_norm": 3.0248312950134277, | |
"learning_rate": 9.963291966035273e-06, | |
"loss": 0.7695, | |
"step": 1174 | |
}, | |
{ | |
"epoch": 0.03836734693877551, | |
"grad_norm": 3.101550817489624, | |
"learning_rate": 9.963259307642064e-06, | |
"loss": 0.7446, | |
"step": 1175 | |
}, | |
{ | |
"epoch": 0.0384, | |
"grad_norm": 3.0817489624023438, | |
"learning_rate": 9.963226649248858e-06, | |
"loss": 0.7653, | |
"step": 1176 | |
}, | |
{ | |
"epoch": 0.03843265306122449, | |
"grad_norm": 3.051537036895752, | |
"learning_rate": 9.96319399085565e-06, | |
"loss": 0.7723, | |
"step": 1177 | |
}, | |
{ | |
"epoch": 0.03846530612244898, | |
"grad_norm": 3.030776023864746, | |
"learning_rate": 9.963161332462442e-06, | |
"loss": 0.7762, | |
"step": 1178 | |
}, | |
{ | |
"epoch": 0.03849795918367347, | |
"grad_norm": 3.1081271171569824, | |
"learning_rate": 9.963128674069237e-06, | |
"loss": 0.8198, | |
"step": 1179 | |
}, | |
{ | |
"epoch": 0.03853061224489796, | |
"grad_norm": 3.1396594047546387, | |
"learning_rate": 9.963096015676029e-06, | |
"loss": 0.7389, | |
"step": 1180 | |
}, | |
{ | |
"epoch": 0.038563265306122446, | |
"grad_norm": 2.977698564529419, | |
"learning_rate": 9.963063357282822e-06, | |
"loss": 0.7961, | |
"step": 1181 | |
}, | |
{ | |
"epoch": 0.03859591836734694, | |
"grad_norm": 3.048220157623291, | |
"learning_rate": 9.963030698889615e-06, | |
"loss": 0.8025, | |
"step": 1182 | |
}, | |
{ | |
"epoch": 0.03862857142857143, | |
"grad_norm": 3.087770700454712, | |
"learning_rate": 9.962998040496409e-06, | |
"loss": 0.6993, | |
"step": 1183 | |
}, | |
{ | |
"epoch": 0.03866122448979592, | |
"grad_norm": 3.0148611068725586, | |
"learning_rate": 9.962965382103202e-06, | |
"loss": 0.7555, | |
"step": 1184 | |
}, | |
{ | |
"epoch": 0.03869387755102041, | |
"grad_norm": 3.4241836071014404, | |
"learning_rate": 9.962932723709993e-06, | |
"loss": 0.7876, | |
"step": 1185 | |
}, | |
{ | |
"epoch": 0.038726530612244896, | |
"grad_norm": 3.200383424758911, | |
"learning_rate": 9.962900065316787e-06, | |
"loss": 0.7739, | |
"step": 1186 | |
}, | |
{ | |
"epoch": 0.03875918367346939, | |
"grad_norm": 3.0834004878997803, | |
"learning_rate": 9.96286740692358e-06, | |
"loss": 0.7524, | |
"step": 1187 | |
}, | |
{ | |
"epoch": 0.03879183673469388, | |
"grad_norm": 2.9796390533447266, | |
"learning_rate": 9.962834748530373e-06, | |
"loss": 0.7495, | |
"step": 1188 | |
}, | |
{ | |
"epoch": 0.03882448979591837, | |
"grad_norm": 3.1456053256988525, | |
"learning_rate": 9.962802090137166e-06, | |
"loss": 0.7939, | |
"step": 1189 | |
}, | |
{ | |
"epoch": 0.038857142857142854, | |
"grad_norm": 3.227130651473999, | |
"learning_rate": 9.96276943174396e-06, | |
"loss": 0.7283, | |
"step": 1190 | |
}, | |
{ | |
"epoch": 0.038889795918367345, | |
"grad_norm": 3.2170636653900146, | |
"learning_rate": 9.962736773350751e-06, | |
"loss": 0.7481, | |
"step": 1191 | |
}, | |
{ | |
"epoch": 0.03892244897959184, | |
"grad_norm": 6.70438814163208, | |
"learning_rate": 9.962704114957546e-06, | |
"loss": 0.7565, | |
"step": 1192 | |
}, | |
{ | |
"epoch": 0.03895510204081633, | |
"grad_norm": 3.2916693687438965, | |
"learning_rate": 9.962671456564338e-06, | |
"loss": 0.7732, | |
"step": 1193 | |
}, | |
{ | |
"epoch": 0.03898775510204082, | |
"grad_norm": 3.4526243209838867, | |
"learning_rate": 9.96263879817113e-06, | |
"loss": 0.7563, | |
"step": 1194 | |
}, | |
{ | |
"epoch": 0.039020408163265304, | |
"grad_norm": 3.188997507095337, | |
"learning_rate": 9.962606139777924e-06, | |
"loss": 0.771, | |
"step": 1195 | |
}, | |
{ | |
"epoch": 0.039053061224489795, | |
"grad_norm": 3.0594732761383057, | |
"learning_rate": 9.962573481384716e-06, | |
"loss": 0.7433, | |
"step": 1196 | |
}, | |
{ | |
"epoch": 0.039085714285714286, | |
"grad_norm": 3.239617109298706, | |
"learning_rate": 9.96254082299151e-06, | |
"loss": 0.7823, | |
"step": 1197 | |
}, | |
{ | |
"epoch": 0.03911836734693878, | |
"grad_norm": 2.983693838119507, | |
"learning_rate": 9.962508164598302e-06, | |
"loss": 0.7557, | |
"step": 1198 | |
}, | |
{ | |
"epoch": 0.03915102040816327, | |
"grad_norm": 2.9180068969726562, | |
"learning_rate": 9.962475506205095e-06, | |
"loss": 0.7915, | |
"step": 1199 | |
}, | |
{ | |
"epoch": 0.03918367346938775, | |
"grad_norm": 3.102025270462036, | |
"learning_rate": 9.962442847811888e-06, | |
"loss": 0.764, | |
"step": 1200 | |
}, | |
{ | |
"epoch": 0.03918367346938775, | |
"eval_loss": 0.807881772518158, | |
"eval_runtime": 85.0203, | |
"eval_samples_per_second": 1.176, | |
"eval_steps_per_second": 1.176, | |
"step": 1200 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 306250, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 50, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 1.305701335141549e+18, | |
"train_batch_size": 1, | |
"trial_name": null, | |
"trial_params": null | |
} | |