|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.04019846943066185, |
|
"eval_steps": 500, |
|
"global_step": 239, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00016819443276427552, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.2492, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00033638886552855103, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0005045832982928265, |
|
"grad_norm": NaN, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0006727777310571021, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0008409721638213775, |
|
"grad_norm": NaN, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.001009166596585653, |
|
"grad_norm": NaN, |
|
"learning_rate": 6e-06, |
|
"loss": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0011773610293499286, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0013455554621142041, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0015137498948784795, |
|
"grad_norm": NaN, |
|
"learning_rate": 9e-06, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.001681944327642755, |
|
"grad_norm": NaN, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0018501387604070306, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.002018333193171306, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0021865276259355813, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.002354722058699857, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0025229164914641325, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0026911109242284082, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0028593053569926836, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.003027499789756959, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0031956942225212347, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.00336388865528551, |
|
"grad_norm": NaN, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0035320830880497854, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0037002775208140612, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0038684719535783366, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.004036666386342612, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.004204860819106888, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.004373055251871163, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.004541249684635438, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.004709444117399714, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.00487763855016399, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.005045832982928265, |
|
"grad_norm": NaN, |
|
"learning_rate": 3e-05, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.005214027415692541, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0053822218484568165, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.005550416281221091, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.005718610713985367, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.005886805146749643, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.006054999579513918, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.006223194012278194, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0063913884450424695, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.006559582877806744, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.00672777731057102, |
|
"grad_norm": NaN, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.006895971743335296, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.007064166176099571, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.007232360608863847, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0074005550416281225, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.007568749474392397, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.007736943907156673, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.007905138339920948, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.008073332772685224, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0082415272054495, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.008409721638213775, |
|
"grad_norm": NaN, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.008577916070978051, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.008746110503742325, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.008914304936506601, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.009082499369270877, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.009250693802035153, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.009418888234799428, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.009587082667563704, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.00975527710032798, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.009923471533092254, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.01009166596585653, |
|
"grad_norm": NaN, |
|
"learning_rate": 6e-05, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.010259860398620806, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.010428054831385081, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.010596249264149357, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.010764443696913633, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.010932638129677907, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.011100832562442183, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.011269026995206459, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.011437221427970734, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.01160541586073501, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.011773610293499286, |
|
"grad_norm": NaN, |
|
"learning_rate": 7e-05, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01194180472626356, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.012109999159027836, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.012278193591792112, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.012446388024556387, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.012614582457320663, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.012782776890084939, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.012950971322849213, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.013119165755613489, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.013287360188377765, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.01345555462114204, |
|
"grad_norm": NaN, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.013623749053906316, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.013791943486670592, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.013960137919434866, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.3e-05, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.014128332352199142, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.014296526784963418, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.014464721217727693, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.014632915650491969, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.014801110083256245, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.014969304516020519, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.015137498948784795, |
|
"grad_norm": NaN, |
|
"learning_rate": 9e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01530569338154907, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.015473887814313346, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.01564208224707762, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.015810276679841896, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.015978471112606172, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.016146665545370448, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.016314859978134724, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.016483054410899, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.016651248843663275, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.01681944327642755, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016987637709191827, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999999277778003e-05, |
|
"loss": 0.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.017155832141956102, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999997111112216e-05, |
|
"loss": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.017324026574720378, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999993500003267e-05, |
|
"loss": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.01749222100748465, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999988444452199e-05, |
|
"loss": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.017660415440248926, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999981944460473e-05, |
|
"loss": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.017828609873013202, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999974000029966e-05, |
|
"loss": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.017996804305777478, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999964611162974e-05, |
|
"loss": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.018164998738541754, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999953777862207e-05, |
|
"loss": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.01833319317130603, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999941500130797e-05, |
|
"loss": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.018501387604070305, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99992777797229e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01866958203683458, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999912611390651e-05, |
|
"loss": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.018837776469598857, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999896000390261e-05, |
|
"loss": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.019005970902363133, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999877944975917e-05, |
|
"loss": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.01917416533512741, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999858445152839e-05, |
|
"loss": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.019342359767891684, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999837500926656e-05, |
|
"loss": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.01951055420065596, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99981511230342e-05, |
|
"loss": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.019678748633420232, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999791279289601e-05, |
|
"loss": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.019846943066184508, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999766001892081e-05, |
|
"loss": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.020015137498948784, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999739280118163e-05, |
|
"loss": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.02018333193171306, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999711113975568e-05, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.020351526364477335, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999681503472433e-05, |
|
"loss": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.02051972079724161, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99965044861731e-05, |
|
"loss": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.020687915230005887, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999617949419174e-05, |
|
"loss": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.020856109662770163, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999584005887407e-05, |
|
"loss": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.02102430409553444, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999548618031823e-05, |
|
"loss": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.021192498528298714, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99951178586264e-05, |
|
"loss": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.02136069296106299, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.9994735093905e-05, |
|
"loss": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.021528887393827266, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999433788626461e-05, |
|
"loss": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.02169708182659154, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999392623581997e-05, |
|
"loss": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.021865276259355814, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999350014269e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02203347069212009, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999305960699781e-05, |
|
"loss": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.022201665124884366, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999260462887064e-05, |
|
"loss": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.02236985955764864, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999213520843994e-05, |
|
"loss": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.022538053990412917, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999165134584133e-05, |
|
"loss": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.022706248423177193, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999115304121457e-05, |
|
"loss": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02287444285594147, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999064029470366e-05, |
|
"loss": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.023042637288705745, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.999011310645668e-05, |
|
"loss": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.02321083172147002, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998957147662594e-05, |
|
"loss": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.023379026154234296, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998901540536792e-05, |
|
"loss": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.023547220586998572, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998844489284327e-05, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.023715415019762844, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998785993921678e-05, |
|
"loss": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.02388360945252712, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998726054465744e-05, |
|
"loss": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.024051803885291396, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998664670933844e-05, |
|
"loss": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.02421999831805567, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998601843343707e-05, |
|
"loss": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.024388192750819947, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998537571713487e-05, |
|
"loss": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.024556387183584223, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998471856061747e-05, |
|
"loss": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.0247245816163485, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998404696407476e-05, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.024892776049112775, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998336092770073e-05, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.02506097048187705, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998266045169356e-05, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.025229164914641326, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998194553625563e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.025397359347405602, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998121618159346e-05, |
|
"loss": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.025565553780169878, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.998047238791777e-05, |
|
"loss": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.02573374821293415, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997971415544341e-05, |
|
"loss": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.025901942645698426, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997894148438944e-05, |
|
"loss": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.026070137078462702, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997815437497908e-05, |
|
"loss": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.026238331511226978, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997735282743969e-05, |
|
"loss": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.026406525943991253, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997653684200286e-05, |
|
"loss": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.02657472037675553, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99757064189043e-05, |
|
"loss": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.026742914809519805, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997486155838392e-05, |
|
"loss": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.02691110924228408, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997400226068578e-05, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.027079303675048357, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997312852605814e-05, |
|
"loss": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.027247498107812632, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997224035475339e-05, |
|
"loss": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.027415692540576908, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997133774702812e-05, |
|
"loss": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.027583886973341184, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.997042070314309e-05, |
|
"loss": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.027752081406105456, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996948922336323e-05, |
|
"loss": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.027920275838869732, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996854330795761e-05, |
|
"loss": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.028088470271634008, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996758295719951e-05, |
|
"loss": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.028256664704398284, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996660817136636e-05, |
|
"loss": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.02842485913716256, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996561895073976e-05, |
|
"loss": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.028593053569926835, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996461529560553e-05, |
|
"loss": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.02876124800269111, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996359720625354e-05, |
|
"loss": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.028929442435455387, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996256468297795e-05, |
|
"loss": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.029097636868219662, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996151772607704e-05, |
|
"loss": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.029265831300983938, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.996045633585326e-05, |
|
"loss": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.029434025733748214, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995938051261324e-05, |
|
"loss": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02960222016651249, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995829025666775e-05, |
|
"loss": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.029770414599276766, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995718556833178e-05, |
|
"loss": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.029938609032041038, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995606644792446e-05, |
|
"loss": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.030106803464805314, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995493289576907e-05, |
|
"loss": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.03027499789756959, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99537849121931e-05, |
|
"loss": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.030443192330333865, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995262249752817e-05, |
|
"loss": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.03061138676309814, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.995144565211012e-05, |
|
"loss": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.030779581195862417, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99502543762789e-05, |
|
"loss": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.030947775628626693, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994904867037867e-05, |
|
"loss": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.03111597006139097, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994782853475774e-05, |
|
"loss": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.03128416449415524, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994659396976859e-05, |
|
"loss": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.03145235892691952, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994534497576787e-05, |
|
"loss": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.03162055335968379, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994408155311642e-05, |
|
"loss": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.03178874779244807, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994280370217922e-05, |
|
"loss": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.031956942225212344, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99415114233254e-05, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03212513665797662, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.994020471692833e-05, |
|
"loss": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.032293331090740895, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993888358336545e-05, |
|
"loss": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.032461525523505175, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993754802301847e-05, |
|
"loss": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.03262971995626945, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993619803627321e-05, |
|
"loss": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.032797914389033726, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993483362351963e-05, |
|
"loss": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.032966108821798, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993345478515194e-05, |
|
"loss": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.03313430325456227, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993206152156844e-05, |
|
"loss": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.03330249768732655, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.993065383317163e-05, |
|
"loss": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.03347069212009082, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.992923172036819e-05, |
|
"loss": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.0336388865528551, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.992779518356896e-05, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.033807080985619374, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99263442231889e-05, |
|
"loss": 0.0, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.03397527541838365, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99248788396472e-05, |
|
"loss": 0.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.034143469851147926, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.992339903336722e-05, |
|
"loss": 0.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.034311664283912205, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.992190480477641e-05, |
|
"loss": 0.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.03447985871667648, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.992039615430648e-05, |
|
"loss": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.034648053149440756, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.991887308239322e-05, |
|
"loss": 0.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.03481624758220503, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.991733558947667e-05, |
|
"loss": 0.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.0349844420149693, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.991578367600096e-05, |
|
"loss": 0.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.03515263644773358, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.991421734241444e-05, |
|
"loss": 0.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.03532083088049785, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.99126365891696e-05, |
|
"loss": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03548902531326213, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.991104141672309e-05, |
|
"loss": 0.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.035657219746026404, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990943182553579e-05, |
|
"loss": 0.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.035825414178790684, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990780781607261e-05, |
|
"loss": 0.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.035993608611554956, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990616938880278e-05, |
|
"loss": 0.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.036161803044319235, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990451654419957e-05, |
|
"loss": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.03632999747708351, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990284928274051e-05, |
|
"loss": 0.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.03649819190984779, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.990116760490723e-05, |
|
"loss": 0.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.03666638634261206, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.989947151118555e-05, |
|
"loss": 0.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.03683458077537634, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.989776100206548e-05, |
|
"loss": 0.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.03700277520814061, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.989603607804112e-05, |
|
"loss": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03717096964090488, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.98942967396108e-05, |
|
"loss": 0.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.03733916407366916, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.989254298727702e-05, |
|
"loss": 0.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.037507358506433434, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.989077482154638e-05, |
|
"loss": 0.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.037675552939197714, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.988899224292971e-05, |
|
"loss": 0.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.037843747371961986, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.988719525194198e-05, |
|
"loss": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.038011941804726265, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.988538384910231e-05, |
|
"loss": 0.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.03818013623749054, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.988355803493398e-05, |
|
"loss": 0.0, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.03834833067025482, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.988171780996446e-05, |
|
"loss": 0.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.03851652510301909, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.987986317472539e-05, |
|
"loss": 0.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.03868471953578337, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.987799412975252e-05, |
|
"loss": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03885291396854764, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.987611067558582e-05, |
|
"loss": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.03902110840131192, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.987421281276939e-05, |
|
"loss": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.03918930283407619, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.98723005418515e-05, |
|
"loss": 0.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.039357497266840465, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.987037386338458e-05, |
|
"loss": 0.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.039525691699604744, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.986843277792523e-05, |
|
"loss": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.039693886132369016, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.986647728603422e-05, |
|
"loss": 0.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.039862080565133295, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.986450738827646e-05, |
|
"loss": 0.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.04003027499789757, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.986252308522101e-05, |
|
"loss": 0.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.04019846943066185, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.986052437744115e-05, |
|
"loss": 0.0, |
|
"step": 239 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 5945, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 239, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4501042330337280.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|