{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2757479663587481,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0013787398317937406,
      "grad_norm": 1.0734721422195435,
      "learning_rate": 0.0002,
      "loss": 2.8003,
      "step": 10
    },
    {
      "epoch": 0.002757479663587481,
      "grad_norm": 0.19643057882785797,
      "learning_rate": 0.0002,
      "loss": 0.3727,
      "step": 20
    },
    {
      "epoch": 0.004136219495381221,
      "grad_norm": 0.8137874007225037,
      "learning_rate": 0.0002,
      "loss": 0.3519,
      "step": 30
    },
    {
      "epoch": 0.005514959327174962,
      "grad_norm": 0.4064357578754425,
      "learning_rate": 0.0002,
      "loss": 0.357,
      "step": 40
    },
    {
      "epoch": 0.0068936991589687024,
      "grad_norm": 1.0673978328704834,
      "learning_rate": 0.0002,
      "loss": 0.3515,
      "step": 50
    },
    {
      "epoch": 0.008272438990762443,
      "grad_norm": 2.2639453411102295,
      "learning_rate": 0.0002,
      "loss": 0.4068,
      "step": 60
    },
    {
      "epoch": 0.009651178822556184,
      "grad_norm": 1.2247616052627563,
      "learning_rate": 0.0002,
      "loss": 0.5108,
      "step": 70
    },
    {
      "epoch": 0.011029918654349925,
      "grad_norm": 0.07231716066598892,
      "learning_rate": 0.0002,
      "loss": 0.3644,
      "step": 80
    },
    {
      "epoch": 0.012408658486143665,
      "grad_norm": 1.4329577684402466,
      "learning_rate": 0.0002,
      "loss": 0.3571,
      "step": 90
    },
    {
      "epoch": 0.013787398317937405,
      "grad_norm": 0.8798255920410156,
      "learning_rate": 0.0002,
      "loss": 0.3521,
      "step": 100
    },
    {
      "epoch": 0.015166138149731145,
      "grad_norm": 0.06499180942773819,
      "learning_rate": 0.0002,
      "loss": 0.3597,
      "step": 110
    },
    {
      "epoch": 0.016544877981524885,
      "grad_norm": 0.284839928150177,
      "learning_rate": 0.0002,
      "loss": 0.4099,
      "step": 120
    },
    {
      "epoch": 0.017923617813318627,
      "grad_norm": 0.12186373770236969,
      "learning_rate": 0.0002,
      "loss": 0.3572,
      "step": 130
    },
    {
      "epoch": 0.01930235764511237,
      "grad_norm": 0.21747085452079773,
      "learning_rate": 0.0002,
      "loss": 0.3497,
      "step": 140
    },
    {
      "epoch": 0.020681097476906107,
      "grad_norm": 0.22812429070472717,
      "learning_rate": 0.0002,
      "loss": 0.3778,
      "step": 150
    },
    {
      "epoch": 0.02205983730869985,
      "grad_norm": 1.1110987663269043,
      "learning_rate": 0.0002,
      "loss": 0.3607,
      "step": 160
    },
    {
      "epoch": 0.023438577140493588,
      "grad_norm": 0.8457236886024475,
      "learning_rate": 0.0002,
      "loss": 0.9183,
      "step": 170
    },
    {
      "epoch": 0.02481731697228733,
      "grad_norm": 0.46427470445632935,
      "learning_rate": 0.0002,
      "loss": 0.3503,
      "step": 180
    },
    {
      "epoch": 0.02619605680408107,
      "grad_norm": 2.4730632305145264,
      "learning_rate": 0.0002,
      "loss": 0.3824,
      "step": 190
    },
    {
      "epoch": 0.02757479663587481,
      "grad_norm": 0.3589227497577667,
      "learning_rate": 0.0002,
      "loss": 1.0105,
      "step": 200
    },
    {
      "epoch": 0.02895353646766855,
      "grad_norm": 3.5308260917663574,
      "learning_rate": 0.0002,
      "loss": 0.3605,
      "step": 210
    },
    {
      "epoch": 0.03033227629946229,
      "grad_norm": 0.5237946510314941,
      "learning_rate": 0.0002,
      "loss": 0.3983,
      "step": 220
    },
    {
      "epoch": 0.031711016131256035,
      "grad_norm": 0.5702632665634155,
      "learning_rate": 0.0002,
      "loss": 0.3521,
      "step": 230
    },
    {
      "epoch": 0.03308975596304977,
      "grad_norm": 1.1318608522415161,
      "learning_rate": 0.0002,
      "loss": 0.3617,
      "step": 240
    },
    {
      "epoch": 0.03446849579484351,
      "grad_norm": 0.536571741104126,
      "learning_rate": 0.0002,
      "loss": 0.3555,
      "step": 250
    },
    {
      "epoch": 0.035847235626637254,
      "grad_norm": 0.12125778943300247,
      "learning_rate": 0.0002,
      "loss": 0.3733,
      "step": 260
    },
    {
      "epoch": 0.037225975458430996,
      "grad_norm": 0.045536063611507416,
      "learning_rate": 0.0002,
      "loss": 0.3514,
      "step": 270
    },
    {
      "epoch": 0.03860471529022474,
      "grad_norm": 0.31765612959861755,
      "learning_rate": 0.0002,
      "loss": 0.3536,
      "step": 280
    },
    {
      "epoch": 0.03998345512201847,
      "grad_norm": 0.27900660037994385,
      "learning_rate": 0.0002,
      "loss": 0.3495,
      "step": 290
    },
    {
      "epoch": 0.041362194953812215,
      "grad_norm": 0.15112566947937012,
      "learning_rate": 0.0002,
      "loss": 0.355,
      "step": 300
    },
    {
      "epoch": 0.04274093478560596,
      "grad_norm": 0.2682303786277771,
      "learning_rate": 0.0002,
      "loss": 0.3609,
      "step": 310
    },
    {
      "epoch": 0.0441196746173997,
      "grad_norm": 1.0106860399246216,
      "learning_rate": 0.0002,
      "loss": 0.3697,
      "step": 320
    },
    {
      "epoch": 0.04549841444919344,
      "grad_norm": 1.0782426595687866,
      "learning_rate": 0.0002,
      "loss": 0.3594,
      "step": 330
    },
    {
      "epoch": 0.046877154280987175,
      "grad_norm": 2.294581651687622,
      "learning_rate": 0.0002,
      "loss": 0.3676,
      "step": 340
    },
    {
      "epoch": 0.04825589411278092,
      "grad_norm": 0.6223801970481873,
      "learning_rate": 0.0002,
      "loss": 0.3741,
      "step": 350
    },
    {
      "epoch": 0.04963463394457466,
      "grad_norm": 0.2735952138900757,
      "learning_rate": 0.0002,
      "loss": 0.3628,
      "step": 360
    },
    {
      "epoch": 0.0510133737763684,
      "grad_norm": 0.7569056153297424,
      "learning_rate": 0.0002,
      "loss": 1.1827,
      "step": 370
    },
    {
      "epoch": 0.05239211360816214,
      "grad_norm": 0.6536706686019897,
      "learning_rate": 0.0002,
      "loss": 0.3543,
      "step": 380
    },
    {
      "epoch": 0.05377085343995588,
      "grad_norm": 0.3573110103607178,
      "learning_rate": 0.0002,
      "loss": 0.3529,
      "step": 390
    },
    {
      "epoch": 0.05514959327174962,
      "grad_norm": 0.8121228218078613,
      "learning_rate": 0.0002,
      "loss": 0.3566,
      "step": 400
    },
    {
      "epoch": 0.05652833310354336,
      "grad_norm": 0.7444269061088562,
      "learning_rate": 0.0002,
      "loss": 0.7803,
      "step": 410
    },
    {
      "epoch": 0.0579070729353371,
      "grad_norm": 0.5015038847923279,
      "learning_rate": 0.0002,
      "loss": 0.6134,
      "step": 420
    },
    {
      "epoch": 0.059285812767130845,
      "grad_norm": 0.08748292177915573,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 430
    },
    {
      "epoch": 0.06066455259892458,
      "grad_norm": 0.3987080156803131,
      "learning_rate": 0.0002,
      "loss": 0.3484,
      "step": 440
    },
    {
      "epoch": 0.06204329243071832,
      "grad_norm": 347.7005920410156,
      "learning_rate": 0.0002,
      "loss": 0.8413,
      "step": 450
    },
    {
      "epoch": 0.06342203226251207,
      "grad_norm": 88.2750473022461,
      "learning_rate": 0.0002,
      "loss": 2.8013,
      "step": 460
    },
    {
      "epoch": 0.0648007720943058,
      "grad_norm": 0.8716701865196228,
      "learning_rate": 0.0002,
      "loss": 0.8356,
      "step": 470
    },
    {
      "epoch": 0.06617951192609954,
      "grad_norm": 0.8243119120597839,
      "learning_rate": 0.0002,
      "loss": 0.3616,
      "step": 480
    },
    {
      "epoch": 0.06755825175789329,
      "grad_norm": 1.1744294166564941,
      "learning_rate": 0.0002,
      "loss": 0.3998,
      "step": 490
    },
    {
      "epoch": 0.06893699158968702,
      "grad_norm": 0.03163053095340729,
      "learning_rate": 0.0002,
      "loss": 0.3549,
      "step": 500
    },
    {
      "epoch": 0.07031573142148077,
      "grad_norm": 3.4403915405273438,
      "learning_rate": 0.0002,
      "loss": 0.3722,
      "step": 510
    },
    {
      "epoch": 0.07169447125327451,
      "grad_norm": 1.0608879327774048,
      "learning_rate": 0.0002,
      "loss": 0.3701,
      "step": 520
    },
    {
      "epoch": 0.07307321108506824,
      "grad_norm": 1.2809940576553345,
      "learning_rate": 0.0002,
      "loss": 0.3602,
      "step": 530
    },
    {
      "epoch": 0.07445195091686199,
      "grad_norm": 0.40460118651390076,
      "learning_rate": 0.0002,
      "loss": 0.3532,
      "step": 540
    },
    {
      "epoch": 0.07583069074865573,
      "grad_norm": 0.6290703415870667,
      "learning_rate": 0.0002,
      "loss": 0.3477,
      "step": 550
    },
    {
      "epoch": 0.07720943058044948,
      "grad_norm": 0.2159261256456375,
      "learning_rate": 0.0002,
      "loss": 0.405,
      "step": 560
    },
    {
      "epoch": 0.07858817041224321,
      "grad_norm": 0.37101301550865173,
      "learning_rate": 0.0002,
      "loss": 0.3752,
      "step": 570
    },
    {
      "epoch": 0.07996691024403695,
      "grad_norm": 1.3007190227508545,
      "learning_rate": 0.0002,
      "loss": 0.3503,
      "step": 580
    },
    {
      "epoch": 0.0813456500758307,
      "grad_norm": 0.4508918225765228,
      "learning_rate": 0.0002,
      "loss": 0.3531,
      "step": 590
    },
    {
      "epoch": 0.08272438990762443,
      "grad_norm": 0.46898791193962097,
      "learning_rate": 0.0002,
      "loss": 0.347,
      "step": 600
    },
    {
      "epoch": 0.08410312973941818,
      "grad_norm": 0.8449831604957581,
      "learning_rate": 0.0002,
      "loss": 0.3546,
      "step": 610
    },
    {
      "epoch": 0.08548186957121191,
      "grad_norm": 0.7988163232803345,
      "learning_rate": 0.0002,
      "loss": 0.3505,
      "step": 620
    },
    {
      "epoch": 0.08686060940300565,
      "grad_norm": 0.4426226317882538,
      "learning_rate": 0.0002,
      "loss": 0.3649,
      "step": 630
    },
    {
      "epoch": 0.0882393492347994,
      "grad_norm": 0.2260913848876953,
      "learning_rate": 0.0002,
      "loss": 0.3499,
      "step": 640
    },
    {
      "epoch": 0.08961808906659313,
      "grad_norm": 1.476747751235962,
      "learning_rate": 0.0002,
      "loss": 0.3546,
      "step": 650
    },
    {
      "epoch": 0.09099682889838688,
      "grad_norm": 0.7640777230262756,
      "learning_rate": 0.0002,
      "loss": 0.3568,
      "step": 660
    },
    {
      "epoch": 0.09237556873018062,
      "grad_norm": 0.8559088706970215,
      "learning_rate": 0.0002,
      "loss": 0.3507,
      "step": 670
    },
    {
      "epoch": 0.09375430856197435,
      "grad_norm": 0.20833595097064972,
      "learning_rate": 0.0002,
      "loss": 0.3556,
      "step": 680
    },
    {
      "epoch": 0.0951330483937681,
      "grad_norm": 1.1485021114349365,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 690
    },
    {
      "epoch": 0.09651178822556183,
      "grad_norm": 1.0206815004348755,
      "learning_rate": 0.0002,
      "loss": 0.3591,
      "step": 700
    },
    {
      "epoch": 0.09789052805735558,
      "grad_norm": 0.9966775178909302,
      "learning_rate": 0.0002,
      "loss": 0.359,
      "step": 710
    },
    {
      "epoch": 0.09926926788914932,
      "grad_norm": 0.8833585977554321,
      "learning_rate": 0.0002,
      "loss": 0.3546,
      "step": 720
    },
    {
      "epoch": 0.10064800772094305,
      "grad_norm": 1.0842584371566772,
      "learning_rate": 0.0002,
      "loss": 0.3556,
      "step": 730
    },
    {
      "epoch": 0.1020267475527368,
      "grad_norm": 0.3791511058807373,
      "learning_rate": 0.0002,
      "loss": 0.3571,
      "step": 740
    },
    {
      "epoch": 0.10340548738453054,
      "grad_norm": 0.24666732549667358,
      "learning_rate": 0.0002,
      "loss": 0.3502,
      "step": 750
    },
    {
      "epoch": 0.10478422721632429,
      "grad_norm": 0.21794968843460083,
      "learning_rate": 0.0002,
      "loss": 0.3483,
      "step": 760
    },
    {
      "epoch": 0.10616296704811802,
      "grad_norm": 0.47017499804496765,
      "learning_rate": 0.0002,
      "loss": 0.3499,
      "step": 770
    },
    {
      "epoch": 0.10754170687991176,
      "grad_norm": 0.2813131809234619,
      "learning_rate": 0.0002,
      "loss": 0.3482,
      "step": 780
    },
    {
      "epoch": 0.1089204467117055,
      "grad_norm": 1.2175363302230835,
      "learning_rate": 0.0002,
      "loss": 0.3524,
      "step": 790
    },
    {
      "epoch": 0.11029918654349924,
      "grad_norm": 0.2712210416793823,
      "learning_rate": 0.0002,
      "loss": 0.3526,
      "step": 800
    },
    {
      "epoch": 0.11167792637529299,
      "grad_norm": 0.1428445726633072,
      "learning_rate": 0.0002,
      "loss": 0.3518,
      "step": 810
    },
    {
      "epoch": 0.11305666620708672,
      "grad_norm": 0.23716595768928528,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 820
    },
    {
      "epoch": 0.11443540603888046,
      "grad_norm": 0.07993923872709274,
      "learning_rate": 0.0002,
      "loss": 0.349,
      "step": 830
    },
    {
      "epoch": 0.1158141458706742,
      "grad_norm": 1.2958595752716064,
      "learning_rate": 0.0002,
      "loss": 0.352,
      "step": 840
    },
    {
      "epoch": 0.11719288570246794,
      "grad_norm": 1.6257132291793823,
      "learning_rate": 0.0002,
      "loss": 0.3589,
      "step": 850
    },
    {
      "epoch": 0.11857162553426169,
      "grad_norm": 0.20367591083049774,
      "learning_rate": 0.0002,
      "loss": 0.3602,
      "step": 860
    },
    {
      "epoch": 0.11995036536605543,
      "grad_norm": 1.147210955619812,
      "learning_rate": 0.0002,
      "loss": 0.3619,
      "step": 870
    },
    {
      "epoch": 0.12132910519784916,
      "grad_norm": 0.19706425070762634,
      "learning_rate": 0.0002,
      "loss": 0.3536,
      "step": 880
    },
    {
      "epoch": 0.12270784502964291,
      "grad_norm": 0.17990930378437042,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 890
    },
    {
      "epoch": 0.12408658486143664,
      "grad_norm": 0.5770463943481445,
      "learning_rate": 0.0002,
      "loss": 0.3509,
      "step": 900
    },
    {
      "epoch": 0.12546532469323038,
      "grad_norm": 0.24645955860614777,
      "learning_rate": 0.0002,
      "loss": 0.3526,
      "step": 910
    },
    {
      "epoch": 0.12684406452502414,
      "grad_norm": 0.15745119750499725,
      "learning_rate": 0.0002,
      "loss": 0.3503,
      "step": 920
    },
    {
      "epoch": 0.12822280435681788,
      "grad_norm": 0.054484590888023376,
      "learning_rate": 0.0002,
      "loss": 0.3508,
      "step": 930
    },
    {
      "epoch": 0.1296015441886116,
      "grad_norm": 0.30564025044441223,
      "learning_rate": 0.0002,
      "loss": 0.3489,
      "step": 940
    },
    {
      "epoch": 0.13098028402040535,
      "grad_norm": 0.3614678382873535,
      "learning_rate": 0.0002,
      "loss": 0.3477,
      "step": 950
    },
    {
      "epoch": 0.13235902385219908,
      "grad_norm": 0.703029990196228,
      "learning_rate": 0.0002,
      "loss": 0.3552,
      "step": 960
    },
    {
      "epoch": 0.13373776368399284,
      "grad_norm": 1.1954560279846191,
      "learning_rate": 0.0002,
      "loss": 0.3528,
      "step": 970
    },
    {
      "epoch": 0.13511650351578658,
      "grad_norm": 0.8106504678726196,
      "learning_rate": 0.0002,
      "loss": 0.3586,
      "step": 980
    },
    {
      "epoch": 0.13649524334758031,
      "grad_norm": 0.40758854150772095,
      "learning_rate": 0.0002,
      "loss": 0.3908,
      "step": 990
    },
    {
      "epoch": 0.13787398317937405,
      "grad_norm": 0.613096296787262,
      "learning_rate": 0.0002,
      "loss": 0.3519,
      "step": 1000
    },
    {
      "epoch": 0.13925272301116778,
      "grad_norm": 0.38185614347457886,
      "learning_rate": 0.0002,
      "loss": 0.3506,
      "step": 1010
    },
    {
      "epoch": 0.14063146284296155,
      "grad_norm": 0.07220327854156494,
      "learning_rate": 0.0002,
      "loss": 0.3472,
      "step": 1020
    },
    {
      "epoch": 0.14201020267475528,
      "grad_norm": 0.1451689898967743,
      "learning_rate": 0.0002,
      "loss": 0.3534,
      "step": 1030
    },
    {
      "epoch": 0.14338894250654902,
      "grad_norm": 0.08052591234445572,
      "learning_rate": 0.0002,
      "loss": 0.3476,
      "step": 1040
    },
    {
      "epoch": 0.14476768233834275,
      "grad_norm": 1.0108163356781006,
      "learning_rate": 0.0002,
      "loss": 0.3508,
      "step": 1050
    },
    {
      "epoch": 0.1461464221701365,
      "grad_norm": 0.5895722508430481,
      "learning_rate": 0.0002,
      "loss": 0.3541,
      "step": 1060
    },
    {
      "epoch": 0.14752516200193025,
      "grad_norm": 0.6988415718078613,
      "learning_rate": 0.0002,
      "loss": 0.3512,
      "step": 1070
    },
    {
      "epoch": 0.14890390183372398,
      "grad_norm": 0.54078608751297,
      "learning_rate": 0.0002,
      "loss": 0.3479,
      "step": 1080
    },
    {
      "epoch": 0.15028264166551772,
      "grad_norm": 0.19162333011627197,
      "learning_rate": 0.0002,
      "loss": 0.3518,
      "step": 1090
    },
    {
      "epoch": 0.15166138149731145,
      "grad_norm": 0.36928215622901917,
      "learning_rate": 0.0002,
      "loss": 0.3505,
      "step": 1100
    },
    {
      "epoch": 0.1530401213291052,
      "grad_norm": 0.572607696056366,
      "learning_rate": 0.0002,
      "loss": 0.355,
      "step": 1110
    },
    {
      "epoch": 0.15441886116089895,
      "grad_norm": 0.20841191709041595,
      "learning_rate": 0.0002,
      "loss": 0.348,
      "step": 1120
    },
    {
      "epoch": 0.1557976009926927,
      "grad_norm": 0.04682110995054245,
      "learning_rate": 0.0002,
      "loss": 0.3502,
      "step": 1130
    },
    {
      "epoch": 0.15717634082448642,
      "grad_norm": 0.867899477481842,
      "learning_rate": 0.0002,
      "loss": 0.3476,
      "step": 1140
    },
    {
      "epoch": 0.15855508065628016,
      "grad_norm": 0.2828502655029297,
      "learning_rate": 0.0002,
      "loss": 0.3525,
      "step": 1150
    },
    {
      "epoch": 0.1599338204880739,
      "grad_norm": 0.44510889053344727,
      "learning_rate": 0.0002,
      "loss": 0.35,
      "step": 1160
    },
    {
      "epoch": 0.16131256031986765,
      "grad_norm": 0.1896822154521942,
      "learning_rate": 0.0002,
      "loss": 0.3493,
      "step": 1170
    },
    {
      "epoch": 0.1626913001516614,
      "grad_norm": 0.15781590342521667,
      "learning_rate": 0.0002,
      "loss": 0.3477,
      "step": 1180
    },
    {
      "epoch": 0.16407003998345512,
      "grad_norm": 0.2315225899219513,
      "learning_rate": 0.0002,
      "loss": 0.3498,
      "step": 1190
    },
    {
      "epoch": 0.16544877981524886,
      "grad_norm": 0.2198018729686737,
      "learning_rate": 0.0002,
      "loss": 0.3484,
      "step": 1200
    },
    {
      "epoch": 0.1668275196470426,
      "grad_norm": 0.2039571851491928,
      "learning_rate": 0.0002,
      "loss": 0.348,
      "step": 1210
    },
    {
      "epoch": 0.16820625947883636,
      "grad_norm": 0.009352603927254677,
      "learning_rate": 0.0002,
      "loss": 0.3481,
      "step": 1220
    },
    {
      "epoch": 0.1695849993106301,
      "grad_norm": 0.2558707892894745,
      "learning_rate": 0.0002,
      "loss": 0.3475,
      "step": 1230
    },
    {
      "epoch": 0.17096373914242383,
      "grad_norm": 0.07278712838888168,
      "learning_rate": 0.0002,
      "loss": 0.3471,
      "step": 1240
    },
    {
      "epoch": 0.17234247897421756,
      "grad_norm": 0.4133436381816864,
      "learning_rate": 0.0002,
      "loss": 0.3453,
      "step": 1250
    },
    {
      "epoch": 0.1737212188060113,
      "grad_norm": 0.16729828715324402,
      "learning_rate": 0.0002,
      "loss": 0.3527,
      "step": 1260
    },
    {
      "epoch": 0.17509995863780506,
      "grad_norm": 0.33326980471611023,
      "learning_rate": 0.0002,
      "loss": 0.3463,
      "step": 1270
    },
    {
      "epoch": 0.1764786984695988,
      "grad_norm": 0.7140666246414185,
      "learning_rate": 0.0002,
      "loss": 0.3627,
      "step": 1280
    },
    {
      "epoch": 0.17785743830139253,
      "grad_norm": 0.17751634120941162,
      "learning_rate": 0.0002,
      "loss": 0.3506,
      "step": 1290
    },
    {
      "epoch": 0.17923617813318626,
      "grad_norm": 0.44009125232696533,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 1300
    },
    {
      "epoch": 0.18061491796498,
      "grad_norm": 0.07371579110622406,
      "learning_rate": 0.0002,
      "loss": 0.3479,
      "step": 1310
    },
    {
      "epoch": 0.18199365779677376,
      "grad_norm": 0.6804266571998596,
      "learning_rate": 0.0002,
      "loss": 0.3476,
      "step": 1320
    },
    {
      "epoch": 0.1833723976285675,
      "grad_norm": 0.19634029269218445,
      "learning_rate": 0.0002,
      "loss": 0.354,
      "step": 1330
    },
    {
      "epoch": 0.18475113746036123,
      "grad_norm": 0.34020882844924927,
      "learning_rate": 0.0002,
      "loss": 0.3481,
      "step": 1340
    },
    {
      "epoch": 0.18612987729215497,
      "grad_norm": 0.38502731919288635,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 1350
    },
    {
      "epoch": 0.1875086171239487,
      "grad_norm": 0.0810522586107254,
      "learning_rate": 0.0002,
      "loss": 0.3473,
      "step": 1360
    },
    {
      "epoch": 0.18888735695574246,
      "grad_norm": 0.4057389497756958,
      "learning_rate": 0.0002,
      "loss": 0.3489,
      "step": 1370
    },
    {
      "epoch": 0.1902660967875362,
      "grad_norm": 0.17514599859714508,
      "learning_rate": 0.0002,
      "loss": 0.3472,
      "step": 1380
    },
    {
      "epoch": 0.19164483661932993,
      "grad_norm": 0.10964088141918182,
      "learning_rate": 0.0002,
      "loss": 0.3479,
      "step": 1390
    },
    {
      "epoch": 0.19302357645112367,
      "grad_norm": 0.20920871198177338,
      "learning_rate": 0.0002,
      "loss": 0.3488,
      "step": 1400
    },
    {
      "epoch": 0.1944023162829174,
      "grad_norm": 1.149121880531311,
      "learning_rate": 0.0002,
      "loss": 0.3548,
      "step": 1410
    },
    {
      "epoch": 0.19578105611471117,
      "grad_norm": 1.3394649028778076,
      "learning_rate": 0.0002,
      "loss": 0.3495,
      "step": 1420
    },
    {
      "epoch": 0.1971597959465049,
      "grad_norm": 1.2763960361480713,
      "learning_rate": 0.0002,
      "loss": 0.3679,
      "step": 1430
    },
    {
      "epoch": 0.19853853577829864,
      "grad_norm": 0.5421571731567383,
      "learning_rate": 0.0002,
      "loss": 0.3538,
      "step": 1440
    },
    {
      "epoch": 0.19991727561009237,
      "grad_norm": 0.22273503243923187,
      "learning_rate": 0.0002,
      "loss": 0.3518,
      "step": 1450
    },
    {
      "epoch": 0.2012960154418861,
      "grad_norm": 0.6335702538490295,
      "learning_rate": 0.0002,
      "loss": 0.3481,
      "step": 1460
    },
    {
      "epoch": 0.20267475527367987,
      "grad_norm": 0.7090324759483337,
      "learning_rate": 0.0002,
      "loss": 0.3486,
      "step": 1470
    },
    {
      "epoch": 0.2040534951054736,
      "grad_norm": 0.011333847418427467,
      "learning_rate": 0.0002,
      "loss": 0.3476,
      "step": 1480
    },
    {
      "epoch": 0.20543223493726734,
      "grad_norm": 0.24088676273822784,
      "learning_rate": 0.0002,
      "loss": 0.3569,
      "step": 1490
    },
    {
      "epoch": 0.20681097476906107,
      "grad_norm": 0.8654371500015259,
      "learning_rate": 0.0002,
      "loss": 0.3528,
      "step": 1500
    },
    {
      "epoch": 0.2081897146008548,
      "grad_norm": 0.06135034188628197,
      "learning_rate": 0.0002,
      "loss": 0.3509,
      "step": 1510
    },
    {
      "epoch": 0.20956845443264857,
      "grad_norm": 0.38141730427742004,
      "learning_rate": 0.0002,
      "loss": 0.3474,
      "step": 1520
    },
    {
      "epoch": 0.2109471942644423,
      "grad_norm": 1.1622456312179565,
      "learning_rate": 0.0002,
      "loss": 0.353,
      "step": 1530
    },
    {
      "epoch": 0.21232593409623604,
      "grad_norm": 0.5747712254524231,
      "learning_rate": 0.0002,
      "loss": 0.3513,
      "step": 1540
    },
    {
      "epoch": 0.21370467392802978,
      "grad_norm": 0.09723293781280518,
      "learning_rate": 0.0002,
      "loss": 0.3482,
      "step": 1550
    },
    {
      "epoch": 0.2150834137598235,
      "grad_norm": 0.18574804067611694,
      "learning_rate": 0.0002,
      "loss": 0.3512,
      "step": 1560
    },
    {
      "epoch": 0.21646215359161727,
      "grad_norm": 0.33651217818260193,
      "learning_rate": 0.0002,
      "loss": 0.349,
      "step": 1570
    },
    {
      "epoch": 0.217840893423411,
      "grad_norm": 0.07309216260910034,
      "learning_rate": 0.0002,
      "loss": 0.3493,
      "step": 1580
    },
    {
      "epoch": 0.21921963325520474,
      "grad_norm": 0.19346486032009125,
      "learning_rate": 0.0002,
      "loss": 0.3478,
      "step": 1590
    },
    {
      "epoch": 0.22059837308699848,
      "grad_norm": 0.3398933708667755,
      "learning_rate": 0.0002,
      "loss": 0.3496,
      "step": 1600
    },
    {
      "epoch": 0.2219771129187922,
      "grad_norm": 0.34032130241394043,
      "learning_rate": 0.0002,
      "loss": 0.3488,
      "step": 1610
    },
    {
      "epoch": 0.22335585275058598,
      "grad_norm": 0.901030421257019,
      "learning_rate": 0.0002,
      "loss": 0.3531,
      "step": 1620
    },
    {
      "epoch": 0.2247345925823797,
      "grad_norm": 0.500088632106781,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 1630
    },
    {
      "epoch": 0.22611333241417345,
      "grad_norm": 0.3230324387550354,
      "learning_rate": 0.0002,
      "loss": 0.3546,
      "step": 1640
    },
    {
      "epoch": 0.22749207224596718,
      "grad_norm": 1.2476601600646973,
      "learning_rate": 0.0002,
      "loss": 0.3512,
      "step": 1650
    },
    {
      "epoch": 0.22887081207776092,
      "grad_norm": 0.23318485915660858,
      "learning_rate": 0.0002,
      "loss": 0.3456,
      "step": 1660
    },
    {
      "epoch": 0.23024955190955468,
      "grad_norm": 0.472400963306427,
      "learning_rate": 0.0002,
      "loss": 0.3551,
      "step": 1670
    },
    {
      "epoch": 0.2316282917413484,
      "grad_norm": 0.04836912825703621,
      "learning_rate": 0.0002,
      "loss": 0.3505,
      "step": 1680
    },
    {
      "epoch": 0.23300703157314215,
      "grad_norm": 0.34590113162994385,
      "learning_rate": 0.0002,
      "loss": 0.3507,
      "step": 1690
    },
    {
      "epoch": 0.23438577140493588,
      "grad_norm": 0.23341989517211914,
      "learning_rate": 0.0002,
      "loss": 0.3478,
      "step": 1700
    },
    {
      "epoch": 0.23576451123672962,
      "grad_norm": 0.001271920627914369,
      "learning_rate": 0.0002,
      "loss": 0.3562,
      "step": 1710
    },
    {
      "epoch": 0.23714325106852338,
      "grad_norm": 0.20549911260604858,
      "learning_rate": 0.0002,
      "loss": 0.3479,
      "step": 1720
    },
    {
      "epoch": 0.23852199090031712,
      "grad_norm": 0.3825775384902954,
      "learning_rate": 0.0002,
      "loss": 0.3482,
      "step": 1730
    },
    {
      "epoch": 0.23990073073211085,
      "grad_norm": 0.028804048895835876,
      "learning_rate": 0.0002,
      "loss": 0.3481,
      "step": 1740
    },
    {
      "epoch": 0.24127947056390459,
      "grad_norm": 0.04462611302733421,
      "learning_rate": 0.0002,
      "loss": 0.3477,
      "step": 1750
    },
    {
      "epoch": 0.24265821039569832,
      "grad_norm": 0.6634818315505981,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 1760
    },
    {
      "epoch": 0.24403695022749208,
      "grad_norm": 1.3807406425476074,
      "learning_rate": 0.0002,
      "loss": 0.3547,
      "step": 1770
    },
    {
      "epoch": 0.24541569005928582,
      "grad_norm": 0.24347831308841705,
      "learning_rate": 0.0002,
      "loss": 0.3473,
      "step": 1780
    },
    {
      "epoch": 0.24679442989107955,
      "grad_norm": 0.61258465051651,
      "learning_rate": 0.0002,
      "loss": 0.3492,
      "step": 1790
    },
    {
      "epoch": 0.2481731697228733,
      "grad_norm": 0.011543272994458675,
      "learning_rate": 0.0002,
      "loss": 0.347,
      "step": 1800
    },
    {
      "epoch": 0.24955190955466702,
      "grad_norm": 0.09996844828128815,
      "learning_rate": 0.0002,
      "loss": 0.3474,
      "step": 1810
    },
    {
      "epoch": 0.25093064938646076,
      "grad_norm": 0.17044603824615479,
      "learning_rate": 0.0002,
      "loss": 0.3471,
      "step": 1820
    },
    {
      "epoch": 0.2523093892182545,
      "grad_norm": 0.17940489947795868,
      "learning_rate": 0.0002,
      "loss": 0.3478,
      "step": 1830
    },
    {
      "epoch": 0.2536881290500483,
      "grad_norm": 0.21834205090999603,
      "learning_rate": 0.0002,
      "loss": 0.3461,
      "step": 1840
    },
    {
      "epoch": 0.255066868881842,
      "grad_norm": 0.2272634655237198,
      "learning_rate": 0.0002,
      "loss": 0.347,
      "step": 1850
    },
    {
      "epoch": 0.25644560871363575,
      "grad_norm": 0.18734070658683777,
      "learning_rate": 0.0002,
      "loss": 0.3497,
      "step": 1860
    },
    {
      "epoch": 0.25782434854542946,
      "grad_norm": 0.04078834876418114,
      "learning_rate": 0.0002,
      "loss": 0.3493,
      "step": 1870
    },
    {
      "epoch": 0.2592030883772232,
      "grad_norm": 0.3463903069496155,
      "learning_rate": 0.0002,
      "loss": 0.3494,
      "step": 1880
    },
    {
      "epoch": 0.260581828209017,
      "grad_norm": 0.3256634771823883,
      "learning_rate": 0.0002,
      "loss": 0.3488,
      "step": 1890
    },
    {
      "epoch": 0.2619605680408107,
      "grad_norm": 0.04404434189200401,
      "learning_rate": 0.0002,
      "loss": 0.3509,
      "step": 1900
    },
    {
      "epoch": 0.26333930787260446,
      "grad_norm": 0.20446011424064636,
      "learning_rate": 0.0002,
      "loss": 0.3526,
      "step": 1910
    },
    {
      "epoch": 0.26471804770439816,
      "grad_norm": 0.06089179962873459,
      "learning_rate": 0.0002,
      "loss": 0.3491,
      "step": 1920
    },
    {
      "epoch": 0.2660967875361919,
      "grad_norm": 0.32661405205726624,
      "learning_rate": 0.0002,
      "loss": 0.35,
      "step": 1930
    },
    {
      "epoch": 0.2674755273679857,
      "grad_norm": 0.09823151677846909,
      "learning_rate": 0.0002,
      "loss": 0.3482,
      "step": 1940
    },
    {
      "epoch": 0.2688542671997794,
      "grad_norm": 0.11397412419319153,
      "learning_rate": 0.0002,
      "loss": 0.347,
      "step": 1950
    },
    {
      "epoch": 0.27023300703157316,
      "grad_norm": 0.2632172703742981,
      "learning_rate": 0.0002,
      "loss": 0.352,
      "step": 1960
    },
    {
      "epoch": 0.27161174686336687,
      "grad_norm": 0.27215296030044556,
      "learning_rate": 0.0002,
      "loss": 0.3482,
      "step": 1970
    },
    {
      "epoch": 0.27299048669516063,
      "grad_norm": 0.20016005635261536,
      "learning_rate": 0.0002,
      "loss": 0.3489,
      "step": 1980
    },
    {
      "epoch": 0.2743692265269544,
      "grad_norm": 0.3071637749671936,
      "learning_rate": 0.0002,
      "loss": 0.354,
      "step": 1990
    },
    {
      "epoch": 0.2757479663587481,
      "grad_norm": 1.0373337268829346,
      "learning_rate": 0.0002,
      "loss": 0.3481,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 14,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.09149951218339e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}