{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.907120743034056, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1238390092879257, "grad_norm": 7.5771355628967285, "learning_rate": 0.0001999229036240723, "loss": 1.3532, "step": 10 }, { "epoch": 0.2476780185758514, "grad_norm": 0.3571035861968994, "learning_rate": 0.0001996917333733128, "loss": 1.0315, "step": 20 }, { "epoch": 0.3715170278637771, "grad_norm": 0.3278755843639374, "learning_rate": 0.00019930684569549264, "loss": 0.9018, "step": 30 }, { "epoch": 0.4953560371517028, "grad_norm": 0.4095502495765686, "learning_rate": 0.00019876883405951377, "loss": 0.807, "step": 40 }, { "epoch": 0.6191950464396285, "grad_norm": 0.4147421717643738, "learning_rate": 0.00019807852804032305, "loss": 0.7204, "step": 50 }, { "epoch": 0.7430340557275542, "grad_norm": 0.2525322735309601, "learning_rate": 0.00019723699203976766, "loss": 0.6548, "step": 60 }, { "epoch": 0.8668730650154799, "grad_norm": 0.28104496002197266, "learning_rate": 0.00019624552364536473, "loss": 0.6687, "step": 70 }, { "epoch": 0.9907120743034056, "grad_norm": 0.4467855393886566, "learning_rate": 0.00019510565162951537, "loss": 0.6317, "step": 80 }, { "epoch": 1.1145510835913313, "grad_norm": 0.2571893036365509, "learning_rate": 0.00019381913359224842, "loss": 0.6319, "step": 90 }, { "epoch": 1.238390092879257, "grad_norm": 0.24546292424201965, "learning_rate": 0.0001923879532511287, "loss": 0.6262, "step": 100 }, { "epoch": 1.3622291021671826, "grad_norm": 0.24089373648166656, "learning_rate": 0.00019081431738250814, "loss": 0.6309, "step": 110 }, { "epoch": 1.4860681114551084, "grad_norm": 0.24842403829097748, "learning_rate": 0.0001891006524188368, "loss": 0.6142, "step": 120 }, { "epoch": 1.609907120743034, "grad_norm": 0.2339727133512497, "learning_rate": 0.00018724960070727972, "loss": 0.6131, "step": 130 }, { "epoch": 1.7337461300309598, "grad_norm": 0.21254688501358032, "learning_rate": 0.00018526401643540922, "loss": 0.5892, "step": 140 }, { "epoch": 1.8575851393188856, "grad_norm": 0.34352943301200867, "learning_rate": 0.00018314696123025454, "loss": 0.6037, "step": 150 }, { "epoch": 1.9814241486068112, "grad_norm": 0.21427258849143982, "learning_rate": 0.00018090169943749476, "loss": 0.6051, "step": 160 }, { "epoch": 2.1052631578947367, "grad_norm": 0.23226872086524963, "learning_rate": 0.00017853169308807448, "loss": 0.6109, "step": 170 }, { "epoch": 2.2291021671826625, "grad_norm": 0.254842072725296, "learning_rate": 0.0001760405965600031, "loss": 0.5912, "step": 180 }, { "epoch": 2.3529411764705883, "grad_norm": 0.2571081519126892, "learning_rate": 0.00017343225094356855, "loss": 0.5975, "step": 190 }, { "epoch": 2.476780185758514, "grad_norm": 0.25343191623687744, "learning_rate": 0.00017071067811865476, "loss": 0.5786, "step": 200 }, { "epoch": 2.6006191950464395, "grad_norm": 0.21258015930652618, "learning_rate": 0.0001678800745532942, "loss": 0.586, "step": 210 }, { "epoch": 2.7244582043343653, "grad_norm": 0.25848379731178284, "learning_rate": 0.00016494480483301836, "loss": 0.5714, "step": 220 }, { "epoch": 2.848297213622291, "grad_norm": 0.26716166734695435, "learning_rate": 0.00016190939493098344, "loss": 0.5887, "step": 230 }, { "epoch": 2.972136222910217, "grad_norm": 0.23578402400016785, "learning_rate": 0.00015877852522924732, "loss": 0.5902, "step": 240 }, { "epoch": 3.0959752321981426, "grad_norm": 0.23565009236335754, "learning_rate": 0.00015555702330196023, "loss": 0.5792, "step": 250 }, { "epoch": 3.219814241486068, "grad_norm": 0.2390134632587433, "learning_rate": 0.0001522498564715949, "loss": 0.5676, "step": 260 }, { "epoch": 3.343653250773994, "grad_norm": 0.25006794929504395, "learning_rate": 0.00014886212414969553, "loss": 0.5788, "step": 270 }, { "epoch": 3.4674922600619196, "grad_norm": 0.2533760666847229, "learning_rate": 0.00014539904997395468, "loss": 0.5769, "step": 280 }, { "epoch": 3.5913312693498454, "grad_norm": 0.2808171510696411, "learning_rate": 0.0001418659737537428, "loss": 0.5521, "step": 290 }, { "epoch": 3.715170278637771, "grad_norm": 0.28783777356147766, "learning_rate": 0.000138268343236509, "loss": 0.5723, "step": 300 }, { "epoch": 3.8390092879256965, "grad_norm": 0.29237958788871765, "learning_rate": 0.0001346117057077493, "loss": 0.5668, "step": 310 }, { "epoch": 3.9628482972136223, "grad_norm": 0.2757062315940857, "learning_rate": 0.00013090169943749476, "loss": 0.579, "step": 320 }, { "epoch": 4.086687306501548, "grad_norm": 0.28595200181007385, "learning_rate": 0.00012714404498650743, "loss": 0.5488, "step": 330 }, { "epoch": 4.2105263157894735, "grad_norm": 0.2707183063030243, "learning_rate": 0.00012334453638559057, "loss": 0.5511, "step": 340 }, { "epoch": 4.3343653250774, "grad_norm": 0.3559975028038025, "learning_rate": 0.00011950903220161285, "loss": 0.5683, "step": 350 }, { "epoch": 4.458204334365325, "grad_norm": 0.2762058973312378, "learning_rate": 0.0001156434465040231, "loss": 0.5499, "step": 360 }, { "epoch": 4.58204334365325, "grad_norm": 0.2717606723308563, "learning_rate": 0.00011175373974578378, "loss": 0.5612, "step": 370 }, { "epoch": 4.705882352941177, "grad_norm": 0.27757707238197327, "learning_rate": 0.0001078459095727845, "loss": 0.5602, "step": 380 }, { "epoch": 4.829721362229102, "grad_norm": 0.3977556526660919, "learning_rate": 0.00010392598157590688, "loss": 0.5459, "step": 390 }, { "epoch": 4.953560371517028, "grad_norm": 0.26867300271987915, "learning_rate": 0.0001, "loss": 0.537, "step": 400 }, { "epoch": 5.077399380804954, "grad_norm": 0.26843276619911194, "learning_rate": 9.607401842409317e-05, "loss": 0.5601, "step": 410 }, { "epoch": 5.201238390092879, "grad_norm": 0.30268290638923645, "learning_rate": 9.215409042721552e-05, "loss": 0.5317, "step": 420 }, { "epoch": 5.325077399380805, "grad_norm": 0.3163929581642151, "learning_rate": 8.824626025421626e-05, "loss": 0.5343, "step": 430 }, { "epoch": 5.4489164086687305, "grad_norm": 0.2883571982383728, "learning_rate": 8.435655349597689e-05, "loss": 0.5255, "step": 440 }, { "epoch": 5.572755417956657, "grad_norm": 0.3254496157169342, "learning_rate": 8.049096779838719e-05, "loss": 0.5281, "step": 450 }, { "epoch": 5.696594427244582, "grad_norm": 0.2983749508857727, "learning_rate": 7.66554636144095e-05, "loss": 0.5515, "step": 460 }, { "epoch": 5.820433436532507, "grad_norm": 0.2880017161369324, "learning_rate": 7.285595501349258e-05, "loss": 0.5575, "step": 470 }, { "epoch": 5.944272445820434, "grad_norm": 0.43873119354248047, "learning_rate": 6.909830056250527e-05, "loss": 0.5367, "step": 480 }, { "epoch": 6.068111455108359, "grad_norm": 0.33720219135284424, "learning_rate": 6.538829429225069e-05, "loss": 0.5509, "step": 490 }, { "epoch": 6.191950464396285, "grad_norm": 0.3185509741306305, "learning_rate": 6.173165676349103e-05, "loss": 0.5176, "step": 500 }, { "epoch": 6.315789473684211, "grad_norm": 0.3240034878253937, "learning_rate": 5.8134026246257225e-05, "loss": 0.5306, "step": 510 }, { "epoch": 6.439628482972136, "grad_norm": 0.33068713545799255, "learning_rate": 5.4600950026045326e-05, "loss": 0.517, "step": 520 }, { "epoch": 6.563467492260062, "grad_norm": 0.33544909954071045, "learning_rate": 5.113787585030454e-05, "loss": 0.5288, "step": 530 }, { "epoch": 6.687306501547988, "grad_norm": 0.3468843400478363, "learning_rate": 4.7750143528405126e-05, "loss": 0.5222, "step": 540 }, { "epoch": 6.811145510835914, "grad_norm": 0.33482104539871216, "learning_rate": 4.444297669803981e-05, "loss": 0.5227, "step": 550 }, { "epoch": 6.934984520123839, "grad_norm": 0.3804668188095093, "learning_rate": 4.12214747707527e-05, "loss": 0.5251, "step": 560 }, { "epoch": 7.0588235294117645, "grad_norm": 0.3186335861682892, "learning_rate": 3.8090605069016595e-05, "loss": 0.5264, "step": 570 }, { "epoch": 7.182662538699691, "grad_norm": 0.34853196144104004, "learning_rate": 3.5055195166981645e-05, "loss": 0.5281, "step": 580 }, { "epoch": 7.306501547987616, "grad_norm": 0.36451995372772217, "learning_rate": 3.211992544670582e-05, "loss": 0.5074, "step": 590 }, { "epoch": 7.430340557275541, "grad_norm": 0.3326849639415741, "learning_rate": 2.9289321881345254e-05, "loss": 0.5249, "step": 600 }, { "epoch": 7.554179566563468, "grad_norm": 0.3516250550746918, "learning_rate": 2.6567749056431467e-05, "loss": 0.5112, "step": 610 }, { "epoch": 7.678018575851393, "grad_norm": 0.3434501588344574, "learning_rate": 2.3959403439996907e-05, "loss": 0.5176, "step": 620 }, { "epoch": 7.801857585139319, "grad_norm": 0.3573139011859894, "learning_rate": 2.146830691192553e-05, "loss": 0.5251, "step": 630 }, { "epoch": 7.925696594427245, "grad_norm": 0.3552079200744629, "learning_rate": 1.9098300562505266e-05, "loss": 0.5066, "step": 640 }, { "epoch": 8.04953560371517, "grad_norm": 0.3211444020271301, "learning_rate": 1.6853038769745467e-05, "loss": 0.5255, "step": 650 }, { "epoch": 8.173374613003096, "grad_norm": 0.3437272310256958, "learning_rate": 1.4735983564590783e-05, "loss": 0.5152, "step": 660 }, { "epoch": 8.297213622291022, "grad_norm": 0.39420753717422485, "learning_rate": 1.2750399292720283e-05, "loss": 0.5147, "step": 670 }, { "epoch": 8.421052631578947, "grad_norm": 0.33711323142051697, "learning_rate": 1.0899347581163221e-05, "loss": 0.5053, "step": 680 }, { "epoch": 8.544891640866872, "grad_norm": 0.41042107343673706, "learning_rate": 9.185682617491863e-06, "loss": 0.4921, "step": 690 }, { "epoch": 8.6687306501548, "grad_norm": 0.3305673897266388, "learning_rate": 7.612046748871327e-06, "loss": 0.5182, "step": 700 }, { "epoch": 8.792569659442725, "grad_norm": 0.42124322056770325, "learning_rate": 6.180866407751595e-06, "loss": 0.5229, "step": 710 }, { "epoch": 8.91640866873065, "grad_norm": 0.3476842939853668, "learning_rate": 4.8943483704846475e-06, "loss": 0.517, "step": 720 }, { "epoch": 9.040247678018575, "grad_norm": 0.3766673803329468, "learning_rate": 3.7544763546352834e-06, "loss": 0.4968, "step": 730 }, { "epoch": 9.1640866873065, "grad_norm": 0.34576883912086487, "learning_rate": 2.7630079602323442e-06, "loss": 0.5143, "step": 740 }, { "epoch": 9.287925696594428, "grad_norm": 0.3424369692802429, "learning_rate": 1.921471959676957e-06, "loss": 0.5028, "step": 750 }, { "epoch": 9.411764705882353, "grad_norm": 0.3952752947807312, "learning_rate": 1.231165940486234e-06, "loss": 0.4951, "step": 760 }, { "epoch": 9.535603715170279, "grad_norm": 0.35501015186309814, "learning_rate": 6.931543045073708e-07, "loss": 0.5218, "step": 770 }, { "epoch": 9.659442724458204, "grad_norm": 0.3750287592411041, "learning_rate": 3.0826662668720364e-07, "loss": 0.5143, "step": 780 }, { "epoch": 9.78328173374613, "grad_norm": 0.336580365896225, "learning_rate": 7.709637592770991e-08, "loss": 0.5106, "step": 790 }, { "epoch": 9.907120743034056, "grad_norm": 0.35525137186050415, "learning_rate": 0.0, "loss": 0.514, "step": 800 }, { "epoch": 9.907120743034056, "step": 800, "total_flos": 3.90379106992128e+16, "train_loss": 0.5782659471035003, "train_runtime": 2175.2449, "train_samples_per_second": 1.485, "train_steps_per_second": 0.368 } ], "logging_steps": 10, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.90379106992128e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }