{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.907120743034056,
  "eval_steps": 500,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1238390092879257,
      "grad_norm": 7.5771355628967285,
      "learning_rate": 0.0001999229036240723,
      "loss": 1.3532,
      "step": 10
    },
    {
      "epoch": 0.2476780185758514,
      "grad_norm": 0.3571035861968994,
      "learning_rate": 0.0001996917333733128,
      "loss": 1.0315,
      "step": 20
    },
    {
      "epoch": 0.3715170278637771,
      "grad_norm": 0.3278755843639374,
      "learning_rate": 0.00019930684569549264,
      "loss": 0.9018,
      "step": 30
    },
    {
      "epoch": 0.4953560371517028,
      "grad_norm": 0.4095502495765686,
      "learning_rate": 0.00019876883405951377,
      "loss": 0.807,
      "step": 40
    },
    {
      "epoch": 0.6191950464396285,
      "grad_norm": 0.4147421717643738,
      "learning_rate": 0.00019807852804032305,
      "loss": 0.7204,
      "step": 50
    },
    {
      "epoch": 0.7430340557275542,
      "grad_norm": 0.2525322735309601,
      "learning_rate": 0.00019723699203976766,
      "loss": 0.6548,
      "step": 60
    },
    {
      "epoch": 0.8668730650154799,
      "grad_norm": 0.28104496002197266,
      "learning_rate": 0.00019624552364536473,
      "loss": 0.6687,
      "step": 70
    },
    {
      "epoch": 0.9907120743034056,
      "grad_norm": 0.4467855393886566,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.6317,
      "step": 80
    },
    {
      "epoch": 1.1145510835913313,
      "grad_norm": 0.2571893036365509,
      "learning_rate": 0.00019381913359224842,
      "loss": 0.6319,
      "step": 90
    },
    {
      "epoch": 1.238390092879257,
      "grad_norm": 0.24546292424201965,
      "learning_rate": 0.0001923879532511287,
      "loss": 0.6262,
      "step": 100
    },
    {
      "epoch": 1.3622291021671826,
      "grad_norm": 0.24089373648166656,
      "learning_rate": 0.00019081431738250814,
      "loss": 0.6309,
      "step": 110
    },
    {
      "epoch": 1.4860681114551084,
      "grad_norm": 0.24842403829097748,
      "learning_rate": 0.0001891006524188368,
      "loss": 0.6142,
      "step": 120
    },
    {
      "epoch": 1.609907120743034,
      "grad_norm": 0.2339727133512497,
      "learning_rate": 0.00018724960070727972,
      "loss": 0.6131,
      "step": 130
    },
    {
      "epoch": 1.7337461300309598,
      "grad_norm": 0.21254688501358032,
      "learning_rate": 0.00018526401643540922,
      "loss": 0.5892,
      "step": 140
    },
    {
      "epoch": 1.8575851393188856,
      "grad_norm": 0.34352943301200867,
      "learning_rate": 0.00018314696123025454,
      "loss": 0.6037,
      "step": 150
    },
    {
      "epoch": 1.9814241486068112,
      "grad_norm": 0.21427258849143982,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.6051,
      "step": 160
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.23226872086524963,
      "learning_rate": 0.00017853169308807448,
      "loss": 0.6109,
      "step": 170
    },
    {
      "epoch": 2.2291021671826625,
      "grad_norm": 0.254842072725296,
      "learning_rate": 0.0001760405965600031,
      "loss": 0.5912,
      "step": 180
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.2571081519126892,
      "learning_rate": 0.00017343225094356855,
      "loss": 0.5975,
      "step": 190
    },
    {
      "epoch": 2.476780185758514,
      "grad_norm": 0.25343191623687744,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.5786,
      "step": 200
    },
    {
      "epoch": 2.6006191950464395,
      "grad_norm": 0.21258015930652618,
      "learning_rate": 0.0001678800745532942,
      "loss": 0.586,
      "step": 210
    },
    {
      "epoch": 2.7244582043343653,
      "grad_norm": 0.25848379731178284,
      "learning_rate": 0.00016494480483301836,
      "loss": 0.5714,
      "step": 220
    },
    {
      "epoch": 2.848297213622291,
      "grad_norm": 0.26716166734695435,
      "learning_rate": 0.00016190939493098344,
      "loss": 0.5887,
      "step": 230
    },
    {
      "epoch": 2.972136222910217,
      "grad_norm": 0.23578402400016785,
      "learning_rate": 0.00015877852522924732,
      "loss": 0.5902,
      "step": 240
    },
    {
      "epoch": 3.0959752321981426,
      "grad_norm": 0.23565009236335754,
      "learning_rate": 0.00015555702330196023,
      "loss": 0.5792,
      "step": 250
    },
    {
      "epoch": 3.219814241486068,
      "grad_norm": 0.2390134632587433,
      "learning_rate": 0.0001522498564715949,
      "loss": 0.5676,
      "step": 260
    },
    {
      "epoch": 3.343653250773994,
      "grad_norm": 0.25006794929504395,
      "learning_rate": 0.00014886212414969553,
      "loss": 0.5788,
      "step": 270
    },
    {
      "epoch": 3.4674922600619196,
      "grad_norm": 0.2533760666847229,
      "learning_rate": 0.00014539904997395468,
      "loss": 0.5769,
      "step": 280
    },
    {
      "epoch": 3.5913312693498454,
      "grad_norm": 0.2808171510696411,
      "learning_rate": 0.0001418659737537428,
      "loss": 0.5521,
      "step": 290
    },
    {
      "epoch": 3.715170278637771,
      "grad_norm": 0.28783777356147766,
      "learning_rate": 0.000138268343236509,
      "loss": 0.5723,
      "step": 300
    },
    {
      "epoch": 3.8390092879256965,
      "grad_norm": 0.29237958788871765,
      "learning_rate": 0.0001346117057077493,
      "loss": 0.5668,
      "step": 310
    },
    {
      "epoch": 3.9628482972136223,
      "grad_norm": 0.2757062315940857,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.579,
      "step": 320
    },
    {
      "epoch": 4.086687306501548,
      "grad_norm": 0.28595200181007385,
      "learning_rate": 0.00012714404498650743,
      "loss": 0.5488,
      "step": 330
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 0.2707183063030243,
      "learning_rate": 0.00012334453638559057,
      "loss": 0.5511,
      "step": 340
    },
    {
      "epoch": 4.3343653250774,
      "grad_norm": 0.3559975028038025,
      "learning_rate": 0.00011950903220161285,
      "loss": 0.5683,
      "step": 350
    },
    {
      "epoch": 4.458204334365325,
      "grad_norm": 0.2762058973312378,
      "learning_rate": 0.0001156434465040231,
      "loss": 0.5499,
      "step": 360
    },
    {
      "epoch": 4.58204334365325,
      "grad_norm": 0.2717606723308563,
      "learning_rate": 0.00011175373974578378,
      "loss": 0.5612,
      "step": 370
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 0.27757707238197327,
      "learning_rate": 0.0001078459095727845,
      "loss": 0.5602,
      "step": 380
    },
    {
      "epoch": 4.829721362229102,
      "grad_norm": 0.3977556526660919,
      "learning_rate": 0.00010392598157590688,
      "loss": 0.5459,
      "step": 390
    },
    {
      "epoch": 4.953560371517028,
      "grad_norm": 0.26867300271987915,
      "learning_rate": 0.0001,
      "loss": 0.537,
      "step": 400
    },
    {
      "epoch": 5.077399380804954,
      "grad_norm": 0.26843276619911194,
      "learning_rate": 9.607401842409317e-05,
      "loss": 0.5601,
      "step": 410
    },
    {
      "epoch": 5.201238390092879,
      "grad_norm": 0.30268290638923645,
      "learning_rate": 9.215409042721552e-05,
      "loss": 0.5317,
      "step": 420
    },
    {
      "epoch": 5.325077399380805,
      "grad_norm": 0.3163929581642151,
      "learning_rate": 8.824626025421626e-05,
      "loss": 0.5343,
      "step": 430
    },
    {
      "epoch": 5.4489164086687305,
      "grad_norm": 0.2883571982383728,
      "learning_rate": 8.435655349597689e-05,
      "loss": 0.5255,
      "step": 440
    },
    {
      "epoch": 5.572755417956657,
      "grad_norm": 0.3254496157169342,
      "learning_rate": 8.049096779838719e-05,
      "loss": 0.5281,
      "step": 450
    },
    {
      "epoch": 5.696594427244582,
      "grad_norm": 0.2983749508857727,
      "learning_rate": 7.66554636144095e-05,
      "loss": 0.5515,
      "step": 460
    },
    {
      "epoch": 5.820433436532507,
      "grad_norm": 0.2880017161369324,
      "learning_rate": 7.285595501349258e-05,
      "loss": 0.5575,
      "step": 470
    },
    {
      "epoch": 5.944272445820434,
      "grad_norm": 0.43873119354248047,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.5367,
      "step": 480
    },
    {
      "epoch": 6.068111455108359,
      "grad_norm": 0.33720219135284424,
      "learning_rate": 6.538829429225069e-05,
      "loss": 0.5509,
      "step": 490
    },
    {
      "epoch": 6.191950464396285,
      "grad_norm": 0.3185509741306305,
      "learning_rate": 6.173165676349103e-05,
      "loss": 0.5176,
      "step": 500
    },
    {
      "epoch": 6.315789473684211,
      "grad_norm": 0.3240034878253937,
      "learning_rate": 5.8134026246257225e-05,
      "loss": 0.5306,
      "step": 510
    },
    {
      "epoch": 6.439628482972136,
      "grad_norm": 0.33068713545799255,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 0.517,
      "step": 520
    },
    {
      "epoch": 6.563467492260062,
      "grad_norm": 0.33544909954071045,
      "learning_rate": 5.113787585030454e-05,
      "loss": 0.5288,
      "step": 530
    },
    {
      "epoch": 6.687306501547988,
      "grad_norm": 0.3468843400478363,
      "learning_rate": 4.7750143528405126e-05,
      "loss": 0.5222,
      "step": 540
    },
    {
      "epoch": 6.811145510835914,
      "grad_norm": 0.33482104539871216,
      "learning_rate": 4.444297669803981e-05,
      "loss": 0.5227,
      "step": 550
    },
    {
      "epoch": 6.934984520123839,
      "grad_norm": 0.3804668188095093,
      "learning_rate": 4.12214747707527e-05,
      "loss": 0.5251,
      "step": 560
    },
    {
      "epoch": 7.0588235294117645,
      "grad_norm": 0.3186335861682892,
      "learning_rate": 3.8090605069016595e-05,
      "loss": 0.5264,
      "step": 570
    },
    {
      "epoch": 7.182662538699691,
      "grad_norm": 0.34853196144104004,
      "learning_rate": 3.5055195166981645e-05,
      "loss": 0.5281,
      "step": 580
    },
    {
      "epoch": 7.306501547987616,
      "grad_norm": 0.36451995372772217,
      "learning_rate": 3.211992544670582e-05,
      "loss": 0.5074,
      "step": 590
    },
    {
      "epoch": 7.430340557275541,
      "grad_norm": 0.3326849639415741,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.5249,
      "step": 600
    },
    {
      "epoch": 7.554179566563468,
      "grad_norm": 0.3516250550746918,
      "learning_rate": 2.6567749056431467e-05,
      "loss": 0.5112,
      "step": 610
    },
    {
      "epoch": 7.678018575851393,
      "grad_norm": 0.3434501588344574,
      "learning_rate": 2.3959403439996907e-05,
      "loss": 0.5176,
      "step": 620
    },
    {
      "epoch": 7.801857585139319,
      "grad_norm": 0.3573139011859894,
      "learning_rate": 2.146830691192553e-05,
      "loss": 0.5251,
      "step": 630
    },
    {
      "epoch": 7.925696594427245,
      "grad_norm": 0.3552079200744629,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.5066,
      "step": 640
    },
    {
      "epoch": 8.04953560371517,
      "grad_norm": 0.3211444020271301,
      "learning_rate": 1.6853038769745467e-05,
      "loss": 0.5255,
      "step": 650
    },
    {
      "epoch": 8.173374613003096,
      "grad_norm": 0.3437272310256958,
      "learning_rate": 1.4735983564590783e-05,
      "loss": 0.5152,
      "step": 660
    },
    {
      "epoch": 8.297213622291022,
      "grad_norm": 0.39420753717422485,
      "learning_rate": 1.2750399292720283e-05,
      "loss": 0.5147,
      "step": 670
    },
    {
      "epoch": 8.421052631578947,
      "grad_norm": 0.33711323142051697,
      "learning_rate": 1.0899347581163221e-05,
      "loss": 0.5053,
      "step": 680
    },
    {
      "epoch": 8.544891640866872,
      "grad_norm": 0.41042107343673706,
      "learning_rate": 9.185682617491863e-06,
      "loss": 0.4921,
      "step": 690
    },
    {
      "epoch": 8.6687306501548,
      "grad_norm": 0.3305673897266388,
      "learning_rate": 7.612046748871327e-06,
      "loss": 0.5182,
      "step": 700
    },
    {
      "epoch": 8.792569659442725,
      "grad_norm": 0.42124322056770325,
      "learning_rate": 6.180866407751595e-06,
      "loss": 0.5229,
      "step": 710
    },
    {
      "epoch": 8.91640866873065,
      "grad_norm": 0.3476842939853668,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 0.517,
      "step": 720
    },
    {
      "epoch": 9.040247678018575,
      "grad_norm": 0.3766673803329468,
      "learning_rate": 3.7544763546352834e-06,
      "loss": 0.4968,
      "step": 730
    },
    {
      "epoch": 9.1640866873065,
      "grad_norm": 0.34576883912086487,
      "learning_rate": 2.7630079602323442e-06,
      "loss": 0.5143,
      "step": 740
    },
    {
      "epoch": 9.287925696594428,
      "grad_norm": 0.3424369692802429,
      "learning_rate": 1.921471959676957e-06,
      "loss": 0.5028,
      "step": 750
    },
    {
      "epoch": 9.411764705882353,
      "grad_norm": 0.3952752947807312,
      "learning_rate": 1.231165940486234e-06,
      "loss": 0.4951,
      "step": 760
    },
    {
      "epoch": 9.535603715170279,
      "grad_norm": 0.35501015186309814,
      "learning_rate": 6.931543045073708e-07,
      "loss": 0.5218,
      "step": 770
    },
    {
      "epoch": 9.659442724458204,
      "grad_norm": 0.3750287592411041,
      "learning_rate": 3.0826662668720364e-07,
      "loss": 0.5143,
      "step": 780
    },
    {
      "epoch": 9.78328173374613,
      "grad_norm": 0.336580365896225,
      "learning_rate": 7.709637592770991e-08,
      "loss": 0.5106,
      "step": 790
    },
    {
      "epoch": 9.907120743034056,
      "grad_norm": 0.35525137186050415,
      "learning_rate": 0.0,
      "loss": 0.514,
      "step": 800
    },
    {
      "epoch": 9.907120743034056,
      "step": 800,
      "total_flos": 3.90379106992128e+16,
      "train_loss": 0.5782659471035003,
      "train_runtime": 2175.2449,
      "train_samples_per_second": 1.485,
      "train_steps_per_second": 0.368
    }
  ],
  "logging_steps": 10,
  "max_steps": 800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 3.90379106992128e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}