Training in progress, epoch 1

Browse files

Files changed (4) hide show

adapter_config.json +6 -3
adapter_model.safetensors +2 -2
trainer_state.json +590 -0
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -20,10 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "q_proj",
     "o_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "o_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:562e078b6a35e9889757848bbe9c7f2d25289bbbe309b3459915b84b169b6713
-size 14764744

 version https://git-lfs.github.com/spec/v1
+oid sha256:ac3b74eb05601f3640faca779bdef5346172fe8b2dbb6e8a38a60354b90a63b0
+size 78480072

trainer_state.json ADDED Viewed

	@@ -0,0 +1,590 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.907120743034056,
+  "eval_steps": 500,
+  "global_step": 800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.1238390092879257,
+      "grad_norm": 7.5771355628967285,
+      "learning_rate": 0.0001999229036240723,
+      "loss": 1.3532,
+      "step": 10
+    },
+    {
+      "epoch": 0.2476780185758514,
+      "grad_norm": 0.3571035861968994,
+      "learning_rate": 0.0001996917333733128,
+      "loss": 1.0315,
+      "step": 20
+    },
+    {
+      "epoch": 0.3715170278637771,
+      "grad_norm": 0.3278755843639374,
+      "learning_rate": 0.00019930684569549264,
+      "loss": 0.9018,
+      "step": 30
+    },
+    {
+      "epoch": 0.4953560371517028,
+      "grad_norm": 0.4095502495765686,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 0.807,
+      "step": 40
+    },
+    {
+      "epoch": 0.6191950464396285,
+      "grad_norm": 0.4147421717643738,
+      "learning_rate": 0.00019807852804032305,
+      "loss": 0.7204,
+      "step": 50
+    },
+    {
+      "epoch": 0.7430340557275542,
+      "grad_norm": 0.2525322735309601,
+      "learning_rate": 0.00019723699203976766,
+      "loss": 0.6548,
+      "step": 60
+    },
+    {
+      "epoch": 0.8668730650154799,
+      "grad_norm": 0.28104496002197266,
+      "learning_rate": 0.00019624552364536473,
+      "loss": 0.6687,
+      "step": 70
+    },
+    {
+      "epoch": 0.9907120743034056,
+      "grad_norm": 0.4467855393886566,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.6317,
+      "step": 80
+    },
+    {
+      "epoch": 1.1145510835913313,
+      "grad_norm": 0.2571893036365509,
+      "learning_rate": 0.00019381913359224842,
+      "loss": 0.6319,
+      "step": 90
+    },
+    {
+      "epoch": 1.238390092879257,
+      "grad_norm": 0.24546292424201965,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 0.6262,
+      "step": 100
+    },
+    {
+      "epoch": 1.3622291021671826,
+      "grad_norm": 0.24089373648166656,
+      "learning_rate": 0.00019081431738250814,
+      "loss": 0.6309,
+      "step": 110
+    },
+    {
+      "epoch": 1.4860681114551084,
+      "grad_norm": 0.24842403829097748,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.6142,
+      "step": 120
+    },
+    {
+      "epoch": 1.609907120743034,
+      "grad_norm": 0.2339727133512497,
+      "learning_rate": 0.00018724960070727972,
+      "loss": 0.6131,
+      "step": 130
+    },
+    {
+      "epoch": 1.7337461300309598,
+      "grad_norm": 0.21254688501358032,
+      "learning_rate": 0.00018526401643540922,
+      "loss": 0.5892,
+      "step": 140
+    },
+    {
+      "epoch": 1.8575851393188856,
+      "grad_norm": 0.34352943301200867,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 0.6037,
+      "step": 150
+    },
+    {
+      "epoch": 1.9814241486068112,
+      "grad_norm": 0.21427258849143982,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.6051,
+      "step": 160
+    },
+    {
+      "epoch": 2.1052631578947367,
+      "grad_norm": 0.23226872086524963,
+      "learning_rate": 0.00017853169308807448,
+      "loss": 0.6109,
+      "step": 170
+    },
+    {
+      "epoch": 2.2291021671826625,
+      "grad_norm": 0.254842072725296,
+      "learning_rate": 0.0001760405965600031,
+      "loss": 0.5912,
+      "step": 180
+    },
+    {
+      "epoch": 2.3529411764705883,
+      "grad_norm": 0.2571081519126892,
+      "learning_rate": 0.00017343225094356855,
+      "loss": 0.5975,
+      "step": 190
+    },
+    {
+      "epoch": 2.476780185758514,
+      "grad_norm": 0.25343191623687744,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.5786,
+      "step": 200
+    },
+    {
+      "epoch": 2.6006191950464395,
+      "grad_norm": 0.21258015930652618,
+      "learning_rate": 0.0001678800745532942,
+      "loss": 0.586,
+      "step": 210
+    },
+    {
+      "epoch": 2.7244582043343653,
+      "grad_norm": 0.25848379731178284,
+      "learning_rate": 0.00016494480483301836,
+      "loss": 0.5714,
+      "step": 220
+    },
+    {
+      "epoch": 2.848297213622291,
+      "grad_norm": 0.26716166734695435,
+      "learning_rate": 0.00016190939493098344,
+      "loss": 0.5887,
+      "step": 230
+    },
+    {
+      "epoch": 2.972136222910217,
+      "grad_norm": 0.23578402400016785,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.5902,
+      "step": 240
+    },
+    {
+      "epoch": 3.0959752321981426,
+      "grad_norm": 0.23565009236335754,
+      "learning_rate": 0.00015555702330196023,
+      "loss": 0.5792,
+      "step": 250
+    },
+    {
+      "epoch": 3.219814241486068,
+      "grad_norm": 0.2390134632587433,
+      "learning_rate": 0.0001522498564715949,
+      "loss": 0.5676,
+      "step": 260
+    },
+    {
+      "epoch": 3.343653250773994,
+      "grad_norm": 0.25006794929504395,
+      "learning_rate": 0.00014886212414969553,
+      "loss": 0.5788,
+      "step": 270
+    },
+    {
+      "epoch": 3.4674922600619196,
+      "grad_norm": 0.2533760666847229,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.5769,
+      "step": 280
+    },
+    {
+      "epoch": 3.5913312693498454,
+      "grad_norm": 0.2808171510696411,
+      "learning_rate": 0.0001418659737537428,
+      "loss": 0.5521,
+      "step": 290
+    },
+    {
+      "epoch": 3.715170278637771,
+      "grad_norm": 0.28783777356147766,
+      "learning_rate": 0.000138268343236509,
+      "loss": 0.5723,
+      "step": 300
+    },
+    {
+      "epoch": 3.8390092879256965,
+      "grad_norm": 0.29237958788871765,
+      "learning_rate": 0.0001346117057077493,
+      "loss": 0.5668,
+      "step": 310
+    },
+    {
+      "epoch": 3.9628482972136223,
+      "grad_norm": 0.2757062315940857,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.579,
+      "step": 320
+    },
+    {
+      "epoch": 4.086687306501548,
+      "grad_norm": 0.28595200181007385,
+      "learning_rate": 0.00012714404498650743,
+      "loss": 0.5488,
+      "step": 330
+    },
+    {
+      "epoch": 4.2105263157894735,
+      "grad_norm": 0.2707183063030243,
+      "learning_rate": 0.00012334453638559057,
+      "loss": 0.5511,
+      "step": 340
+    },
+    {
+      "epoch": 4.3343653250774,
+      "grad_norm": 0.3559975028038025,
+      "learning_rate": 0.00011950903220161285,
+      "loss": 0.5683,
+      "step": 350
+    },
+    {
+      "epoch": 4.458204334365325,
+      "grad_norm": 0.2762058973312378,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.5499,
+      "step": 360
+    },
+    {
+      "epoch": 4.58204334365325,
+      "grad_norm": 0.2717606723308563,
+      "learning_rate": 0.00011175373974578378,
+      "loss": 0.5612,
+      "step": 370
+    },
+    {
+      "epoch": 4.705882352941177,
+      "grad_norm": 0.27757707238197327,
+      "learning_rate": 0.0001078459095727845,
+      "loss": 0.5602,
+      "step": 380
+    },
+    {
+      "epoch": 4.829721362229102,
+      "grad_norm": 0.3977556526660919,
+      "learning_rate": 0.00010392598157590688,
+      "loss": 0.5459,
+      "step": 390
+    },
+    {
+      "epoch": 4.953560371517028,
+      "grad_norm": 0.26867300271987915,
+      "learning_rate": 0.0001,
+      "loss": 0.537,
+      "step": 400
+    },
+    {
+      "epoch": 5.077399380804954,
+      "grad_norm": 0.26843276619911194,
+      "learning_rate": 9.607401842409317e-05,
+      "loss": 0.5601,
+      "step": 410
+    },
+    {
+      "epoch": 5.201238390092879,
+      "grad_norm": 0.30268290638923645,
+      "learning_rate": 9.215409042721552e-05,
+      "loss": 0.5317,
+      "step": 420
+    },
+    {
+      "epoch": 5.325077399380805,
+      "grad_norm": 0.3163929581642151,
+      "learning_rate": 8.824626025421626e-05,
+      "loss": 0.5343,
+      "step": 430
+    },
+    {
+      "epoch": 5.4489164086687305,
+      "grad_norm": 0.2883571982383728,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.5255,
+      "step": 440
+    },
+    {
+      "epoch": 5.572755417956657,
+      "grad_norm": 0.3254496157169342,
+      "learning_rate": 8.049096779838719e-05,
+      "loss": 0.5281,
+      "step": 450
+    },
+    {
+      "epoch": 5.696594427244582,
+      "grad_norm": 0.2983749508857727,
+      "learning_rate": 7.66554636144095e-05,
+      "loss": 0.5515,
+      "step": 460
+    },
+    {
+      "epoch": 5.820433436532507,
+      "grad_norm": 0.2880017161369324,
+      "learning_rate": 7.285595501349258e-05,
+      "loss": 0.5575,
+      "step": 470
+    },
+    {
+      "epoch": 5.944272445820434,
+      "grad_norm": 0.43873119354248047,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.5367,
+      "step": 480
+    },
+    {
+      "epoch": 6.068111455108359,
+      "grad_norm": 0.33720219135284424,
+      "learning_rate": 6.538829429225069e-05,
+      "loss": 0.5509,
+      "step": 490
+    },
+    {
+      "epoch": 6.191950464396285,
+      "grad_norm": 0.3185509741306305,
+      "learning_rate": 6.173165676349103e-05,
+      "loss": 0.5176,
+      "step": 500
+    },
+    {
+      "epoch": 6.315789473684211,
+      "grad_norm": 0.3240034878253937,
+      "learning_rate": 5.8134026246257225e-05,
+      "loss": 0.5306,
+      "step": 510
+    },
+    {
+      "epoch": 6.439628482972136,
+      "grad_norm": 0.33068713545799255,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.517,
+      "step": 520
+    },
+    {
+      "epoch": 6.563467492260062,
+      "grad_norm": 0.33544909954071045,
+      "learning_rate": 5.113787585030454e-05,
+      "loss": 0.5288,
+      "step": 530
+    },
+    {
+      "epoch": 6.687306501547988,
+      "grad_norm": 0.3468843400478363,
+      "learning_rate": 4.7750143528405126e-05,
+      "loss": 0.5222,
+      "step": 540
+    },
+    {
+      "epoch": 6.811145510835914,
+      "grad_norm": 0.33482104539871216,
+      "learning_rate": 4.444297669803981e-05,
+      "loss": 0.5227,
+      "step": 550
+    },
+    {
+      "epoch": 6.934984520123839,
+      "grad_norm": 0.3804668188095093,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.5251,
+      "step": 560
+    },
+    {
+      "epoch": 7.0588235294117645,
+      "grad_norm": 0.3186335861682892,
+      "learning_rate": 3.8090605069016595e-05,
+      "loss": 0.5264,
+      "step": 570
+    },
+    {
+      "epoch": 7.182662538699691,
+      "grad_norm": 0.34853196144104004,
+      "learning_rate": 3.5055195166981645e-05,
+      "loss": 0.5281,
+      "step": 580
+    },
+    {
+      "epoch": 7.306501547987616,
+      "grad_norm": 0.36451995372772217,
+      "learning_rate": 3.211992544670582e-05,
+      "loss": 0.5074,
+      "step": 590
+    },
+    {
+      "epoch": 7.430340557275541,
+      "grad_norm": 0.3326849639415741,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.5249,
+      "step": 600
+    },
+    {
+      "epoch": 7.554179566563468,
+      "grad_norm": 0.3516250550746918,
+      "learning_rate": 2.6567749056431467e-05,
+      "loss": 0.5112,
+      "step": 610
+    },
+    {
+      "epoch": 7.678018575851393,
+      "grad_norm": 0.3434501588344574,
+      "learning_rate": 2.3959403439996907e-05,
+      "loss": 0.5176,
+      "step": 620
+    },
+    {
+      "epoch": 7.801857585139319,
+      "grad_norm": 0.3573139011859894,
+      "learning_rate": 2.146830691192553e-05,
+      "loss": 0.5251,
+      "step": 630
+    },
+    {
+      "epoch": 7.925696594427245,
+      "grad_norm": 0.3552079200744629,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.5066,
+      "step": 640
+    },
+    {
+      "epoch": 8.04953560371517,
+      "grad_norm": 0.3211444020271301,
+      "learning_rate": 1.6853038769745467e-05,
+      "loss": 0.5255,
+      "step": 650
+    },
+    {
+      "epoch": 8.173374613003096,
+      "grad_norm": 0.3437272310256958,
+      "learning_rate": 1.4735983564590783e-05,
+      "loss": 0.5152,
+      "step": 660
+    },
+    {
+      "epoch": 8.297213622291022,
+      "grad_norm": 0.39420753717422485,
+      "learning_rate": 1.2750399292720283e-05,
+      "loss": 0.5147,
+      "step": 670
+    },
+    {
+      "epoch": 8.421052631578947,
+      "grad_norm": 0.33711323142051697,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.5053,
+      "step": 680
+    },
+    {
+      "epoch": 8.544891640866872,
+      "grad_norm": 0.41042107343673706,
+      "learning_rate": 9.185682617491863e-06,
+      "loss": 0.4921,
+      "step": 690
+    },
+    {
+      "epoch": 8.6687306501548,
+      "grad_norm": 0.3305673897266388,
+      "learning_rate": 7.612046748871327e-06,
+      "loss": 0.5182,
+      "step": 700
+    },
+    {
+      "epoch": 8.792569659442725,
+      "grad_norm": 0.42124322056770325,
+      "learning_rate": 6.180866407751595e-06,
+      "loss": 0.5229,
+      "step": 710
+    },
+    {
+      "epoch": 8.91640866873065,
+      "grad_norm": 0.3476842939853668,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.517,
+      "step": 720
+    },
+    {
+      "epoch": 9.040247678018575,
+      "grad_norm": 0.3766673803329468,
+      "learning_rate": 3.7544763546352834e-06,
+      "loss": 0.4968,
+      "step": 730
+    },
+    {
+      "epoch": 9.1640866873065,
+      "grad_norm": 0.34576883912086487,
+      "learning_rate": 2.7630079602323442e-06,
+      "loss": 0.5143,
+      "step": 740
+    },
+    {
+      "epoch": 9.287925696594428,
+      "grad_norm": 0.3424369692802429,
+      "learning_rate": 1.921471959676957e-06,
+      "loss": 0.5028,
+      "step": 750
+    },
+    {
+      "epoch": 9.411764705882353,
+      "grad_norm": 0.3952752947807312,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.4951,
+      "step": 760
+    },
+    {
+      "epoch": 9.535603715170279,
+      "grad_norm": 0.35501015186309814,
+      "learning_rate": 6.931543045073708e-07,
+      "loss": 0.5218,
+      "step": 770
+    },
+    {
+      "epoch": 9.659442724458204,
+      "grad_norm": 0.3750287592411041,
+      "learning_rate": 3.0826662668720364e-07,
+      "loss": 0.5143,
+      "step": 780
+    },
+    {
+      "epoch": 9.78328173374613,
+      "grad_norm": 0.336580365896225,
+      "learning_rate": 7.709637592770991e-08,
+      "loss": 0.5106,
+      "step": 790
+    },
+    {
+      "epoch": 9.907120743034056,
+      "grad_norm": 0.35525137186050415,
+      "learning_rate": 0.0,
+      "loss": 0.514,
+      "step": 800
+    },
+    {
+      "epoch": 9.907120743034056,
+      "step": 800,
+      "total_flos": 3.90379106992128e+16,
+      "train_loss": 0.5782659471035003,
+      "train_runtime": 2175.2449,
+      "train_samples_per_second": 1.485,
+      "train_steps_per_second": 0.368
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 800,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "total_flos": 3.90379106992128e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6bcabbd06ed3bb6e5375af9e0f56fdfd6c61e840195cfe0cd74d724a76ebccf
 size 4984

 version https://git-lfs.github.com/spec/v1
+oid sha256:705a2853995e314484503d1a0ae51c3da964356e26fab03b8c300c38563e3519
 size 4984