Training Script and LayerProfile
Browse files
- 32_TEST_bone_2b9_mytest.csv +35 -0
- step-2-train-sft-x070.sh +25 -0
32_TEST_bone_2b9_mytest.csv
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Layer,Mode,Rank,Alpha,Dropout,Weight_lr_init,Weight_lr_final,Weight_decay,State_lr_init,State_lr_final,RejectParts
|
2 |
+
emb,freeze,0,0,0.01,0.000001,0.0000001,0.01,0.05,0.01,
|
3 |
+
0,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
4 |
+
1,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
5 |
+
2,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
6 |
+
3,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
7 |
+
4,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
8 |
+
5,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
9 |
+
6,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
10 |
+
7,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
11 |
+
8,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
12 |
+
9,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
13 |
+
10,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
14 |
+
11,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
15 |
+
12,bone,256,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
16 |
+
13,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
17 |
+
14,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
18 |
+
15,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
19 |
+
16,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
20 |
+
17,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
21 |
+
18,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
22 |
+
19,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
23 |
+
20,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
24 |
+
21,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
25 |
+
22,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
26 |
+
23,bone,512,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
27 |
+
24,full,1280,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
28 |
+
25,full,1280,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
29 |
+
26,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
30 |
+
27,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
31 |
+
28,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
32 |
+
29,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
33 |
+
30,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
34 |
+
31,full,1024,32,0.01,5.00E-05,1.00E-05,0.01,0.05,0.01,
|
35 |
+
head,full,512,32,0.01,0.00001,0.000001,0.01,0.05,0.01,
|
step-2-train-sft-x070.sh
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Launch train.py for the x070 2b9 SFT run (see --sft 1 and --my_testing "x070").
#
# The whole invocation is ONE command. Every line except the last must end
# with " \" (a space, then a backslash): the shell deletes the backslash and
# the newline, so without the space (or leading indentation on the next line)
# two neighbouring arguments are glued into a single word.
#
# Per-layer settings are read from the CSV passed to --layer_profile.
# NOTE(review): all paths are relative, so this presumably has to be started
# from the directory that holds train.py, myfolder/ and layerprofile/ — confirm.
python train.py --load_model "myfolder/models/rwkv-x070-2b9-world-v3-40_trained-20250113-ctx4k.pth" \
  --wandb "RWKV-LM-RLHF x070-2b9 General JPENCN v3" \
  --proj_dir "myfolder/Outputs/x070GeneralJPENCNv3" \
  --state 0 \
  --infctx 0 \
  --vocab_size 65536 --ctx_len 5120 \
  --epoch_steps 2000 --epoch_count 200 --epoch_begin 0 --epoch_save 1 \
  --micro_bsz 3 --n_layer 32 --n_embd 2560 \
  --lr_init 1e-5 --lr_final 1e-6 \
  --warmup_steps 100 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \
  --accelerator gpu --devices 2 --precision 'bf16' \
  --grad_cp 1 --my_testing "x070" \
  --strategy deepspeed_stage_2_offload \
  --layer_profile 'layerprofile/32_TEST_bone_2b9_mytest.csv' \
  --quant 0 \
  --quant_mode 'nf4' \
  --gpu_arch 'rocm' \
  --limited_lora 0 \
  --sft 1 \
  --smoothing 0.001 \
  --random_mode 1 \
  --optim '' \
  --train_data_file 'myfolder/datasets/General-jpencnv3.h5' \
  --infctx_dataset_multiplier 8 \
  --accumulate_grad_batches 16
|