winglian committed on
Commit 9ac16ed • 2 Parent(s): f620706 6b3f509

Merge pull request #190 from OpenAccess-AI-Collective/fixes-20230711-v2

README.md CHANGED
@@ -16,13 +16,14 @@

  ## Axolotl supports

- | | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
- |---------|:----------|:------------------|------|------------|------------------------------|-----------------|--------------------|
- | llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
- | Pythia | ✅ | ✅ | ❓ | ❌ | ❌ | ❌ | ❓ |
- | cerebras | ✅ | ✅ | ❓ | ❌ | ❌ | ❌ | ❓ |
- | mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
- | falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
+ | | fp16/fp32 | lora | qlora | gptq | gptq w/ lora | gptq w/flash attn | flash attn | xformers attn |
+ |----------|:----------|:-----|-------|------|:-------------|-------------------|------------|---------------|
+ | llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+ | Pythia | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❌ | ❓ |
+ | cerebras | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❌ | ✅ |
+ | mpt | ✅ | ❌ | ❓ | ❌ | ❓ | ❌ | ❌ | ❓ |
+ | falcon | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❌ | ✅ |
+ | gpt-j | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❓ | ✅ |

  ## Quickstart ⚡
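The columns in the updated support matrix correspond to config keys used by the example YAML files elsewhere in this commit. As a rough, hypothetical sketch (the keys appear in this repo's example configs; the values are placeholders and not part of this diff), a llama QLoRA run with flash attention would combine them like this:

```yaml
# Hypothetical sketch: llama-family model + qlora + flash attention.
# Keys mirror this repo's example configs; values are placeholders.
base_model: openlm-research/open_llama_3b
base_model_config: openlm-research/open_llama_3b
load_in_8bit: false
load_in_4bit: true        # qlora loads the base weights in 4-bit
adapter: qlora
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
flash_attention: true     # per the table, flash attention is ticked only for llama
xformers_attention:
```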
configs/cerebras_1_3B_alpaca.yml DELETED
@@ -1,40 +0,0 @@
- base_model: cerebras/Cerebras-GPT-1.3B
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
- load_in_8bit: true
- datasets:
-   - path: data/alpaca_data_gpt4.jsonl
-     type: alpaca
-   - path: data/vicuna_cleaned.jsonl
-     type: sharegpt
-   - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-     type: gpteacher
-   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-     type: gpteacher
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.05
- adapter: lora
- sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - c_attn
- lora_fan_in_fan_out: false
- wandb_project: pythia-1.4b-lora
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-alpaca
- gradient_accumulation_steps: 1
- micro_batch_size: 4
- num_epochs: 5
- learning_rate: 0.0003
- train_on_inputs: false
- group_by_length: false
- bf16: True
- tf32: True
- gradient_checkpointing:
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
configs/galactica_1_3B.yml DELETED
@@ -1,41 +0,0 @@
- base_model: facebook/galactica-1.3b
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
- load_in_8bit: false
- datasets:
-   - path: tatsu-lab/alpaca
-     type: alpaca
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.1
- adapter:
- lora_model_dir:
- sequence_len: 1024
- max_packed_sequence_len: 1024
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
- lora_fan_in_fan_out: false
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-llama-alpaca
- gradient_accumulation_steps: 1
- micro_batch_size: 16
- num_epochs: 3
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: false
- tf32: false
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- tokens:
-   pad_token: "[PAD]"
-   bos_token: "<s>"
-   eos_token: "</s>"
-   unk_token: "<unk>"
configs/gpt_neox_20b.yml DELETED
@@ -1,39 +0,0 @@
- base_model: EleutherAI/gpt-neox-20b
- base_model_ignore_patterns: pytorch* # prefer safetensors
- model_type: GPTNeoXForCausalLM
- tokenizer_type: AutoTokenizer
- load_in_8bit: true
- datasets:
-   - path: nomic-ai/gpt4all-j-prompt-generations
-     type: alpaca
-     shards: 4
-     shards_index: 0
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.05
- adapter: lora
- lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len: 2048
- lora_r: 8
- lora_alpha: 32
- lora_dropout: 0.05
- lora_target_modules:
-   - query_key_value
- lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
- wandb_project: gpt4all-neox-20b
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./gpt4all-neox-20b
- gradient_accumulation_steps: 1
- micro_batch_size: 4
- num_epochs: 5
- learning_rate: 0.00003
- lr_scheduler: one_cycle
- train_on_inputs: false
- group_by_length: false
- bf16: True
- tf32: True
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
configs/llama_7B_alpaca.yml DELETED
@@ -1,41 +0,0 @@
- base_model: huggyllama/llama-7b
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- load_in_8bit: true
- datasets:
-   - path: data/alpaca_data_gpt4.jsonl
-     type: alpaca
-   - path: data/vicuna_cleaned.jsonl
-     type: sharegpt
-   - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-     type: gpteacher
-   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-     type: gpteacher
- dataset_prepared_path: last_run_prepared
- val_set_size: 0.04
- adapter: lora
- lora_model_dir:
- sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
- lora_fan_in_fan_out: false
- wandb_project: llama-7b-lora
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- output_dir: ./lora-llama-alpaca
- gradient_accumulation_steps: 1
- micro_batch_size: 16
- num_epochs: 5
- learning_rate: 0.00003
- train_on_inputs: false
- group_by_length: false
- bf16: true
- tf32: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
configs/sample.yml DELETED
@@ -1,87 +0,0 @@
- # this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
- # this can also be a relative path to a model on disk
- base_model: decapoda-research/llama-7b-hf-int4
- # you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
- base_model_ignore_patterns:
- # if the base_model repo on hf hub doesn't include configuration .json files,
- # you can set that here, or leave this empty to default to base_model
- base_model_config: decapoda-research/llama-7b-hf
- # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
- model_type: AutoModelForCausalLM
- # Corresponding tokenizer for the model AutoTokenizer is a good choice
- tokenizer_type: AutoTokenizer
- # whether you are training a 4-bit quantized model
- load_4bit: true
- # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
- load_in_8bit: true
- # a list of one or more datasets to finetune the model with
- datasets:
-   # this can be either a hf dataset, or relative path
-   - path: vicgalle/alpaca-gpt4
-     # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-     type: alpaca
- # axolotl attempts to save the dataset as an arrow after packing the data together so
- # subsequent training attempts load faster, relative path
- dataset_prepared_path: data/last_run_prepared
- # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
- val_set_size: 0.04
- # if you want to use lora, leave blank to train all parameters in original model
- adapter: lora
- # if you already have a lora model trained that you want to load, put that here
- lora_model_dir:
- # the maximum length of an input to train with, this should typically be less than 2048
- # as most models have a token/context limit of 2048
- sequence_len: 2048
- # max sequence length to concatenate training samples together up to
- # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
- max_packed_sequence_len: 1024
- # lora hyperparameters
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
-   - q_proj
-   - v_proj
-   # - k_proj
-   # - o_proj
- lora_fan_in_fan_out: false
- # wandb configuration if you're using it
- wandb_project:
- wandb_watch:
- wandb_run_id:
- wandb_log_model:
- # where to save the finished model to
- output_dir: ./completed-model
- # training hyperparameters
- gradient_accumulation_steps: 1
- batch_size:
- micro_batch_size: 2
- num_epochs: 3
- warmup_steps: 100
- learning_rate: 0.00003
- # whether to mask out or include the human's prompt from the training labels
- train_on_inputs: false
- # don't use this, leads to wonky training (according to someone on the internet)
- group_by_length: false
- # Use CUDA bf16
- bf16: true
- # Use CUDA tf32
- tf32: true
- # does not work with current implementation of 4-bit LoRA
- gradient_checkpointing: false
- # stop training after this many evaluation losses have increased in a row
- # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
- early_stopping_patience: 3
- # specify a scheduler to use with the optimizer. only one_cycle is supported currently
- lr_scheduler:
- # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
- xformers_attention:
- # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
- flash_attention:
- # resume from a specific checkpoint dir
- resume_from_checkpoint:
- # if resume_from_checkpoint isn't set and you simply want it to start where it left off
- # be careful with this being turned on between different models
- auto_resume_from_checkpoints: false
- # don't mess with this, it's here for accelerate and torchrun
- local_rank:
examples/cerebras/qlora.yml ADDED
@@ -0,0 +1,60 @@
+ base_model: cerebras/Cerebras-GPT-1.3B
+ base_model_config: cerebras/Cerebras-GPT-1.3B
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+ push_dataset_to_hub:
+ datasets:
+   - path: teknium/GPT4-LLM-Cleaned
+     type: alpaca
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.01
+ adapter: qlora
+ lora_model_dir:
+ sequence_len: 2048
+ max_packed_sequence_len: 2048
+ lora_r: 16
+ lora_alpha: 32
+ lora_dropout: 0.05
+ lora_target_modules:
+   - c_fc
+   - c_attn
+   - c_proj
+ lora_target_linear:
+ lora_fan_in_fan_out:
+ wandb_project:
+ wandb_watch:
+ wandb_run_id:
+ wandb_log_model:
+ output_dir: ./qlora-out
+ batch_size: 4
+ micro_batch_size: 4
+ num_epochs: 2
+ optimizer: paged_adamw_8bit
+ torchdistx_path:
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+ train_on_inputs: false
+ group_by_length: true
+ bf16: true
+ fp16: false
+ tf32: true
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention: true
+ flash_attention:
+ gptq_groupsize:
+ gptq_model_v1:
+ warmup_steps: 10
+ eval_steps: 20
+ save_steps:
+ debug:
+ deepspeed:
+ weight_decay: 0.1
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   pad_token: "<|endoftext|>"
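To try this new Cerebras QLoRA example, the launch pattern from examples/openllama-3b/README.md (added in this same PR) should presumably carry over; only the path comes from the file added above, so treat the command as a hedged sketch rather than part of the diff:

```shell
accelerate launch scripts/finetune.py examples/cerebras/qlora.yml
```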
configs/stability_3b.yml → examples/gptj/qlora.yml RENAMED
@@ -1,38 +1,42 @@
- base_model: stabilityai/stablelm-base-alpha-3b
- base_model_config: stabilityai/stablelm-base-alpha-3b
+ base_model: EleutherAI/gpt-j-6b
+ base_model_config: EleutherAI/gpt-j-6b
  load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+ push_dataset_to_hub:
  datasets:
-   - path: vicgalle/alpaca-gpt4
+   - path: teknium/GPT4-LLM-Cleaned
      type: alpaca
  dataset_prepared_path: last_run_prepared
- val_set_size: 0.04
- adapter:
+ val_set_size: 0.01
+ adapter: qlora
  lora_model_dir:
- sequence_len: 4096
- max_packed_sequence_len: 4096
+ sequence_len: 2048
+ max_packed_sequence_len:
  lora_r: 8
- lora_alpha: 16
+ lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
-   - q_proj
-   - v_proj
- lora_fan_in_fan_out: false
- wandb_project: stable-alpaca-3b
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+ wandb_project:
  wandb_watch:
  wandb_run_id:
  wandb_log_model:
- output_dir: ./stable-alpaca-3b
- gradient_accumulation_steps: 1
- micro_batch_size: 1
- num_epochs: 1
- optimizer: adamw_bnb_8bit
+ output_dir: ./qlora-out
+ gradient_accumulation_steps: 2
+ micro_batch_size: 2
+ num_epochs: 2
+ optimizer: paged_adamw_8bit
  torchdistx_path:
  lr_scheduler: cosine
- learning_rate: 0.0000002
+ learning_rate: 0.0001
  train_on_inputs: false
- group_by_length: false
+ group_by_length: true
  bf16: true
+ fp16: false
  tf32: true
+ gradient_checkpointing: true
  early_stopping_patience:
  resume_from_checkpoint:
  local_rank:
@@ -41,16 +45,13 @@ xformers_attention: true
  flash_attention:
  gptq_groupsize:
  gptq_model_v1:
- warmup_steps: 100
- eval_steps: 50
- save_steps: 200
+ warmup_steps: 10
+ eval_steps: 20
+ save_steps:
  debug:
  deepspeed:
- weight_decay: 0.01
+ weight_decay: 0.1
  fsdp:
  fsdp_config:
- #tokens:
- # pad_token: "[PAD]"
- # bos_token: "<s>"
- # eos_token: "</s>"
- # unk_token: "<unk>"
+ special_tokens:
+   pad_token: "<|endoftext|>"
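By analogy with the other example configs in this PR, the renamed GPT-J QLoRA config can presumably be launched the same way (a hedged sketch; only the path is taken from the rename above):

```shell
accelerate launch scripts/finetune.py examples/gptj/qlora.yml
```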
configs/llama_7B_jeopardy.yml → examples/jeopardy-bot/config.yml RENAMED
@@ -7,30 +7,28 @@ datasets:
    - path: openaccess-ai-collective/jeopardy
      type: jeopardy
  dataset_prepared_path: last_run_prepared
- val_set_size: 0.01
+ val_set_size: 0.02
  adapter:
  lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len: 2048
- lora_r: 8
- lora_alpha: 16
- lora_dropout: 0.05
+ sequence_len: 512
+ max_packed_sequence_len:
+ lora_r:
+ lora_alpha:
+ lora_dropout:
  lora_target_modules:
-   - q_proj
-   - v_proj
  lora_fan_in_fan_out: false
- wandb_project: jeopardy-bot-7b
+ wandb_project:
  wandb_watch:
  wandb_run_id:
  wandb_log_model:
  output_dir: ./jeopardy-bot-7b
- gradient_accumulation_steps: 2
+ gradient_accumulation_steps: 1
  micro_batch_size: 1
- num_epochs: 2
+ num_epochs: 3
  optimizer: adamw_bnb_8bit
  torchdistx_path:
  lr_scheduler: cosine
- learning_rate: 0.0000002
+ learning_rate: 0.00003
  train_on_inputs: false
  group_by_length: false
  bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
  save_steps: 660
  debug:
  deepspeed:
- weight_decay: 0.0001
+ weight_decay: 0.1
  fsdp:
  fsdp_config:
  tokens:
-   pad_token: "[PAD]"
    bos_token: "<s>"
    eos_token: "</s>"
    unk_token: "<unk>"
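The same hedged launch pattern should apply to the relocated jeopardy-bot config (path taken from the rename target above):

```shell
accelerate launch scripts/finetune.py examples/jeopardy-bot/config.yml
```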
examples/openllama-3b/README.md ADDED
@@ -0,0 +1,16 @@
+ # openllama-3b
+
+ Basic full tune
+ ```shell
+ accelerate launch scripts/finetune.py examples/openllama-3b/config.yml
+ ```
+
+ LoRA
+ ```shell
+ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
+ ```
+
+ QLoRA
+ ```shell
+ accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml
+ ```
examples/openllama-3b/config.yml ADDED
@@ -0,0 +1,61 @@
+ base_model: openlm-research/open_llama_3b
+ base_model_config: openlm-research/open_llama_3b
+ model_type: LlamaForCausalLM
+ tokenizer_type: LlamaTokenizer
+ load_in_8bit: false
+ load_in_4bit: false
+ strict: false
+ push_dataset_to_hub:
+ datasets:
+   - path: teknium/GPT4-LLM-Cleaned
+     type: alpaca
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.02
+ adapter:
+ lora_model_dir:
+ sequence_len: 256
+ max_packed_sequence_len:
+ lora_r:
+ lora_alpha:
+ lora_dropout:
+ lora_target_modules:
+ lora_target_linear:
+ lora_fan_in_fan_out:
+ wandb_project:
+ wandb_watch:
+ wandb_run_id:
+ wandb_log_model:
+ output_dir: ./lora-out
+ batch_size: 16
+ micro_batch_size: 4
+ num_epochs: 3
+ optimizer: adamw_bnb_8bit
+ torchdistx_path:
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+ train_on_inputs: false
+ group_by_length: false
+ bf16: false
+ fp16: true
+ tf32: false
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention:
+ gptq_groupsize:
+ gptq_model_v1:
+ warmup_steps: 10
+ eval_steps: 50
+ save_steps:
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   bos_token: "<s>"
+   eos_token: "</s>"
+   unk_token: "<unk>"
examples/{lora-openllama-3b/config.yml → openllama-3b/lora.yml} RENAMED
@@ -1,5 +1,5 @@
- base_model: openlm-research/open_llama_3b_600bt_preview
- base_model_config: openlm-research/open_llama_3b_600bt_preview
+ base_model: openlm-research/open_llama_3b
+ base_model_config: openlm-research/open_llama_3b
  model_type: LlamaForCausalLM
  tokenizer_type: LlamaTokenizer
  load_in_8bit: true
examples/{qlora-openllama-3b/config.yml → openllama-3b/qlora.yml} RENAMED
@@ -1,5 +1,5 @@
- base_model: openlm-research/open_llama_3b_600bt_preview
- base_model_config: openlm-research/open_llama_3b_600bt_preview
+ base_model: openlm-research/open_llama_3b
+ base_model_config: openlm-research/open_llama_3b
  model_type: LlamaForCausalLM
  tokenizer_type: LlamaTokenizer
  load_in_8bit: false
examples/qlora-openllama-3b/README.md DELETED
@@ -1,6 +0,0 @@
- # qlora-openllama-3b
-
- ```shell
- accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
-
- ```