nroggendorff committed on
Commit 632f592 · verified · 1 Parent(s): 93dce4c

Update train.py

Files changed (1)
  1. train.py +44 -56
train.py CHANGED
@@ -1,67 +1,62 @@
  import os
-
  import torch
  import trl
-
- from transformers import AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM, TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
+ from transformers import (
+     AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
+     TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
+ )
  from datasets import load_dataset, Dataset
  from tokenizers import ByteLevelBPETokenizer
+ from torch.utils.data import DataLoader
+ from torch.cuda.amp import autocast, GradScaler

- BATCH_SIZE = 32
- EPOCHS = 1
- LEARNING_RATE = 5e-4
- FACTOR = 22 * 35
+ BATCH_SIZE = 64
+ EPOCHS = 3
+ LEARNING_RATE = 1e-4
+ FACTOR = 768
  MAX_SEQ_LENGTH = 128
- VOCAB_SIZE = 52000
+ VOCAB_SIZE = 32000
  INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
  INSTRUCT_DATASET = "nroggendorff/elephant"
  OUTPUT_REPO = "nroggendorff/smallama"
  INSTRUCT_FINETUNE_BOOL = False
- INIT = 0#/3
- SHARD_SIZE = int(2e+6)
+ INIT = 0#/15
+ SHARD_SIZE = int(5e+5)
  FP16 = True
- WARMUP_STEPS = 0
- DECAY = 0
- GRADIENT_ACCUMULATION_STEPS = 4
+ WARMUP_STEPS = 1000
+ WEIGHT_DECAY = 0.01
+ GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // 4
  PUSH_TO_HUB = True
+ NUM_WORKERS = 4

  def load_data():
      if not INSTRUCT_FINETUNE_BOOL:
          dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
          dataset = Dataset.from_generator(lambda: dataset.take(int(8e+6)))
-         # dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
      else:
-         dataset = load_dataset(INSTRUCT_DATASET, split="train")#, streaming=True)
-         # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
+         dataset = load_dataset(INSTRUCT_DATASET, split="train")
      return dataset

  def create_tokenizer(training_corpus):
      tokenizer = ByteLevelBPETokenizer()
      special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
      if INSTRUCT_FINETUNE_BOOL:
-         special_tokens.append(["<|user|>", "<|bot|>", "<|end|>"])
+         special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])
      tokenizer.train_from_iterator(
          training_corpus,
          vocab_size=VOCAB_SIZE,
          min_frequency=2,
          special_tokens=special_tokens
      )
-
      fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
      return fast_tokenizer

  def load_tokenizer():
-     tokenizer = AutoTokenizer.from_pretrained(OUTPUT_REPO)
-     return tokenizer
+     return AutoTokenizer.from_pretrained(OUTPUT_REPO)

  def get_training_corpus(dataset):
-     texts = []
-     #for field in ['pretrain', 'instruct']:
-     #    texts.extend(dataset[field]['text'])
-     texts.extend(dataset['text'])
-
-     for i in range(0, len(texts), 1000):
-         yield texts[i : i + 1000]
+     for i in range(0, len(dataset['text']), 1000):
+         yield dataset['text'][i : i + 1000]

  def format_prompts(examples, tokenizer, isinst):
      texts = []
@@ -85,10 +80,10 @@ def create_model(tokenizer):
          vocab_size=tokenizer.vocab_size,
          hidden_size=FACTOR,
          intermediate_size=FACTOR * 4,
-         num_hidden_layers=max(1, FACTOR // 32),
-         num_attention_heads=max(1, FACTOR // 64),
+         num_hidden_layers=12,
+         num_attention_heads=12,
          max_position_embeddings=MAX_SEQ_LENGTH,
-         rms_norm_eps=1e-6,
+         rms_norm_eps=1e-5,
          initializer_range=0.02,
          use_cache=True,
          pad_token_id=tokenizer.pad_token_id,
@@ -96,13 +91,10 @@ def create_model(tokenizer):
          eos_token_id=tokenizer.eos_token_id,
          tie_word_embeddings=False,
      )
-
-     model = LlamaForCausalLM(config)
-     return model
+     return LlamaForCausalLM(config)

  def load_model():
-     model = AutoModelForCausalLM.from_pretrained(OUTPUT_REPO)
-     return model
+     return AutoModelForCausalLM.from_pretrained(OUTPUT_REPO)

  def configure_tokenizer(tokenizer):
      special_tokens = {
@@ -131,16 +123,19 @@ def train_model(model, tokenizer, dataset, push, isinst):
          learning_rate=LEARNING_RATE,
          optim="adamw_torch",
          warmup_steps=WARMUP_STEPS,
-         weight_decay=DECAY,
+         weight_decay=WEIGHT_DECAY,
          gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
          fp16=FP16,
          save_steps=int(1e+10),
-         logging_steps=10
+         logging_steps=10,
+         evaluation_strategy="steps",
+         eval_steps=500,
+         save_total_limit=2,
      )

      dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)

-     optimizer = AdamW(model.parameters(), lr=args.learning_rate)
+     optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=WEIGHT_DECAY)
      scheduler = get_cosine_schedule_with_warmup(
          optimizer,
          num_warmup_steps=args.warmup_steps,
@@ -165,11 +160,8 @@ def train_model(model, tokenizer, dataset, push, isinst):
      trained_tokenizer = trainer.tokenizer

      if push:
-         if INSTRUCT_FINETUNE_BOOL:
-             repo_id = OUTPUT_REPO + "-it"
-         else:
-             repo_id = OUTPUT_REPO
-         msg = str(train.training_loss)
+         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
+         msg = f"Training loss: {train.training_loss:.4f}"
          trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
          trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
      else:
@@ -178,26 +170,22 @@ def train_model(model, tokenizer, dataset, push, isinst):

  def main(push_to_hub=True, is_inst_finetune=False):
      dataset = load_data()
-     if not is_inst_finetune:
-         if INIT == 0:
-             training_corpus = get_training_corpus(dataset)
-             tokenizer = create_tokenizer(training_corpus)
-         else:
-             tokenizer = load_tokenizer()
+     if not is_inst_finetune and INIT == 0:
+         training_corpus = get_training_corpus(dataset)
+         tokenizer = create_tokenizer(training_corpus)
      else:
          tokenizer = load_tokenizer()
+
      configure_tokenizer(tokenizer)
+
      if is_inst_finetune:
          model = load_model()
          model.resize_token_embeddings(len(tokenizer))
-         train_model(model, tokenizer, dataset, push_to_hub, True)
      else:
-         if INIT == 0:
-             model = create_model(tokenizer)
-         else:
-             model = load_model()
-         train_model(model, tokenizer, dataset, push_to_hub, False)
+         model = create_model(tokenizer) if INIT == 0 else load_model()
+
+     train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)

  if __name__ == "__main__":
      main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
-     raise RuntimeError("The script is finished.")
+     raise Exception("Done baking!")
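For context on the hyperparameter and optimizer changes above, here is a minimal, hypothetical sketch (not part of the commit) that rebuilds the new model configuration from the updated constants and wires the AdamW + cosine-warmup pair the way a `transformers.Trainer`-style class (such as trl's SFTTrainer) would receive it via its `optimizers` argument. The value of `total_steps` is an assumption for illustration; in train.py it would be derived from the dataset length, batch size, and EPOCHS, and the actual trainer construction sits in unchanged lines outside the hunks shown.

```python
# Hypothetical sketch, not part of the commit: mirrors the updated constants
# to show how the new AdamW settings and cosine warmup schedule fit together.
import torch
from transformers import LlamaConfig, LlamaForCausalLM, get_cosine_schedule_with_warmup

FACTOR = 768          # new hidden size
MAX_SEQ_LENGTH = 128
VOCAB_SIZE = 32000    # new tokenizer vocab size
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 1000

# Same shape as the updated create_model(): 768-wide, 12 layers, 12 heads.
config = LlamaConfig(
    vocab_size=VOCAB_SIZE,
    hidden_size=FACTOR,
    intermediate_size=FACTOR * 4,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=MAX_SEQ_LENGTH,
    rms_norm_eps=1e-5,
)
model = LlamaForCausalLM(config)
print(f"parameters: {model.num_parameters() / 1e6:.1f}M")  # rough size check

# The commit imports AdamW from transformers; torch.optim.AdamW behaves the same here.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

total_steps = 10_000  # assumption: compute from dataset size, batch size, and EPOCHS
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

# A Trainer subclass would typically take the pair as:
#   trainer = SFTTrainer(..., optimizers=(optimizer, scheduler))
```

With this schedule the learning rate ramps up over the first 1000 optimizer steps and then decays along a cosine curve toward zero over the remaining steps, which is why the commit also raises WARMUP_STEPS from 0 to 1000.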