Update train.py
train.py CHANGED
@@ -1,67 +1,62 @@
 import os
-
 import torch
 import trl
-
-
+from transformers import (
+    AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
+    TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
+)
 from datasets import load_dataset, Dataset
 from tokenizers import ByteLevelBPETokenizer
+from torch.utils.data import DataLoader
+from torch.cuda.amp import autocast, GradScaler
 
-BATCH_SIZE =
-EPOCHS =
-LEARNING_RATE =
-FACTOR =
+BATCH_SIZE = 64
+EPOCHS = 3
+LEARNING_RATE = 1e-4
+FACTOR = 768
 MAX_SEQ_LENGTH = 128
-VOCAB_SIZE =
+VOCAB_SIZE = 32000
 INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
 INSTRUCT_DATASET = "nroggendorff/elephant"
 OUTPUT_REPO = "nroggendorff/smallama"
 INSTRUCT_FINETUNE_BOOL = False
-INIT = 0#/
-SHARD_SIZE = int(
+INIT = 0#/15
+SHARD_SIZE = int(5e+5)
 FP16 = True
-WARMUP_STEPS =
-
-GRADIENT_ACCUMULATION_STEPS = 4
+WARMUP_STEPS = 1000
+WEIGHT_DECAY = 0.01
+GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // 4
 PUSH_TO_HUB = True
+NUM_WORKERS = 4
 
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
         dataset = Dataset.from_generator(lambda: dataset.take(int(8e+6)))
-        # dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
     else:
-        dataset = load_dataset(INSTRUCT_DATASET, split="train")
-        # dataset = Dataset.from_generator(lambda: dataset.take(int(5e+6)))
+        dataset = load_dataset(INSTRUCT_DATASET, split="train")
     return dataset
 
 def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
     if INSTRUCT_FINETUNE_BOOL:
-        special_tokens.
+        special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])
     tokenizer.train_from_iterator(
         training_corpus,
         vocab_size=VOCAB_SIZE,
         min_frequency=2,
         special_tokens=special_tokens
     )
-
     fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
     return fast_tokenizer
 
 def load_tokenizer():
-
-    return tokenizer
+    return AutoTokenizer.from_pretrained(OUTPUT_REPO)
 
 def get_training_corpus(dataset):
-
-
-    # texts.extend(dataset[field]['text'])
-    texts.extend(dataset['text'])
-
-    for i in range(0, len(texts), 1000):
-        yield texts[i : i + 1000]
+    for i in range(0, len(dataset['text']), 1000):
+        yield dataset['text'][i : i + 1000]
 
 def format_prompts(examples, tokenizer, isinst):
     texts = []

@@ -85,10 +80,10 @@ def create_model(tokenizer):
         vocab_size=tokenizer.vocab_size,
         hidden_size=FACTOR,
         intermediate_size=FACTOR * 4,
-        num_hidden_layers=
-        num_attention_heads=
+        num_hidden_layers=12,
+        num_attention_heads=12,
         max_position_embeddings=MAX_SEQ_LENGTH,
-        rms_norm_eps=1e-
+        rms_norm_eps=1e-5,
         initializer_range=0.02,
         use_cache=True,
         pad_token_id=tokenizer.pad_token_id,

@@ -96,13 +91,10 @@ def create_model(tokenizer):
         eos_token_id=tokenizer.eos_token_id,
         tie_word_embeddings=False,
     )
-
-    model = LlamaForCausalLM(config)
-    return model
+    return LlamaForCausalLM(config)
 
 def load_model():
-
-    return model
+    return AutoModelForCausalLM.from_pretrained(OUTPUT_REPO)
 
 def configure_tokenizer(tokenizer):
     special_tokens = {

@@ -131,16 +123,19 @@ def train_model(model, tokenizer, dataset, push, isinst):
         learning_rate=LEARNING_RATE,
         optim="adamw_torch",
         warmup_steps=WARMUP_STEPS,
-        weight_decay=
+        weight_decay=WEIGHT_DECAY,
         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
         fp16=FP16,
         save_steps=int(1e+10),
-        logging_steps=10
+        logging_steps=10,
+        evaluation_strategy="steps",
+        eval_steps=500,
+        save_total_limit=2,
     )
 
     dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
 
-    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
+    optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=WEIGHT_DECAY)
     scheduler = get_cosine_schedule_with_warmup(
         optimizer,
         num_warmup_steps=args.warmup_steps,

@@ -165,11 +160,8 @@ def train_model(model, tokenizer, dataset, push, isinst):
     trained_tokenizer = trainer.tokenizer
 
     if push:
-        if INSTRUCT_FINETUNE_BOOL
-
-        else:
-            repo_id = OUTPUT_REPO
-        msg = str(train.training_loss)
+        repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
+        msg = f"Training loss: {train.training_loss:.4f}"
         trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
         trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:

@@ -178,26 +170,22 @@ def train_model(model, tokenizer, dataset, push, isinst):
 
 def main(push_to_hub=True, is_inst_finetune=False):
     dataset = load_data()
-    if not is_inst_finetune:
-
-
-            tokenizer = create_tokenizer(training_corpus)
-        else:
-            tokenizer = load_tokenizer()
+    if not is_inst_finetune and INIT == 0:
+        training_corpus = get_training_corpus(dataset)
+        tokenizer = create_tokenizer(training_corpus)
     else:
         tokenizer = load_tokenizer()
+
     configure_tokenizer(tokenizer)
+
     if is_inst_finetune:
         model = load_model()
         model.resize_token_embeddings(len(tokenizer))
-        train_model(model, tokenizer, dataset, push_to_hub, True)
     else:
-        if INIT == 0
-
-
-            model = load_model()
-        train_model(model, tokenizer, dataset, push_to_hub, False)
+        model = create_model(tokenizer) if INIT == 0 else load_model()
+
+    train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
 
 if __name__ == "__main__":
     main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
-    raise
+    raise Exception("Done baking!")