nroggendorff committed
Commit • 93fda42
1 Parent(s): 5ad337b
Update train.py
train.py
CHANGED
@@ -10,7 +10,7 @@ from tokenizers import ByteLevelBPETokenizer
 from torch.utils.data import DataLoader
 from torch.cuda.amp import autocast, GradScaler
 
-BATCH_SIZE =
+BATCH_SIZE = 8
 EPOCHS = 1
 LEARNING_RATE = 1e-4
 FACTOR = 768
@@ -20,7 +20,7 @@ INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
 INSTRUCT_DATASET = "nroggendorff/elephant"
 OUTPUT_REPO = "nroggendorff/smallama"
 INSTRUCT_FINETUNE_BOOL = False
-INIT = 0
+INIT = 0
 SHARD_SIZE = int(5e+5)
 FP16 = True
 WARMUP_STEPS = 1000
@@ -32,9 +32,9 @@ NUM_WORKERS = 4
 def load_data():
     if not INSTRUCT_FINETUNE_BOOL:
         dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-        dataset = dataset.
+        dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
     else:
-        dataset = load_dataset(INSTRUCT_DATASET, split="train"
+        dataset = load_dataset(INSTRUCT_DATASET, split="train")
     return dataset
 
 def create_tokenizer(training_corpus):
@@ -54,6 +54,10 @@ def create_tokenizer(training_corpus):
 def load_tokenizer():
     return AutoTokenizer.from_pretrained(OUTPUT_REPO)
 
+def get_training_corpus(dataset):
+    for example in dataset:
+        yield example['text']
+
 def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
@@ -129,41 +133,48 @@ def train_model(model, tokenizer, dataset, push, isinst):
         save_total_limit=2,
     )
 
-    dataset = dataset.shard(num_shards=len(dataset) // SHARD_SIZE, index=INIT)
-
     optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=WEIGHT_DECAY)
     scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
-        num_training_steps=(
+        num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
     )
-
-    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
 
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        num_workers=NUM_WORKERS
+    )
+
     for batch in dataloader:
         batch = format_prompts(batch, tokenizer, isinst)
-        trainer = trl.SFTTrainer(
-            model=model,
-            tokenizer=tokenizer,
-            args=args,
-            train_dataset=batch,
-            dataset_text_field='text',
-            max_seq_length=MAX_SEQ_LENGTH,
-            optimizers=(optimizer, scheduler)
-        )
-        trainer.train()
-
-    trained_model = trainer.model
-    trained_tokenizer = trainer.tokenizer
 
     if push:
         repo_id = OUTPUT_REPO + "-it" if INSTRUCT_FINETUNE_BOOL else OUTPUT_REPO
-        msg = "Training
-
-
+        msg = f"Training loss: {train.training_loss:.4f}"
+        model.push_to_hub(repo_id, commit_message=msg, force=True)
+        tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
     else:
-
-
+        model.save_pretrained("model")
+        tokenizer.save_pretrained("tokenizer")
+
+def main(push_to_hub=True, is_inst_finetune=False):
+    dataset = load_data()
+    if not is_inst_finetune and INIT == 0:
+        training_corpus = get_training_corpus(dataset)
+        tokenizer = create_tokenizer(training_corpus)
+    else:
+        tokenizer = load_tokenizer()
+
+    configure_tokenizer(tokenizer)
+
+    if is_inst_finetune:
+        model = load_model()
+        model.resize_token_embeddings(len(tokenizer))
+    else:
+        model = create_model(tokenizer) if INIT == 0 else load_model()
+
+    train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
 
 if __name__ == "__main__":
     main(PUSH_TO_HUB, INSTRUCT_FINETUNE_BOOL)
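For context on the pieces this commit adds: the new get_training_corpus generator just streams the 'text' field of each example, which is the shape that iterator-based training in the tokenizers library (ByteLevelBPETokenizer is imported at the top of train.py) expects. Below is a minimal, self-contained sketch of that pattern; it is not the repo's create_tokenizer (whose body lies outside the hunks above), and the toy corpus, vocab size, minimum frequency, and special tokens are assumptions for illustration only.

from tokenizers import ByteLevelBPETokenizer

def get_training_corpus(dataset):
    # Same shape as the helper added in this commit: yield the raw text field.
    for example in dataset:
        yield example['text']

# Hypothetical in-memory stand-in for the streamed dataset.
toy_dataset = [{"text": "hello world"}, {"text": "byte-level BPE example"}]

tok = ByteLevelBPETokenizer()
tok.train_from_iterator(
    get_training_corpus(toy_dataset),
    vocab_size=32000,                                   # assumed, not from train.py
    min_frequency=2,                                    # assumed
    special_tokens=["<s>", "</s>", "<pad>", "<unk>"],   # assumed
)
print(tok.encode("hello world").tokens)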