Update train.py
train.py
CHANGED
@@ -78,18 +78,22 @@ def get_training_corpus(dataset):
 def format_prompts(examples, tokenizer, isinst):
     texts = []
     for text in examples['text']:
-        if isinst:
-            conversation = []
-            parts = text.split('<|end|>')
-            for i in range(0, len(parts) - 1, 2):
-                prompt = parts[i].replace("<|user|>", "").strip()
-                response = parts[i + 1].replace("<|bot|>", "").strip()
-                conversation.append({"role": "user", "content": prompt})
-                conversation.append({"role": "assistant", "content": response})
-            formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
-            texts.append(formatted_conversation)
+        if text:
+            if isinst:
+                conversation = []
+                parts = text.split('<|end|>')
+                for i in range(0, len(parts) - 1, 2):
+                    prompt = parts[i].replace("<|user|>", "").strip()
+                    response = parts[i + 1].replace("<|bot|>", "").strip()
+                    conversation.append({"role": "user", "content": prompt})
+                    conversation.append({"role": "assistant", "content": response})
+                formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
+                texts.append(formatted_conversation)
+            else:
+                texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
         else:
-            texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
+            print('Found empty entry in examples. Moving on..')
+            continue
     return {"text": texts}
 
 def create_model(tokenizer):
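Note: the instruct branch splits each record on <|end|> and rebuilds it as alternating user/assistant turns before applying the tokenizer's chat template. A minimal sketch of that parsing, with a made-up sample string and tags assumed purely for illustration (the real code hands the resulting list to tokenizer.apply_chat_template):

# Illustration only: mirrors the split/replace logic above on a made-up record.
sample = "<|user|> What is 2+2? <|end|><|bot|> 4 <|end|>"

parts = sample.split('<|end|>')
conversation = []
for i in range(0, len(parts) - 1, 2):
    prompt = parts[i].replace("<|user|>", "").strip()
    response = parts[i + 1].replace("<|bot|>", "").strip()
    conversation.append({"role": "user", "content": prompt})
    conversation.append({"role": "assistant", "content": response})

print(conversation)
# [{'role': 'user', 'content': 'What is 2+2?'}, {'role': 'assistant', 'content': '4'}]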
@@ -167,7 +171,7 @@ def train_model(model, tokenizer, dataset, push, isinst):
         fp16=FP16,
         save_steps=WARMUP_STEPS,
         logging_steps=WARMUP_STEPS,
-
+        eval_strategy="no",
         eval_steps=1,
         save_total_limit=2,
     )
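Note: eval_strategy="no" turns periodic evaluation off entirely (assuming a transformers release recent enough to spell the argument eval_strategy rather than evaluation_strategy), so the eval_steps=1 that follows is effectively inert. A stripped-down sketch with placeholder values, not the script's real settings:

# Sketch only: output_dir and the values below are placeholders.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    eval_strategy="no",   # never run evaluation during training
    eval_steps=1,         # ignored while eval_strategy="no"
    save_total_limit=2,   # keep at most two checkpoints on disk
)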
@@ -180,7 +184,7 @@ def train_model(model, tokenizer, dataset, push, isinst):
     )
 
     dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
-    print("Mapped dataset sample:", dataset[0]['text'])
+    print("Mapped dataset sample length:", len(dataset[0]['text']))
 
     trainer = trl.SFTTrainer(
         model=model,