nroggendorff committed on
Commit
aa518eb
·
verified ·
1 Parent(s): 534bbd6

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +17 -13
train.py CHANGED
@@ -78,18 +78,22 @@ def get_training_corpus(dataset):
78
  def format_prompts(examples, tokenizer, isinst):
79
  texts = []
80
  for text in examples['text']:
81
- if isinst:
82
- conversation = []
83
- parts = text.split('<|end|>')
84
- for i in range(0, len(parts) - 1, 2):
85
- prompt = parts[i].replace("<|user|>", "").strip()
86
- response = parts[i + 1].replace("<|bot|>", "").strip()
87
- conversation.append({"role": "user", "content": prompt})
88
- conversation.append({"role": "assistant", "content": response})
89
- formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
90
- texts.append(formatted_conversation)
 
 
 
91
  else:
92
- texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
 
93
  return {"text": texts}
94
 
95
  def create_model(tokenizer):
@@ -167,7 +171,7 @@ def train_model(model, tokenizer, dataset, push, isinst):
167
  fp16=FP16,
168
  save_steps=WARMUP_STEPS,
169
  logging_steps=WARMUP_STEPS,
170
- evaluation_strategy="no",
171
  eval_steps=1,
172
  save_total_limit=2,
173
  )
@@ -180,7 +184,7 @@ def train_model(model, tokenizer, dataset, push, isinst):
180
  )
181
 
182
  dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
183
- print("Mapped dataset sample:", dataset[0]['text'])
184
 
185
  trainer = trl.SFTTrainer(
186
  model=model,
 
78
  def format_prompts(examples, tokenizer, isinst):
79
  texts = []
80
  for text in examples['text']:
81
+ if text:
82
+ if isinst:
83
+ conversation = []
84
+ parts = text.split('<|end|>')
85
+ for i in range(0, len(parts) - 1, 2):
86
+ prompt = parts[i].replace("<|user|>", "").strip()
87
+ response = parts[i + 1].replace("<|bot|>", "").strip()
88
+ conversation.append({"role": "user", "content": prompt})
89
+ conversation.append({"role": "assistant", "content": response})
90
+ formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
91
+ texts.append(formatted_conversation)
92
+ else:
93
+ texts.append(tokenizer.bos_token + text + tokenizer.eos_token)
94
  else:
95
+ print('Found empty entry in examples. Moving on..')
96
+ continue
97
  return {"text": texts}
98
 
99
  def create_model(tokenizer):
 
171
  fp16=FP16,
172
  save_steps=WARMUP_STEPS,
173
  logging_steps=WARMUP_STEPS,
174
+ eval_strategy="no",
175
  eval_steps=1,
176
  save_total_limit=2,
177
  )
 
184
  )
185
 
186
  dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
187
+ print("Mapped dataset sample length:", len(dataset[0]['text']))
188
 
189
  trainer = trl.SFTTrainer(
190
  model=model,