winglian committed on
Commit
b164725
1 Parent(s): 937f44f

improve prepared dataset loading, fix inference

Browse files
configs/cerebras_1_3B_alpaca.yml CHANGED
@@ -11,6 +11,7 @@ datasets:
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
 
14
  val_set_size: 0.05
15
  adapter: lora
16
  sequence_len: 2048
 
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
14
+ dataset_prepared_path: data/last_run
15
  val_set_size: 0.05
16
  adapter: lora
17
  sequence_len: 2048
configs/llama_65B_alpaca.yml CHANGED
@@ -11,6 +11,7 @@ datasets:
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
 
14
  val_set_size: 0.04
15
  adapter: lora
16
  lora_model_dir:
 
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
14
+ dataset_prepared_path: data/last_run
15
  val_set_size: 0.04
16
  adapter: lora
17
  lora_model_dir:
configs/pythia_1_2B_alpaca.yml CHANGED
@@ -11,6 +11,7 @@ datasets:
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
 
14
  val_set_size: 0.05
15
  adapter: lora
16
  lora_model_dir:
 
11
  type: gpteacher
12
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
13
  type: gpteacher
14
+ dataset_prepared_path: data/last_run
15
  val_set_size: 0.05
16
  adapter: lora
17
  lora_model_dir:
scripts/finetune.py CHANGED
@@ -173,6 +173,8 @@ def do_inference(cfg, model, tokenizer):
173
  input = ""
174
  prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
175
  batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
 
 
176
  with torch.no_grad():
177
  generated = model.generate(inputs=batch["input_ids"],
178
  do_sample=True, use_cache=True,
@@ -255,13 +257,12 @@ def train(
255
  do_inference(cfg, model, tokenizer)
256
  return
257
 
258
- datasets = []
259
- if not isinstance(cfg.datasets, list) and isinstance(cfg.datasets, str):
260
- # assumption that we are loading a previously saved/cached dataset
261
  print("Loading prepared dataset from disk...")
262
  dataset = load_from_disk(cfg.datasets)
263
  print("Prepared dataset loaded from disk...")
264
  else:
 
265
  for d in cfg.datasets:
266
  ds: IterableDataset = load_dataset(
267
  "json", data_files=d.path, streaming=True, split=None
@@ -291,8 +292,12 @@ def train(
291
  dataset = Dataset.from_list(
292
  [_ for _ in constant_len_dataset]
293
  ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
 
294
  print("Saving prepared dataset to disk...")
295
- dataset.save_to_disk("data/last_run")
 
 
 
296
 
297
  train_dataset = dataset["train"]
298
  eval_dataset = dataset["test"]
 
173
  input = ""
174
  prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
175
  batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
176
+
177
+ model.eval()
178
  with torch.no_grad():
179
  generated = model.generate(inputs=batch["input_ids"],
180
  do_sample=True, use_cache=True,
 
257
  do_inference(cfg, model, tokenizer)
258
  return
259
 
260
+ if cfg.dataset_prepared_path and any(Path(cfg.dataset_prepared_path).glob("*")):
 
 
261
  print("Loading prepared dataset from disk...")
262
  dataset = load_from_disk(cfg.datasets)
263
  print("Prepared dataset loaded from disk...")
264
  else:
265
+ datasets = []
266
  for d in cfg.datasets:
267
  ds: IterableDataset = load_dataset(
268
  "json", data_files=d.path, streaming=True, split=None
 
292
  dataset = Dataset.from_list(
293
  [_ for _ in constant_len_dataset]
294
  ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
295
+
296
  print("Saving prepared dataset to disk...")
297
+ if cfg.dataset_prepared_path:
298
+ dataset.save_to_disk(cfg.dataset_prepared_path)
299
+ else:
300
+ dataset.save_to_disk("data/last_run")
301
 
302
  train_dataset = dataset["train"]
303
  eval_dataset = dataset["test"]