Improve prepared dataset loading (add a configurable `dataset_prepared_path`), fix inference (call `model.eval()` before generation)
Browse files
configs/cerebras_1_3B_alpaca.yml
CHANGED
@@ -11,6 +11,7 @@ datasets:
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
|
|
14 |
val_set_size: 0.05
|
15 |
adapter: lora
|
16 |
sequence_len: 2048
|
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
14 |
+
dataset_prepared_path: data/last_run
|
15 |
val_set_size: 0.05
|
16 |
adapter: lora
|
17 |
sequence_len: 2048
|
configs/llama_65B_alpaca.yml
CHANGED
@@ -11,6 +11,7 @@ datasets:
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
|
|
14 |
val_set_size: 0.04
|
15 |
adapter: lora
|
16 |
lora_model_dir:
|
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
14 |
+
dataset_prepared_path: data/last_run
|
15 |
val_set_size: 0.04
|
16 |
adapter: lora
|
17 |
lora_model_dir:
|
configs/pythia_1_2B_alpaca.yml
CHANGED
@@ -11,6 +11,7 @@ datasets:
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
|
|
14 |
val_set_size: 0.05
|
15 |
adapter: lora
|
16 |
lora_model_dir:
|
|
|
11 |
type: gpteacher
|
12 |
- path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
13 |
type: gpteacher
|
14 |
+
dataset_prepared_path: data/last_run
|
15 |
val_set_size: 0.05
|
16 |
adapter: lora
|
17 |
lora_model_dir:
|
scripts/finetune.py
CHANGED
@@ -173,6 +173,8 @@ def do_inference(cfg, model, tokenizer):
|
|
173 |
input = ""
|
174 |
prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
|
175 |
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
|
|
|
|
|
176 |
with torch.no_grad():
|
177 |
generated = model.generate(inputs=batch["input_ids"],
|
178 |
do_sample=True, use_cache=True,
|
@@ -255,13 +257,12 @@ def train(
|
|
255 |
do_inference(cfg, model, tokenizer)
|
256 |
return
|
257 |
|
258 |
-
|
259 |
-
if not isinstance(cfg.datasets, list) and isinstance(cfg.datasets, str):
|
260 |
-
# assumption that we are loading a previously saved/cached dataset
|
261 |
print("Loading prepared dataset from disk...")
|
262 |
dataset = load_from_disk(cfg.datasets)
|
263 |
print("Prepared dataset loaded from disk...")
|
264 |
else:
|
|
|
265 |
for d in cfg.datasets:
|
266 |
ds: IterableDataset = load_dataset(
|
267 |
"json", data_files=d.path, streaming=True, split=None
|
@@ -291,8 +292,12 @@ def train(
|
|
291 |
dataset = Dataset.from_list(
|
292 |
[_ for _ in constant_len_dataset]
|
293 |
).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
|
|
|
294 |
print("Saving prepared dataset to disk...")
|
295 |
-
|
|
|
|
|
|
|
296 |
|
297 |
train_dataset = dataset["train"]
|
298 |
eval_dataset = dataset["test"]
|
|
|
173 |
input = ""
|
174 |
prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
|
175 |
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
|
176 |
+
|
177 |
+
model.eval()
|
178 |
with torch.no_grad():
|
179 |
generated = model.generate(inputs=batch["input_ids"],
|
180 |
do_sample=True, use_cache=True,
|
|
|
257 |
do_inference(cfg, model, tokenizer)
|
258 |
return
|
259 |
|
260 |
+
if cfg.dataset_prepared_path and any(Path(cfg.dataset_prepared_path).glob("*")):
|
|
|
|
|
261 |
print("Loading prepared dataset from disk...")
|
262 |
dataset = load_from_disk(cfg.datasets)
|
263 |
print("Prepared dataset loaded from disk...")
|
264 |
else:
|
265 |
+
datasets = []
|
266 |
for d in cfg.datasets:
|
267 |
ds: IterableDataset = load_dataset(
|
268 |
"json", data_files=d.path, streaming=True, split=None
|
|
|
292 |
dataset = Dataset.from_list(
|
293 |
[_ for _ in constant_len_dataset]
|
294 |
).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
|
295 |
+
|
296 |
print("Saving prepared dataset to disk...")
|
297 |
+
if cfg.dataset_prepared_path:
|
298 |
+
dataset.save_to_disk(cfg.dataset_prepared_path)
|
299 |
+
else:
|
300 |
+
dataset.save_to_disk("data/last_run")
|
301 |
|
302 |
train_dataset = dataset["train"]
|
303 |
eval_dataset = dataset["test"]
|