Make sure to use the "train" split when loading a prepared dataset from the Hugging Face Hub
Browse files
src/axolotl/utils/data.py
CHANGED
@@ -58,6 +58,7 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
|
|
58 |
try:
|
59 |
if cfg.push_dataset_to_hub:
|
60 |
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
|
|
|
61 |
except:
|
62 |
pass
|
63 |
|
@@ -232,6 +233,7 @@ def load_prepare_datasets(tokenizer: PreTrainedTokenizerBase, cfg, default_datas
|
|
232 |
f"checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
233 |
)
|
234 |
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
|
|
|
235 |
except:
|
236 |
pass
|
237 |
|
|
|
58 |
try:
|
59 |
if cfg.push_dataset_to_hub:
|
60 |
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
|
61 |
+
dataset = dataset["train"]
|
62 |
except:
|
63 |
pass
|
64 |
|
|
|
233 |
f"checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
234 |
)
|
235 |
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
|
236 |
+
dataset = dataset["train"]
|
237 |
except:
|
238 |
pass
|
239 |
|