Nanobit committed on
Commit
1e56b88
·
unverified ·
1 Parent(s): 7570446

fix(preprocess): Make sure dataset not loaded from cache when using preprocess cli (#1136)

Browse files
Files changed (1) hide show
  1. src/axolotl/utils/data.py +10 -2
src/axolotl/utils/data.py CHANGED
@@ -152,7 +152,11 @@ def load_tokenized_prepared_datasets(
152
 
153
  if dataset:
154
  ...
155
- elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
 
 
 
 
156
  LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
157
  dataset = load_from_disk(str(prepared_ds_path))
158
  LOG.info("Prepared dataset loaded from disk...")
@@ -465,7 +469,11 @@ def load_prepare_datasets(
465
 
466
  if dataset:
467
  ...
468
- elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
 
 
 
 
469
  LOG.info(
470
  f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
471
  )
 
152
 
153
  if dataset:
154
  ...
155
+ elif (
156
+ cfg.dataset_prepared_path
157
+ and any(prepared_ds_path.glob("*"))
158
+ and not cfg.is_preprocess
159
+ ):
160
  LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
161
  dataset = load_from_disk(str(prepared_ds_path))
162
  LOG.info("Prepared dataset loaded from disk...")
 
469
 
470
  if dataset:
471
  ...
472
+ elif (
473
+ cfg.dataset_prepared_path
474
+ and any(prepared_ds_path.glob("*"))
475
+ and not cfg.is_preprocess
476
+ ):
477
  LOG.info(
478
  f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
479
  )