ethanhs committed on
Commit
8fe0e63
1 Parent(s): d1236f2

Fix bug in dataset loading (#284)

Browse files

* Fix bug in dataset loading

This fixes a bug when loading datasets. `d.data_files` can be a list, which cannot be passed directly as the `filename` argument of `hf_hub_download`; each listed file has to be downloaded with its own call.

* Check type of data_files, and load accordingly

Files changed (1) hide show
  1. src/axolotl/utils/data.py +20 -5
src/axolotl/utils/data.py CHANGED
@@ -205,11 +205,26 @@ def load_tokenized_prepared_datasets(
205
  use_auth_token=use_auth_token,
206
  )
207
  else:
208
- fp = hf_hub_download(
209
- repo_id=d.path,
210
- repo_type="dataset",
211
- filename=d.data_files,
212
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  ds = load_dataset(
214
  "json", name=d.name, data_files=fp, streaming=False, split=None
215
  )
 
205
  use_auth_token=use_auth_token,
206
  )
207
  else:
208
+ if isinstance(d.data_files, str):
209
+ fp = hf_hub_download(
210
+ repo_id=d.path,
211
+ repo_type="dataset",
212
+ filename=d.data_files,
213
+ )
214
+ elif isinstance(d.data_files, list):
215
+ fp = []
216
+ for file in d.data_files:
217
+ fp.append(
218
+ hf_hub_download(
219
+ repo_id=d.path,
220
+ repo_type="dataset",
221
+ filename=file,
222
+ )
223
+ )
224
+ else:
225
+ raise ValueError(
226
+ "data_files must be either a string or list of strings"
227
+ )
228
  ds = load_dataset(
229
  "json", name=d.name, data_files=fp, streaming=False, split=None
230
  )