hamel committed on
Commit
91cf4ee
1 Parent(s): 1daecd1

allow remote data paths (#1278)

Browse files

* allow remote data paths

* add docs about public url

* only allow https

* better docs

* better docs

Files changed (2) hide show
  1. README.md +8 -0
  2. src/axolotl/utils/data.py +10 -0
README.md CHANGED
@@ -468,6 +468,14 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
468
  dataset:
469
  - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
470
  ...
 
 
 
 
 
 
 
 
471
  ```
472
 
473
  - loading
 
468
  dataset:
469
  - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
470
  ...
471
+
472
+ # Loading Data From a Public URL
473
+ # - URLs must use HTTPS protocol for security reasons, not HTTP.
474
+ # - The URL should be a direct link to the file you wish to load.
475
+ # - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
476
+ dataset:
477
+ - path: https://some.url.com/yourdata.jsonl # Must be a direct HTTPS link to the file itself, not a folder.
478
+ ds_type: json # this is the default, see other options below.
479
  ```
480
 
481
  - loading
src/axolotl/utils/data.py CHANGED
@@ -336,6 +336,16 @@ def load_tokenized_prepared_datasets(
336
  split=None,
337
  storage_options=storage_options,
338
  )
 
 
 
 
 
 
 
 
 
 
339
  else:
340
  if isinstance(config_dataset.data_files, str):
341
  fp = hf_hub_download(
 
336
  split=None,
337
  storage_options=storage_options,
338
  )
339
+ elif config_dataset.path.startswith("https://"):
340
+ ds_type = get_ds_type(config_dataset)
341
+ ds = load_dataset(
342
+ ds_type,
343
+ name=config_dataset.name,
344
+ data_files=config_dataset.path,
345
+ streaming=False,
346
+ split=None,
347
+ storage_options=storage_options,
348
+ )
349
  else:
350
  if isinstance(config_dataset.data_files, str):
351
  fp = hf_hub_download(