allow remote data paths (#1278)
Browse files
* allow remote data paths
* add docs about public url
* only allow https
* better docs
* better docs
- README.md +8 -0
- src/axolotl/utils/data.py +10 -0
README.md
CHANGED
@@ -468,6 +468,14 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|
468 |
dataset:
|
469 |
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
470 |
...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
471 |
```
|
472 |
|
473 |
- loading
|
|
|
468 |
dataset:
|
469 |
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
470 |
...
|
471 |
+
|
472 |
+
# Loading Data From a Public URL
|
473 |
+
# - URLs must use the HTTPS protocol, not HTTP, for security reasons.
|
474 |
+
# - The URL should be a direct link to the file you wish to load.
|
475 |
+
# - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
|
476 |
+
dataset:
|
477 |
+
- path: https://some.url.com/yourdata.jsonl # Must be a direct HTTPS link to a single data file.
|
478 |
+
ds_type: json # this is the default, see other options below.
|
479 |
```
|
480 |
|
481 |
- loading
|
src/axolotl/utils/data.py
CHANGED
@@ -336,6 +336,16 @@ def load_tokenized_prepared_datasets(
|
|
336 |
split=None,
|
337 |
storage_options=storage_options,
|
338 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
else:
|
340 |
if isinstance(config_dataset.data_files, str):
|
341 |
fp = hf_hub_download(
|
|
|
336 |
split=None,
|
337 |
storage_options=storage_options,
|
338 |
)
|
339 |
+
elif config_dataset.path.startswith("https://"):
|
340 |
+
ds_type = get_ds_type(config_dataset)
|
341 |
+
ds = load_dataset(
|
342 |
+
ds_type,
|
343 |
+
name=config_dataset.name,
|
344 |
+
data_files=config_dataset.path,
|
345 |
+
streaming=False,
|
346 |
+
split=None,
|
347 |
+
storage_options=storage_options,
|
348 |
+
)
|
349 |
else:
|
350 |
if isinstance(config_dataset.data_files, str):
|
351 |
fp = hf_hub_download(
|