Merge pull request #221 from utensil/local_dataset
Browse files
[WIP] Support loading data files from a local directory
- src/axolotl/utils/data.py +20 -7
src/axolotl/utils/data.py
CHANGED
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
|
|
102 |
pass
|
103 |
|
104 |
# prefer local dataset, even if hub exists
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
elif ds_from_hub:
|
113 |
if d.data_files:
|
114 |
ds = load_dataset(
|
|
|
102 |
pass
|
103 |
|
104 |
# prefer local dataset, even if hub exists
|
105 |
+
local_path = Path(d.path)
|
106 |
+
if local_path.exists():
|
107 |
+
if local_path.is_dir():
|
108 |
+
ds = load_dataset(
|
109 |
+
d.path,
|
110 |
+
data_files=d.data_files,
|
111 |
+
streaming=False,
|
112 |
+
split=None,
|
113 |
+
)
|
114 |
+
elif local_path.is_file():
|
115 |
+
ds = load_dataset(
|
116 |
+
"json",
|
117 |
+
data_files=d.path,
|
118 |
+
streaming=False,
|
119 |
+
split=None,
|
120 |
+
)
|
121 |
+
else:
|
122 |
+
raise ValueError(
|
123 |
+
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
124 |
+
)
|
125 |
elif ds_from_hub:
|
126 |
if d.data_files:
|
127 |
ds = load_dataset(
|