Fix local path loading and custom strategy type
Browse files
README.md
CHANGED
@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|
237 |
#### How to add custom prompts
|
238 |
|
239 |
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
240 |
-
2. Use your custom file name as the dataset type
|
241 |
|
242 |
Optionally, download some datasets, see [data/README.md](data/README.md)
|
243 |
|
@@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
|
255 |
|
256 |
- dataset
|
257 |
```yaml
|
|
|
|
|
|
|
258 |
datasets:
|
259 |
-
- path: vicgalle/alpaca-gpt4
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
type: alpaca # format from earlier
|
261 |
-
sequence_len: 2048 # max token length / prompt
|
262 |
```
|
263 |
|
264 |
- loading
|
@@ -328,10 +336,10 @@ tf32: true # require >=ampere
|
|
328 |
|
329 |
# a list of one or more datasets to finetune the model with
|
330 |
datasets:
|
331 |
-
#
|
332 |
- path: vicgalle/alpaca-gpt4
|
333 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
334 |
-
type: alpaca # format
|
335 |
data_files: # path to source data files
|
336 |
shards: # number of shards to split data into
|
337 |
|
|
|
237 |
#### How to add custom prompts
|
238 |
|
239 |
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). See the other files there for examples.
|
240 |
+
2. Use your custom file name as the dataset type, in the form `<prompt_strategies_file>.load_<load_fn>`.
|
241 |
|
242 |
Optionally, download some datasets, see [data/README.md](data/README.md)
|
243 |
|
|
|
255 |
|
256 |
- dataset
|
257 |
```yaml
|
258 |
+
sequence_len: 2048 # max token length for prompt
|
259 |
+
|
260 |
+
# huggingface repo
|
261 |
datasets:
|
262 |
+
- path: vicgalle/alpaca-gpt4
|
263 |
+
type: alpaca # format from earlier
|
264 |
+
|
265 |
+
# local
|
266 |
+
datasets:
|
267 |
+
- path: json
|
268 |
+
data_files: data.jsonl # or json
|
269 |
type: alpaca # format from earlier
|
|
|
270 |
```
|
271 |
|
272 |
- loading
|
|
|
336 |
|
337 |
# a list of one or more datasets to finetune the model with
|
338 |
datasets:
|
339 |
+
# HF dataset repo, or "json" for a local dataset (make sure to fill in data_files)
|
340 |
- path: vicgalle/alpaca-gpt4
|
341 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
342 |
+
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
343 |
data_files: # path to source data files
|
344 |
shards: # number of shards to split data into
|
345 |
|