Update data.py for signature generation (#851)
Browse files
* Update data.py
A change to the conversation formatting type should also trigger regeneration of the preprocessed dataset, so the conversation type should be part of the signature.
* chore: lint
---------
Co-authored-by: Wing Lian <wing.lian@gmail.com>
src/axolotl/utils/data.py
CHANGED
@@ -99,7 +99,12 @@ def load_tokenized_prepared_datasets(
|
|
99 |
str(cfg.sequence_len)
|
100 |
+ "@"
|
101 |
+ "|".join(
|
102 |
-
sorted(
|
|
|
|
|
|
|
|
|
|
|
103 |
)
|
104 |
+ "|"
|
105 |
+ tokenizer_name
|
|
|
99 |
str(cfg.sequence_len)
|
100 |
+ "@"
|
101 |
+ "|".join(
|
102 |
+
sorted(
|
103 |
+
[
|
104 |
+
f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
|
105 |
+
for d in cfg.datasets
|
106 |
+
]
|
107 |
+
)
|
108 |
)
|
109 |
+ "|"
|
110 |
+ tokenizer_name
|