Add CompletionPrompt type
- src/axolotl/prompt_tokenizers.py +19 -0
- src/axolotl/prompters.py +11 -0
- src/axolotl/utils/data.py +15 -2
src/axolotl/prompt_tokenizers.py
CHANGED
@@ -125,6 +125,25 @@ class NomicGPT4AllPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
         )
 
 
+class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
+    def parse_instruction_fields(self, prompt) -> (str):
+        return (
+            prompt["text"]
+        )
+
+    def tokenize_prompt(self, prompt):
+        text = self.parse_instruction_fields(prompt)
+        full_prompt = self._build_full_prompt(text)
+        tokenized_full_prompt = self._tokenize(full_prompt)
+
+        return tokenized_full_prompt
+
+    def _build_full_prompt(self, text):
+        return self.prompter.build_prompt(
+            text
+        )
+
+
 class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
     def parse_instruction_fields(self, prompt) -> (str, str, str, str, str):
         raise NotImplementedError
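For context, here is a minimal sketch of how the new strategy could be exercised on a single dataset row. The tokenizer/model choice and the expected output keys are assumptions for illustration, not part of this commit.

# Illustrative only: the model/tokenizer below is an assumed choice, not part of this change.
from transformers import AutoTokenizer

from axolotl.prompt_tokenizers import CompletionPromptTokenizingStrategy
from axolotl.prompters import CompletionPrompter

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # hypothetical base model

strategy = CompletionPromptTokenizingStrategy(
    CompletionPrompter(),  # pass-through prompter (see prompters.py below)
    tokenizer,
    True,   # train_on_inputs
    2048,   # sequence_len
)

# Completion-style rows carry only a "text" field; there is no instruction/input/output split.
row = {"text": "The quick brown fox jumps over the lazy dog."}
tokenized = strategy.tokenize_prompt(row)

# The exact keys come from the base strategy's _tokenize helper;
# typically input_ids, attention_mask, and labels.
print(sorted(tokenized.keys()))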
src/axolotl/prompters.py
CHANGED
@@ -35,6 +35,17 @@ class JeopardyPrompter(AlpacaPrompter):
     prompt_input = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
 
 
+class CompletionPrompter(AlpacaPrompter):
+    def build_prompt(
+        self,
+        text: str
+    ) -> str:
+        return text
+
+    def get_response(self, output: str) -> str:
+        return output.strip()
+
+
 class GPTeacherPrompter(AlpacaPrompter):
     ...
 
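A quick sketch of the prompter's behaviour, assuming axolotl is installed so the new class is importable: build_prompt is an identity pass-through (no instruction template is applied) and get_response only strips surrounding whitespace.

from axolotl.prompters import CompletionPrompter

prompter = CompletionPrompter()

# The raw text is the prompt; nothing is wrapped around it.
assert prompter.build_prompt("def add(a, b):\n    return a + b\n") == "def add(a, b):\n    return a + b\n"

# Responses are returned as-is apart from surrounding whitespace.
assert prompter.get_response("  42\n") == "42"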
src/axolotl/utils/data.py
CHANGED
@@ -11,13 +11,17 @@ from axolotl.prompt_tokenizers import (
     GPTeacherPromptTokenizingStrategy,
     OpenAssistantPromptTokenizingStrategy,
     AlpacaReflectionPTStrategy,
-    ShareGPTPromptTokenizingStrategy,
+    ShareGPTPromptTokenizingStrategy,
+    JeopardyPromptTokenizingStrategy,
+    CompletionPromptTokenizingStrategy,
 )
 from axolotl.prompters import (
     AlpacaPrompter,
     GPTeacherPrompter,
     ReflectAlpacaPrompter,
-    ShareGPTPrompter,
+    ShareGPTPrompter,
+    JeopardyPrompter,
+    CompletionPrompter,
 )
 
 
@@ -118,6 +122,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             )
             ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
             datasets.append(ds_wrapper)
+        elif d.type == "completion":
+            ds_strategy = CompletionPromptTokenizingStrategy(
+                CompletionPrompter(),
+                tokenizer,
+                cfg.train_on_inputs,
+                cfg.sequence_len,
+            )
+            ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
+            datasets.append(ds_wrapper)
         else:
             logging.error(f"unhandled prompt tokenization strategy: {d.type}")
     logging.info("tokenizing, merging, and shuffling master dataset")
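To show where the new branch fits, here is a hedged sketch of what the "completion" dataset type does end to end. The config values, dataset file, and tokenizer are illustrative assumptions, and the cfg/entry objects are SimpleNamespace stand-ins rather than axolotl's real config types.

from types import SimpleNamespace

from datasets import load_dataset
from transformers import AutoTokenizer

from axolotl.prompt_tokenizers import CompletionPromptTokenizingStrategy
from axolotl.prompters import CompletionPrompter

# Stand-ins for cfg and a single cfg.datasets entry (hypothetical values).
cfg = SimpleNamespace(train_on_inputs=False, sequence_len=2048)
d = SimpleNamespace(type="completion", path="json", data_files="raw_text.jsonl")

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # assumed tokenizer
ds = load_dataset(d.path, data_files=d.data_files)  # rows shaped like {"text": ...}

if d.type == "completion":
    # Mirrors the new elif branch above; the real code wraps the strategy in
    # TokenizedPromptDataset instead of looping over rows manually.
    ds_strategy = CompletionPromptTokenizingStrategy(
        CompletionPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    tokenized_rows = [ds_strategy.tokenize_prompt(row) for row in ds["train"]]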