Spaces:

Dovakiins
/

qwerrwe

Build error

App Files Files Community

winglian committed on Jun 6, 2023

Commit

9a02e7e

unverified ·

2 Parent(s): 328c3bc 5b33e29

Merge pull request #155 from OpenAccess-AI-Collective/misc-fixes

Browse files

new prompters, misc fixes for output dir missing using fsdp, and changing max seq len

Files changed (5) hide show

README.md +20 -0
scripts/finetune.py +3 -0
src/axolotl/prompt_strategies/alpaca_chat.py +40 -0
src/axolotl/prompt_strategies/context_qa.py +67 -0
src/axolotl/utils/models.py +4 -0

README.md CHANGED Viewed

@@ -165,10 +165,30 @@ Have dataset(s) in one of the following format (JSONL recommended):
   ```json
   {"article": "...", "summary": "..."}
   ```
 - `alpaca_chat.load_qa`: question and answer for alpaca chat
   ```json
   {"question": "...", "answer": "..."}
   ```
 - `creative_acr.load_answer`: instruction and revision
   ```json
   {"instruction": "...", "revision": "..."}

   ```json
   {"article": "...", "summary": "..."}
   ```
+- `alpaca_chat`: basic instruct for alpaca chat
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
 - `alpaca_chat.load_qa`: question and answer for alpaca chat
   ```json
   {"question": "...", "answer": "..."}
   ```
+- `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for CamelAI datasets
+  ```json
+  {"message_1": "...", "message_2": "..."}
+  ```
+- `context_qa`: in context question answering from an article
+  ```json
+  {"article": "...", "question": "...", "answer": "..."}
+  ```
+- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
+  ```json
+  {"article": "...", "unanswerable_question": "..."}
+  ```
 - `creative_acr.load_answer`: instruction and revision
   ```json
   {"instruction": "...", "revision": "..."}

scripts/finetune.py CHANGED Viewed

@@ -279,6 +279,9 @@ def train(
             logging.info(
                 f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
             )
     trainer.train(resume_from_checkpoint=resume_from_checkpoint)
     logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

             logging.info(
                 f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
             )
+    if not Path(cfg.output_dir).is_dir():
+        os.makedirs(cfg.output_dir, exist_ok=True)
     trainer.train(resume_from_checkpoint=resume_from_checkpoint)
     logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

src/axolotl/prompt_strategies/alpaca_chat.py CHANGED Viewed

@@ -18,6 +18,15 @@ def load(tokenizer, cfg):
     )
 class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
     """
     Tokenizing strategy for AlpacaQA
@@ -31,6 +40,28 @@ class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
         )
 def load_qa(tokenizer, cfg):
     return AlpacaQAPromptTokenizingStrategy(
         AlpacaPrompter(PromptStyle.CHAT.value),
@@ -38,3 +69,12 @@ def load_qa(tokenizer, cfg):
         cfg.train_on_inputs,
         cfg.sequence_len,
     )

     )
+class AlpacaConcisePrompter(AlpacaPrompter):
+    """
+    AlpacaPrompter variant whose system prompts additionally ask for concise responses
+    """
+    # Prompt text for an instruction that is paired with an input.
+    system_prompt = (
+        "Below is an instruction that describes a task, paired with an input "
+        "that provides further context. Write a response that concisely and "
+        "appropriately completes the request.\n\n"
+    )
+    # Prompt text for an instruction on its own.
+    system_no_input_prompt = (
+        "Below is an instruction that describes a task. Write a response that "
+        "appropriately and concisely completes the request.\n\n"
+    )
 class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
     """
     Tokenizing strategy for AlpacaQA
         )
+class CamelAIPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
+    """
+    Tokenizing strategy for CamelAI datasets
+    """
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """
+        Map a CamelAI row onto an (instruction, input, response) tuple.
+
+        `message_1` is the instruction, the input is always empty, and
+        `message_2` is the response.
+        """
+        return (
+            prompt["message_1"],
+            "",
+            # The reply lives in message_2; using message_1 here would make the
+            # training target a copy of the instruction.
+            prompt["message_2"],
+        )
+def load_concise(tokenizer, cfg):
+    """Build the Alpaca tokenizing strategy around the concise chat-style prompter."""
+    concise_prompter = AlpacaConcisePrompter(PromptStyle.CHAT.value)
+    return AlpacaPromptTokenizingStrategy(
+        concise_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
 def load_qa(tokenizer, cfg):
     return AlpacaQAPromptTokenizingStrategy(
         AlpacaPrompter(PromptStyle.CHAT.value),
         cfg.train_on_inputs,
         cfg.sequence_len,
     )
+def load_camel_ai(tokenizer, cfg):
+    """Build the CamelAI tokenizing strategy around a chat-style Alpaca prompter."""
+    chat_prompter = AlpacaPrompter(PromptStyle.CHAT.value)
+    return CamelAIPromptTokenizingStrategy(
+        chat_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )

src/axolotl/prompt_strategies/context_qa.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Module containing the classes for Context QA Prompt Tokenization Strategies"""
+from typing import Tuple
+from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+# Dataset fields read by the strategies below: article, unanswerable_question, question, answer
+def load_404(tokenizer, cfg):
+    """Build the context-QA strategy that answers unanswerable questions with a fixed response."""
+    context_prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
+    return AlpacaMissingInfoContextPromptTokenizingStrategy(
+        context_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+def load(tokenizer, cfg):
+    """Build the context-QA strategy that pairs an article and question with its answer."""
+    context_prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
+    return AlpacaContextPromptTokenizingStrategy(
+        context_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+class AlpacaContextPrompter(AlpacaPrompter):
+    """
+    AlpacaPrompter with a customized system prompt for concise, in-context QA
+    """
+    system_prompt = (
+        "Use the following contextual information to concisely answer the question.\n"
+    )
+    # The same text is used whether or not an input is present.
+    system_no_input_prompt = system_prompt
+class AlpacaContextPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
+    """
+    Tokenization strategy that joins an in-context article with a question and pairs it with the answer
+    """
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """Return (article + separator + question, empty input, answer) for one row."""
+        separator = "\n===\n"
+        instruction = prompt["article"] + separator + prompt["question"]
+        response = prompt["answer"]
+        return instruction, "", response
+class AlpacaMissingInfoContextPromptTokenizingStrategy(
+    InstructionPromptTokenizingStrategy
+):
+    """
+    Tokenization strategy that joins an in-context article with a question the article
+    cannot answer, and pairs it with a fixed response saying so
+    """
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """Return (article + separator + unanswerable question, empty input, fixed response)."""
+        separator = "\n===\n"
+        instruction = prompt["article"] + separator + prompt["unanswerable_question"]
+        # Every row gets the same canned answer, whatever the article or question.
+        default_response = (
+            "The context provided does not contain any information about your inquiry. "
+            "Therefore, I'm unable to answer your question based on the given context."
+        )
+        return instruction, "", default_response

src/axolotl/utils/models.py CHANGED Viewed

@@ -234,6 +234,10 @@ def load_model(
                 base_model,
                 trust_remote_code=cfg.trust_remote_code or False,
             )
             model = AutoModelForCausalLM.from_pretrained(
                 base_model,
                 config=config,

                 base_model,
                 trust_remote_code=cfg.trust_remote_code or False,
             )
+            # Raise the model config's max_seq_len to the requested sequence_len. Usually harmless, but a model
+            # that cannot support the longer length will error once training starts.
+            if config.max_seq_len and cfg.sequence_len > config.max_seq_len:
+                config.max_seq_len = cfg.sequence_len
             model = AutoModelForCausalLM.from_pretrained(
                 base_model,
                 config=config,