Merge branch 'main' into hf-ui-demo
Files changed:
- LLaMA_LoRA.ipynb +16 -19
- README.md +4 -1
- app.py +3 -0
- llama_lora/globals.py +2 -0
- llama_lora/lib/inference.py +3 -3
- llama_lora/models.py +83 -23
- llama_lora/ui/inference_ui.py +175 -25
- llama_lora/ui/main_page.py +39 -0
LLaMA_LoRA.ipynb
CHANGED
@@ -60,20 +60,15 @@
 "# @title A small workaround { display-mode: \"form\" }\n",
 "# @markdown Don't panic if you see an error here. Just click the `RESTART RUNTIME` button in the output below, then Run All again.\n",
 "# @markdown The error will disappear on the next run.\n",
-"
+"%pip install Pillow==9.3.0 numpy==1.23.5\n",
 "\n",
+"import pkg_resources as r\n",
 "import PIL\n",
-"major, minor = map(float, PIL.__version__.split(\".\")[:2])\n",
-"version_float = major + minor / 10**len(str(minor))\n",
-"print('PIL', version_float)\n",
-"if version_float < 9.003:\n",
-" raise Exception(\"Restart the runtime by clicking the 'RESTART RUNTIME' button above (or Runtime > Restart Runtime).\")\n",
-"\n",
 "import numpy\n",
-"
-"
-"print(
-"if
+"for module, min_version in [(PIL, \"9.3\"), (numpy, \"1.23\")]:\n",
+" lib_version = r.parse_version(module.__version__)\n",
+" print(module.__name__, lib_version)\n",
+" if lib_version < r.parse_version(min_version):\n",
 " raise Exception(\"Restart the runtime by clicking the 'RESTART RUNTIME' button above (or Runtime > Restart Runtime).\")"
 ],
 "metadata": {
@@ -144,15 +139,17 @@
 "# colab_notebook_name = remove_ipynb_extension(colab_notebook_filename)\n",
 "\n",
 "from google.colab import drive\n",
-"
+"try:\n",
+" drive.mount(google_drive_mount_path)\n",
 "\n",
-"
-"
-"
-"!
-"!
-"!
-"
+" google_drive_data_directory_relative_path = google_drive_folder\n",
+" google_drive_data_directory_path = f\"{google_drive_mount_path}/My Drive/{google_drive_data_directory_relative_path}\"\n",
+" !mkdir -p \"{google_drive_data_directory_path}\"\n",
+" !ln -nsf \"{google_drive_data_directory_path}\" ./data\n",
+" !touch \"data/This folder is used by the Colab notebook \\\"{colab_notebook_filename}\\\".txt\"\n",
+" !echo \"Data will be stored in Google Drive folder: \\\"{google_drive_data_directory_relative_path}\\\", which is mounted under \\\"{google_drive_data_directory_path}\\\"\"\n",
+"except Exception as e:\n",
+" print(\"Drive won't be mounted!\")"
 ],
 "metadata": {
 "id": "iZmRtUY68U5f"
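
The reworked workaround cell boils down to a plain version check. Shown here as ordinary Python, outside the notebook's JSON string escaping (same logic as the added cell, with indentation chosen for readability):

    import pkg_resources as r
    import PIL
    import numpy

    # If the preinstalled Pillow/NumPy are older than the versions the app needs,
    # the Colab runtime is still holding the old copies and must be restarted.
    for module, min_version in [(PIL, "9.3"), (numpy, "1.23")]:
        lib_version = r.parse_version(module.__version__)
        print(module.__name__, lib_version)
        if lib_version < r.parse_version(min_version):
            raise Exception("Restart the runtime by clicking the 'RESTART RUNTIME' button above (or Runtime > Restart Runtime).")
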
README.md
CHANGED
@@ -35,6 +35,7 @@ Making evaluating and fine-tuning LLaMA models with low-rank adaptation (LoRA) e
 * **[1-click up and running in Google Colab](#run-on-google-colab)** with a standard GPU runtime.
 * Loads and stores data in Google Drive.
 * Evaluate various LLaMA LoRA models stored in your folder or from Hugging Face.<br /><a href="https://youtu.be/IoEMgouZ5xU"><img width="640px" src="https://user-images.githubusercontent.com/3784687/231023326-f28c84e2-df74-4179-b0ac-c25c4e8ca001.gif" /></a>
+* Switch between base models such as `decapoda-research/llama-7b-hf`, `nomic-ai/gpt4all-j`, `databricks/dolly-v2-7b`, `EleutherAI/gpt-j-6b`, or `EleutherAI/pythia-6.9b`.
 * Fine-tune LLaMA models with different prompt templates and training dataset format.<br /><a href="https://youtu.be/IoEMgouZ5xU?t=60"><img width="640px" src="https://user-images.githubusercontent.com/3784687/231026640-b5cf5c79-9fe9-430b-8d4e-7346eb9567ad.gif" /></a>
 * Load JSON and JSONL datasets from your folder, or even paste plain text directly into the UI.
 * Supports Stanford Alpaca [seed_tasks](https://github.com/tatsu-lab/stanford_alpaca/blob/main/seed_tasks.jsonl), [alpaca_data](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) and [OpenAI "prompt"-"completion"](https://platform.openai.com/docs/guides/fine-tuning/data-formatting) format.
@@ -86,11 +87,13 @@ setup: |
   pip install wandb
   cd ..
   echo 'Dependencies installed.'
+  echo 'Pre-downloading base models so that you won't have to wait for long once the app is ready...'
+  python llama_lora_tuner/download_base_model.py --base_model_names='decapoda-research/llama-7b-hf,nomic-ai/gpt4all-j,databricks/dolly-v2-7b'
 
 # Start the app.
 run: |
   echo 'Starting...'
-  python llama_lora_tuner/app.py --data_dir='/data' --wandb_api_key
+  python llama_lora_tuner/app.py --data_dir='/data' --wandb_api_key="$([ -f /data/secrets/wandb_api_key ] && cat /data/secrets/wandb_api_key | tr -d '\n')" --base_model=decapoda-research/llama-7b-hf --base_model_choices='decapoda-research/llama-7b-hf,nomic-ai/gpt4all-j,databricks/dolly-v2-7b --share
 ```
 
 Then launch a cluster to run the task:
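
The new setup step pre-downloads the base models so the first launch does not block on multi-gigabyte downloads. The actual `download_base_model.py` is not shown in this diff; a minimal sketch of the idea, under the assumption that all it needs to do is warm the local Hugging Face cache, could be:

    from huggingface_hub import snapshot_download

    # Hypothetical stand-in for llama_lora_tuner/download_base_model.py:
    # fetching each repo once populates the local cache, so the app's later
    # from_pretrained() calls do not have to download anything.
    base_model_names = [
        "decapoda-research/llama-7b-hf",
        "nomic-ai/gpt4all-j",
        "databricks/dolly-v2-7b",
    ]

    for name in base_model_names:
        snapshot_download(repo_id=name)
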
app.py
CHANGED
@@ -15,6 +15,7 @@ def main(
     base_model: str = "",
     data_dir: str = "",
     base_model_choices: str = "",
+    trust_remote_code: bool = False,
     # Allows to listen on all interfaces by providing '0.0.0.0'.
     server_name: str = "127.0.0.1",
     share: bool = False,
@@ -60,6 +61,8 @@ def main(
     if base_model not in Global.base_model_choices:
         Global.base_model_choices = [base_model] + Global.base_model_choices
 
+    Global.trust_remote_code = trust_remote_code
+
     Global.data_dir = os.path.abspath(data_dir)
     Global.load_8bit = load_8bit
 
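
The new main() parameter is also what the README's `--trust_remote_code`-style flags map onto. A hedged sketch of exercising it programmatically, assuming you run from the repository root and using only the parameters visible in this diff:

    from app import main

    # Sketch only: start the UI while allowing Hugging Face repos that ship
    # custom modeling code to be loaded. The value is stored on
    # Global.trust_remote_code and forwarded into every from_pretrained() call.
    main(
        base_model="decapoda-research/llama-7b-hf",
        base_model_choices="decapoda-research/llama-7b-hf,nomic-ai/gpt4all-j",
        data_dir="./data",
        trust_remote_code=True,
    )
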
llama_lora/globals.py
CHANGED
@@ -20,6 +20,8 @@ class Global:
     base_model_name: str = ""
     base_model_choices: List[str] = []
 
+    trust_remote_code = False
+
     # Functions
     train_fn: Any = train
 
llama_lora/lib/inference.py
CHANGED
@@ -66,14 +66,14 @@ def generate(
         with generate_with_streaming(**generate_params) as generator:
             for output in generator:
                 decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
-                yield decoded_output, output
+                yield decoded_output, output, False
                 if output[-1] in [tokenizer.eos_token_id]:
                     break
 
         if generation_output:
             output = generation_output.sequences[0]
             decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
-            yield decoded_output, output
+            yield decoded_output, output, True
 
         return  # early return for stream_output
 
@@ -82,5 +82,5 @@ def generate(
     generation_output = model.generate(**generate_params)
     output = generation_output.sequences[0]
     decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
-    yield decoded_output, output
+    yield decoded_output, output, True
     return
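
generate() now yields three-tuples, with the new third element marking whether the yielded text is the final (non-streaming) result. A minimal consumer sketch; generation_args and handle_final() are hypothetical stand-ins for the caller's own plumbing, as built in llama_lora/ui/inference_ui.py:

    from llama_lora.lib.inference import generate

    # Sketch: iterate the streaming generator and act on the new `completed` flag.
    for decoded_output, output, completed in generate(**generation_args):
        print(decoded_output)  # partial text while streaming
        if completed:
            # The final yield carries the full decoded text and raw token ids.
            handle_final(decoded_output, raw_token_ids=output)
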
llama_lora/models.py
CHANGED
@@ -5,7 +5,10 @@ import json
 import re
 
 import torch
-from transformers import
+from transformers import (
+    AutoModelForCausalLM, AutoModel,
+    AutoTokenizer, LlamaTokenizer
+)
 from peft import PeftModel
 
 from .globals import Global
@@ -27,37 +30,83 @@ def get_new_base_model(base_model_name):
     Global.name_of_new_base_model_that_is_ready_to_be_used = None
     clear_cache()
 
+    model_class = AutoModelForCausalLM
+    from_tf = False
+    force_download = False
+    has_tried_force_download = False
+    while True:
+        try:
+            model = _get_model_from_pretrained(
+                model_class, base_model_name, from_tf=from_tf, force_download=force_download)
+            break
+        except Exception as e:
+            if 'from_tf' in str(e):
+                print(
+                    f"Got error while loading model {base_model_name} with AutoModelForCausalLM: {e}.")
+                print("Retrying with from_tf=True...")
+                from_tf = True
+                force_download = False
+            elif model_class == AutoModelForCausalLM:
+                print(
+                    f"Got error while loading model {base_model_name} with AutoModelForCausalLM: {e}.")
+                print("Retrying with AutoModel...")
+                model_class = AutoModel
+                force_download = False
+            else:
+                if has_tried_force_download:
+                    raise e
+                print(
+                    f"Got error while loading model {base_model_name}: {e}.")
+                print("Retrying with force_download=True...")
+                model_class = AutoModelForCausalLM
+                from_tf = False
+                force_download = True
+                has_tried_force_download = True
+
+    tokenizer = get_tokenizer(base_model_name)
+
+    if re.match("[^/]+/llama", base_model_name):
+        model.config.pad_token_id = tokenizer.pad_token_id = 0
+        model.config.bos_token_id = tokenizer.bos_token_id = 1
+        model.config.eos_token_id = tokenizer.eos_token_id = 2
+
+    return model
+
+
+def _get_model_from_pretrained(model_class, model_name, from_tf=False, force_download=False):
     device = get_device()
 
     if device == "cuda":
-
-
+        return model_class.from_pretrained(
+            model_name,
             load_in_8bit=Global.load_8bit,
             torch_dtype=torch.float16,
             # device_map="auto",
             # ? https://github.com/tloen/alpaca-lora/issues/21
             device_map={'': 0},
+            from_tf=from_tf,
+            force_download=force_download,
+            trust_remote_code=Global.trust_remote_code
         )
     elif device == "mps":
-
-
+        return model_class.from_pretrained(
+            model_name,
             device_map={"": device},
             torch_dtype=torch.float16,
+            from_tf=from_tf,
+            force_download=force_download,
+            trust_remote_code=Global.trust_remote_code
        )
     else:
-
-
+        return model_class.from_pretrained(
+            model_name,
+            device_map={"": device},
+            low_cpu_mem_usage=True,
+            from_tf=from_tf,
+            force_download=force_download,
+            trust_remote_code=Global.trust_remote_code
        )
 
-    tokenizer = get_tokenizer(base_model_name)
-
-    if re.match("[^/]+/llama", base_model_name):
-        model.config.pad_token_id = tokenizer.pad_token_id = 0
-        model.config.bos_token_id = tokenizer.bos_token_id = 1
-        model.config.eos_token_id = tokenizer.eos_token_id = 2
-
-    return model
-
 
 def get_tokenizer(base_model_name):
     if Global.ui_dev_mode:
@@ -68,10 +117,16 @@ def get_tokenizer(base_model_name):
         return loaded_tokenizer
 
     try:
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
+            base_model_name,
+            trust_remote_code=Global.trust_remote_code
+        )
     except Exception as e:
         if 'LLaMATokenizer' in str(e):
-            tokenizer = LlamaTokenizer.from_pretrained(
+            tokenizer = LlamaTokenizer.from_pretrained(
+                base_model_name,
+                trust_remote_code=Global.trust_remote_code
+            )
         else:
             raise e
 
@@ -100,13 +155,15 @@ def get_model(
     peft_model_name_or_path = peft_model_name
 
     if peft_model_name:
-        lora_models_directory_path = os.path.join(
+        lora_models_directory_path = os.path.join(
+            Global.data_dir, "lora_models")
         possible_lora_model_path = os.path.join(
             lora_models_directory_path, peft_model_name)
         if os.path.isdir(possible_lora_model_path):
            peft_model_name_or_path = possible_lora_model_path
 
-        possible_model_info_json_path = os.path.join(
+        possible_model_info_json_path = os.path.join(
+            possible_lora_model_path, "info.json")
         if os.path.isfile(possible_model_info_json_path):
             try:
                 with open(possible_model_info_json_path, "r") as file:
@@ -115,7 +172,8 @@ def get_model(
                    if possible_hf_model_name and json_data.get("load_from_hf"):
                        peft_model_name_or_path = possible_hf_model_name
            except Exception as e:
-                raise ValueError(
+                raise ValueError(
+                    "Error reading model info from {possible_model_info_json_path}: {e}")
 
     Global.loaded_models.prepare_to_set()
     clear_cache()
@@ -148,7 +206,8 @@ def get_model(
         )
 
     if re.match("[^/]+/llama", base_model_name):
-        model.config.pad_token_id = get_tokenizer(
+        model.config.pad_token_id = get_tokenizer(
+            base_model_name).pad_token_id = 0
         model.config.bos_token_id = 1
         model.config.eos_token_id = 2
 
@@ -166,7 +225,8 @@ def get_model(
 
 
 def prepare_base_model(base_model_name=Global.default_base_model_name):
-    Global.new_base_model_that_is_ready_to_be_used = get_new_base_model(
+    Global.new_base_model_that_is_ready_to_be_used = get_new_base_model(
+        base_model_name)
     Global.name_of_new_base_model_that_is_ready_to_be_used = base_model_name
 
 
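
The net effect of the new _get_model_from_pretrained() helper is that every from_pretrained() call now also receives from_tf, force_download, and trust_remote_code. In isolation, the CUDA-path call reduces to roughly the following (a sketch with example values; in the app the values come from Global and from the retry loop above):

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "decapoda-research/llama-7b-hf",  # example base model name
        load_in_8bit=False,               # Global.load_8bit in the app
        torch_dtype=torch.float16,
        device_map={"": 0},               # assumes a single CUDA device
        from_tf=False,
        force_download=False,
        trust_remote_code=False,          # Global.trust_remote_code in the app
    )
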
llama_lora/ui/inference_ui.py
CHANGED
@@ -1,4 +1,5 @@
 import gradio as gr
+import os
 import time
 import json
 
@@ -21,13 +22,21 @@ default_show_raw = True
 inference_output_lines = 12
 
 
+class LoggingItem:
+    def __init__(self, label):
+        self.label = label
+
+    def deserialize(self, value, **kwargs):
+        return value
+
+
 def prepare_inference(lora_model_name, progress=gr.Progress(track_tqdm=True)):
     base_model_name = Global.base_model_name
 
     try:
         get_tokenizer(base_model_name)
         get_model(base_model_name, lora_model_name)
-        return ("", "")
+        return ("", "", gr.Textbox.update(visible=False))
 
     except Exception as e:
         raise gr.Error(e)
@@ -65,6 +74,31 @@ def do_inference(
     prompter = Prompter(prompt_template)
     prompt = prompter.generate_prompt(variables)
 
+    generation_config = GenerationConfig(
+        # to avoid ValueError('`temperature` has to be a strictly positive float, but is 2')
+        temperature=float(temperature),
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        num_beams=num_beams,
+        # https://github.com/huggingface/transformers/issues/22405#issuecomment-1485527953
+        do_sample=temperature > 0,
+    )
+
+    def get_output_for_flagging(output, raw_output, completed=True):
+        return json.dumps({
+            'base_model': base_model_name,
+            'adaptor_model': lora_model_name,
+            'prompt': prompt,
+            'output': output,
+            'completed': completed,
+            'raw_output': raw_output,
+            'max_new_tokens': max_new_tokens,
+            'prompt_template': prompt_template,
+            'prompt_template_variables': variables,
+            'generation_config': generation_config.to_dict(),
+        })
+
     if Global.ui_dev_mode:
         message = f"Hi, I’m currently in UI-development mode and do not have access to resources to process your request. However, this behavior is similar to what will actually happen, so you can try and see how it will work!\n\nBase model: {base_model_name}\nLoRA model: {lora_model_name}\n\nThe following is your prompt:\n\n{prompt}"
         print(message)
@@ -83,35 +117,50 @@ def do_inference(
                 out += "\n"
                 yield out
 
+            output = ""
             for partial_sentence in word_generator(message):
+                output = partial_sentence
                 yield (
                     gr.Textbox.update(
-                        value=
+                        value=output,
+                        lines=inference_output_lines),
                    json.dumps(
-                        list(range(len(
+                        list(range(len(output.split()))),
+                        indent=2),
+                    gr.Textbox.update(
+                        value=get_output_for_flagging(
+                            output, "", completed=False),
+                        visible=True)
                )
                time.sleep(0.05)
 
+            yield (
+                gr.Textbox.update(
+                    value=output,
+                    lines=inference_output_lines),
+                json.dumps(
+                    list(range(len(output.split()))),
+                    indent=2),
+                gr.Textbox.update(
+                    value=get_output_for_flagging(
+                        output, "", completed=True),
+                    visible=True)
+            )
+
            return
        time.sleep(1)
        yield (
            gr.Textbox.update(value=message, lines=inference_output_lines),
-            json.dumps(list(range(len(message.split()))), indent=2)
+            json.dumps(list(range(len(message.split()))), indent=2),
+            gr.Textbox.update(
+                value=get_output_for_flagging(message, ""),
+                visible=True)
        )
        return
 
     tokenizer = get_tokenizer(base_model_name)
     model = get_model(base_model_name, lora_model_name)
 
-    generation_config = GenerationConfig(
-        temperature=float(temperature),  # to avoid ValueError('`temperature` has to be a strictly positive float, but is 2')
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        num_beams=num_beams,
-        do_sample=temperature > 0,  # https://github.com/huggingface/transformers/issues/22405#issuecomment-1485527953
-    )
-
     def ui_generation_stopping_criteria(input_ids, score, **kwargs):
         if Global.should_stop_generating:
             return True
@@ -129,10 +178,8 @@ def do_inference(
        'stream_output': stream_output
    }
 
-    for (decoded_output, output) in generate(**generation_args):
-        raw_output_str =
-        if show_raw:
-            raw_output_str = str(output)
+    for (decoded_output, output, completed) in generate(**generation_args):
+        raw_output_str = str(output)
        response = prompter.get_response(decoded_output)
 
        if Global.should_stop_generating:
@@ -141,7 +188,12 @@ def do_inference(
        yield (
            gr.Textbox.update(
                value=response, lines=inference_output_lines),
-            raw_output_str
+            raw_output_str,
+            gr.Textbox.update(
+                value=get_output_for_flagging(
+                    decoded_output, raw_output_str, completed=completed),
+                visible=True)
+        )
 
        if Global.should_stop_generating:
            # If the user stops the generation, and then clicks the
@@ -199,11 +251,13 @@ def get_warning_message_for_lora_model_and_prompt_template(lora_model, prompt_te
    if lora_mode_info and isinstance(lora_mode_info, dict):
        model_base_model = lora_mode_info.get("base_model")
        if model_base_model and model_base_model != Global.base_model_name:
-            messages.append(
+            messages.append(
+                f"⚠️ This model was trained on top of base model `{model_base_model}`, it might not work properly with the selected base model `{Global.base_model_name}`.")
 
        model_prompt_template = lora_mode_info.get("prompt_template")
        if model_prompt_template and model_prompt_template != prompt_template:
-            messages.append(
+            messages.append(
+                f"This model was trained with prompt template `{model_prompt_template}`.")
 
    return " ".join(messages)
 
@@ -221,7 +275,8 @@ def handle_prompt_template_change(prompt_template, lora_model):
 
    model_prompt_template_message_update = gr.Markdown.update(
        "", visible=False)
-    warning_message = get_warning_message_for_lora_model_and_prompt_template(
+    warning_message = get_warning_message_for_lora_model_and_prompt_template(
+        lora_model, prompt_template)
    if warning_message:
        model_prompt_template_message_update = gr.Markdown.update(
            warning_message, visible=True)
@@ -241,7 +296,8 @@ def handle_lora_model_change(lora_model, prompt_template):
 
    model_prompt_template_message_update = gr.Markdown.update(
        "", visible=False)
-    warning_message = get_warning_message_for_lora_model_and_prompt_template(
+    warning_message = get_warning_message_for_lora_model_and_prompt_template(
+        lora_model, prompt_template)
    if warning_message:
        model_prompt_template_message_update = gr.Markdown.update(
            warning_message, visible=True)
@@ -260,6 +316,56 @@ def update_prompt_preview(prompt_template,
 
 
 def inference_ui():
+    flagging_dir = os.path.join(Global.data_dir, "flagging", "inference")
+    if not os.path.exists(flagging_dir):
+        os.makedirs(flagging_dir)
+
+    flag_callback = gr.CSVLogger()
+    flag_components = [
+        LoggingItem("Base Model"),
+        LoggingItem("Adaptor Model"),
+        LoggingItem("Type"),
+        LoggingItem("Prompt"),
+        LoggingItem("Output"),
+        LoggingItem("Completed"),
+        LoggingItem("Config"),
+        LoggingItem("Raw Output"),
+        LoggingItem("Max New Tokens"),
+        LoggingItem("Prompt Template"),
+        LoggingItem("Prompt Template Variables"),
+        LoggingItem("Generation Config"),
+    ]
+    flag_callback.setup(flag_components, flagging_dir)
+
+    def get_flag_callback_args(output_for_flagging_str, flag_type):
+        output_for_flagging = json.loads(output_for_flagging_str)
+        generation_config = output_for_flagging.get("generation_config", {})
+        config = []
+        if generation_config.get('do_sample', False):
+            config.append(
+                f"Temperature: {generation_config.get('temperature')}")
+            config.append(f"Top P: {generation_config.get('top_p')}")
+            config.append(f"Top K: {generation_config.get('top_k')}")
+        num_beams = generation_config.get('num_beams', 1)
+        if num_beams > 1:
+            config.append(f"Beams: {generation_config.get('num_beams')}")
+        config.append(f"RP: {generation_config.get('repetition_penalty')}")
+        return [
+            output_for_flagging.get("base_model", ""),
+            output_for_flagging.get("adaptor_model", ""),
+            flag_type,
+            output_for_flagging.get("prompt", ""),
+            output_for_flagging.get("output", ""),
+            str(output_for_flagging.get("completed", "")),
+            ", ".join(config),
+            output_for_flagging.get("raw_output", ""),
+            str(output_for_flagging.get("max_new_tokens", "")),
+            output_for_flagging.get("prompt_template", ""),
+            json.dumps(output_for_flagging.get(
+                "prompt_template_variables", "")),
+            json.dumps(output_for_flagging.get("generation_config", "")),
+        ]
+
    things_that_might_timeout = []
 
    with gr.Blocks() as inference_ui_blocks:
@@ -387,6 +493,47 @@ def inference_ui():
                inference_output = gr.Textbox(
                    lines=inference_output_lines, label="Output", elem_id="inference_output")
                inference_output.style(show_copy_button=True)
+
+                with gr.Row(elem_id="inference_flagging_group"):
+                    output_for_flagging = gr.Textbox(
+                        interactive=False, visible=False,
+                        elem_id="inference_output_for_flagging")
+                    flag_btn = gr.Button(
+                        "Flag", elem_id="inference_flag_btn")
+                    flag_up_btn = gr.Button(
+                        "👍", elem_id="inference_flag_up_btn")
+                    flag_down_btn = gr.Button(
+                        "👎", elem_id="inference_flag_down_btn")
+                    flag_output = gr.Markdown(
+                        "", elem_id="inference_flag_output")
+                    flag_btn.click(
+                        lambda d: (flag_callback.flag(
+                            get_flag_callback_args(d, "Flag"),
+                            flag_option="Flag",
+                            username=None
+                        ), "")[1],
+                        inputs=[output_for_flagging],
+                        outputs=[flag_output],
+                        preprocess=False)
+                    flag_up_btn.click(
+                        lambda d: (flag_callback.flag(
+                            get_flag_callback_args(d, "👍"),
+                            flag_option="Up Vote",
+                            username=None
+                        ), "")[1],
+                        inputs=[output_for_flagging],
+                        outputs=[flag_output],
+                        preprocess=False)
+                    flag_down_btn.click(
+                        lambda d: (flag_callback.flag(
+                            get_flag_callback_args(d, "👎"),
+                            flag_option="Down Vote",
+                            username=None
+                        ), "")[1],
+                        inputs=[output_for_flagging],
+                        outputs=[flag_output],
+                        preprocess=False)
+
                with gr.Accordion(
                    "Raw Output",
                    open=not default_show_raw,
@@ -400,7 +547,8 @@ def inference_ui():
                    interactive=False,
                    elem_id="inference_raw_output")
 
-        reload_selected_models_btn = gr.Button(
+        reload_selected_models_btn = gr.Button(
+            "", elem_id="inference_reload_selected_models_btn")
 
        show_raw_change_event = show_raw.change(
            fn=lambda show_raw: gr.Accordion.update(visible=show_raw),
@@ -440,7 +588,8 @@ def inference_ui():
        generate_event = generate_btn.click(
            fn=prepare_inference,
            inputs=[lora_model],
-            outputs=[inference_output,
+            outputs=[inference_output,
+                     inference_raw_output, output_for_flagging],
        ).then(
            fn=do_inference,
            inputs=[
@@ -457,7 +606,8 @@ def inference_ui():
                stream_output,
                show_raw,
            ],
-            outputs=[inference_output,
+            outputs=[inference_output,
+                     inference_raw_output, output_for_flagging],
            api_name="inference"
        )
        stop_btn.click(
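
The new flagging buttons are wired through Gradio's CSVLogger, which appends one row per flag to a CSV file under the data directory; the LoggingItem shims above stand in for real components so arbitrary strings can be logged. A stripped-down, standalone sketch of the same mechanism with Gradio 3.x's flagging API (hypothetical components and labels):

    import gradio as gr

    logger = gr.CSVLogger()

    with gr.Blocks() as demo:
        prompt_box = gr.Textbox(label="Prompt")
        output_box = gr.Textbox(label="Output")
        status = gr.Markdown("")
        flag_btn = gr.Button("Flag")

        # Register the components and the target directory for flagging.
        logger.setup([prompt_box, output_box], "./flagged")

        # flag() appends the current values as one CSV row; mirror the app's
        # "(flag(...), '')[1]" trick so the click handler returns a value for `status`.
        flag_btn.click(
            lambda p, o: (logger.flag([p, o], flag_option="Flag"), "Flagged.")[1],
            inputs=[prompt_box, output_box],
            outputs=[status],
        )

    demo.launch()
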
llama_lora/ui/main_page.py
CHANGED
@@ -398,6 +398,45 @@ def main_page_custom_css():
         bottom: 16px;
     }
 
+    #inference_flagging_group {
+        position: relative;
+    }
+    #inference_flag_output {
+        min-height: 1px !important;
+        position: absolute;
+        top: 0;
+        bottom: 0;
+        right: 0;
+        pointer-events: none;
+        opacity: 0.5;
+    }
+    #inference_flag_output .wrap {
+        top: 0;
+        bottom: 0;
+        right: 0;
+        justify-content: center;
+        align-items: flex-end;
+        padding: 4px !important;
+    }
+    #inference_flag_output .wrap svg {
+        display: none;
+    }
+    .form:has(> #inference_output_for_flagging),
+    #inference_output_for_flagging {
+        display: none;
+    }
+    #inference_flagging_group:has(#inference_output_for_flagging.hidden) {
+        opacity: 0.5;
+        pointer-events: none;
+    }
+    #inference_flag_up_btn, #inference_flag_down_btn {
+        min-width: 44px;
+        flex-grow: 1;
+    }
+    #inference_flag_btn {
+        flex-grow: 2;
+    }
+
     #dataset_plain_text_input_variables_separator textarea,
     #dataset_plain_text_input_and_output_separator textarea,
     #dataset_plain_text_data_separator textarea {