zetavg committed
Commit 27aa501
Parent: 36b4c98

finetune: support more params

llama_lora/lib/finetune.py CHANGED
@@ -29,10 +29,10 @@ def train(
     # training hyperparams
     micro_batch_size: int = 4,
     gradient_accumulation_steps: int = 32,
-    num_epochs: int = 3,
+    num_train_epochs: int = 3,
     learning_rate: float = 3e-4,
     cutoff_len: int = 256,
-    val_set_size: int = 2000,
+    val_set_size: int = 2000,  # TODO: use percentage
     # lora hyperparams
     lora_r: int = 8,
     lora_alpha: int = 16,
@@ -46,12 +46,16 @@ def train(
     group_by_length: bool = False,  # faster, but produces an odd training loss curve
     # either training checkpoint or final adapter
     resume_from_checkpoint: str = None,
+    save_steps: int = 200,
+    save_total_limit: int = 3,
+    logging_steps: int = 10,
     # logging
     callbacks: List[Any] = []
 ):
     if os.path.exists(output_dir):
         if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
-            raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
+            raise ValueError(
+                f"The output directory already exists and is not empty. ({output_dir})")

     device_map = "auto"
     world_size = int(os.environ.get("WORLD_SIZE", 1))
@@ -186,17 +190,17 @@ def train(
         per_device_train_batch_size=micro_batch_size,
         gradient_accumulation_steps=gradient_accumulation_steps,
         warmup_steps=100,
-        num_train_epochs=num_epochs,
+        num_train_epochs=num_train_epochs,
         learning_rate=learning_rate,
         fp16=True,
-        logging_steps=10,
+        logging_steps=logging_steps,
         optim="adamw_torch",
         evaluation_strategy="steps" if val_set_size > 0 else "no",
         save_strategy="steps",
         eval_steps=200 if val_set_size > 0 else None,
-        save_steps=200,
+        save_steps=save_steps,
         output_dir=output_dir,
-        save_total_limit=3,
+        save_total_limit=save_total_limit,
         load_best_model_at_end=True if val_set_size > 0 else False,
         ddp_find_unused_parameters=False if ddp else None,
         group_by_length=group_by_length,
@@ -213,6 +217,24 @@ def train(
         os.makedirs(output_dir)
     with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
         json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
+    with open(os.path.join(output_dir, "finetune_params.json"), 'w') as finetune_params_json_file:
+        finetune_params = {
+            'micro_batch_size': micro_batch_size,
+            'gradient_accumulation_steps': gradient_accumulation_steps,
+            'num_train_epochs': num_train_epochs,
+            'learning_rate': learning_rate,
+            'cutoff_len': cutoff_len,
+            'lora_r': lora_r,
+            'lora_alpha': lora_alpha,
+            'lora_dropout': lora_dropout,
+            'lora_target_modules': lora_target_modules,
+            'train_on_inputs': train_on_inputs,
+            'group_by_length': group_by_length,
+            'save_steps': save_steps,
+            'save_total_limit': save_total_limit,
+            'logging_steps': logging_steps,
+        }
+        json.dump(finetune_params, finetune_params_json_file, indent=2)

     model.config.use_cache = False

@@ -232,7 +254,8 @@ def train(
     print(f"Model saved to {output_dir}.")

     with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
-        trainer_log_history = "\n".join([json.dumps(line) for line in trainer.state.log_history])
+        trainer_log_history = "\n".join(
+            [json.dumps(line) for line in trainer.state.log_history])
         trainer_log_history_jsonl_file.write(trainer_log_history)

     with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
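
For context, here is a minimal sketch (not part of the commit, assumes transformers and torch are installed) of how the newly exposed save_steps, save_total_limit and logging_steps parameters end up in transformers.TrainingArguments, using the new defaults from the train() signature. The output_dir path is hypothetical.

# Sketch only: mirrors the TrainingArguments construction shown in the diff
# above, with the commit's new default values; the output_dir is hypothetical.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora_models/example",   # hypothetical path
    per_device_train_batch_size=4,        # micro_batch_size
    gradient_accumulation_steps=32,
    warmup_steps=100,
    num_train_epochs=3,                   # renamed from num_epochs
    learning_rate=3e-4,
    logging_steps=10,                     # newly exposed as a train() parameter
    optim="adamw_torch",
    save_strategy="steps",
    save_steps=200,                       # newly exposed as a train() parameter
    save_total_limit=3,                   # newly exposed as a train() parameter
    # fp16=True is also set in the commit, but requires a CUDA device
)

A Trainer built with these arguments exposes them via trainer.args.to_dict(), which is the dictionary the diff above writes to trainer_args.json alongside the new finetune_params.json.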
llama_lora/ui/finetune_ui.py CHANGED
@@ -269,6 +269,9 @@ def do_train(
     lora_dropout,
     lora_target_modules,
     model_name,
+    save_steps,
+    save_total_limit,
+    logging_steps,
     progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
 ):
     try:
@@ -276,7 +279,8 @@ def do_train(
         output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
         if os.path.exists(output_dir):
             if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
-                raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
+                raise ValueError(
+                    f"The output directory already exists and is not empty. ({output_dir})")

         if not should_training_progress_track_tqdm:
             progress(0, desc="Preparing train data...")
@@ -484,6 +488,9 @@ Train data (first 10):
             train_on_inputs,  # train_on_inputs
             False,  # group_by_length
             None,  # resume_from_checkpoint
+            save_steps,  # save_steps
+            save_total_limit,  # save_total_limit
+            logging_steps,  # logging_steps
             training_callbacks  # callbacks
         )

@@ -500,7 +507,8 @@ Train data (first 10):
         return result_message

     except Exception as e:
-        raise gr.Error(f"{e} (To dismiss this error, click the 'Abort' button)")
+        raise gr.Error(
+            f"{e} (To dismiss this error, click the 'Abort' button)")


 def do_abort_training():
@@ -661,6 +669,8 @@ def finetune_ui():
                 )

                 with gr.Row():
+                    # https://huggingface.co/docs/transformers/main/main_classes/trainer
+
                     micro_batch_size_default_value = 1

                     if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
@@ -695,7 +705,7 @@ def finetune_ui():
                     )

                     evaluate_data_percentage = gr.Slider(
-                        minimum=0, maximum=0.5, step=0.001, value=0.03,
+                        minimum=0, maximum=0.5, step=0.001, value=0,
                         label="Evaluation Data Percentage",
                         info="The percentage of data to be used for evaluation. This percentage of data will not be used for training and will be used to assess the performance of the model during the process."
                     )
@@ -726,6 +736,26 @@ def finetune_ui():
                         info="Modules to replace with LoRA."
                     )

+                    with gr.Row():
+                        logging_steps = gr.Number(
+                            label="Logging Steps",
+                            precision=0,
+                            value=10,
+                            elem_id="finetune_logging_steps"
+                        )
+                        save_steps = gr.Number(
+                            label="Steps Per Save",
+                            precision=0,
+                            value=500,
+                            elem_id="finetune_save_steps"
+                        )
+                        save_total_limit = gr.Number(
+                            label="Saved Checkpoints Limit",
+                            precision=0,
+                            value=5,
+                            elem_id="finetune_save_total_limit"
+                        )
+
                 with gr.Column():
                     model_name = gr.Textbox(
                         lines=1, label="LoRA Model Name", value=random_name,
@@ -767,7 +797,10 @@ def finetune_ui():
                     lora_alpha,
                     lora_dropout,
                     lora_target_modules,
-                    model_name
+                    model_name,
+                    save_steps,
+                    save_total_limit,
+                    logging_steps,
                 ]),
                 outputs=train_output
             )
@@ -860,6 +893,28 @@ def finetune_ui():
                     'Press to load a sample dataset of the current selected format into the textbox.',
                 });

+                tippy('#finetune_save_total_limit', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Total amount of checkpoints to preserve. Older checkpoints will be deleted.',
+                });
+                tippy('#finetune_save_steps', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Number of updates steps before two checkpoint saves.',
+                });
+                tippy('#finetune_logging_steps', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Number of update steps between two logs.',
+                });
+
                 tippy('#finetune_model_name', {
                     placement: 'bottom',
                     delay: [500, 0],
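
The UI wiring above is spread across several hunks, so here is a stripped-down, self-contained sketch (not from the repository) of the same pattern: three gr.Number components, identified by the commit's elem_ids, passed as extra inputs to a click handler the way finetune_ui.py passes them to do_train(). The fake_train function and all names other than the elem_ids are hypothetical.

# Sketch only: illustrates the new gr.Number inputs and how they are forwarded
# through a Gradio click handler; fake_train stands in for do_train().
import gradio as gr

def fake_train(save_steps, save_total_limit, logging_steps):
    # Echo the values that would be forwarded to the training function.
    return f"save_steps={save_steps}, save_total_limit={save_total_limit}, logging_steps={logging_steps}"

with gr.Blocks() as demo:
    with gr.Row():
        logging_steps = gr.Number(label="Logging Steps", precision=0, value=10,
                                  elem_id="finetune_logging_steps")
        save_steps = gr.Number(label="Steps Per Save", precision=0, value=500,
                               elem_id="finetune_save_steps")
        save_total_limit = gr.Number(label="Saved Checkpoints Limit", precision=0, value=5,
                                     elem_id="finetune_save_total_limit")
    output = gr.Textbox(label="Result")
    train_btn = gr.Button("Train")
    train_btn.click(fn=fake_train,
                    inputs=[save_steps, save_total_limit, logging_steps],
                    outputs=output)

# demo.launch()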
llama_lora/ui/main_page.py CHANGED
@@ -432,6 +432,24 @@ def main_page_custom_css():
         flex: 2;
     }

+    #finetune_save_total_limit,
+    #finetune_save_steps,
+    #finetune_logging_steps {
+        min-width: min(120px,100%) !important;
+        padding-top: 4px;
+    }
+    #finetune_save_total_limit span,
+    #finetune_save_steps span,
+    #finetune_logging_steps span {
+        font-size: 12px;
+        margin-bottom: 5px;
+    }
+    #finetune_save_total_limit input,
+    #finetune_save_steps input,
+    #finetune_logging_steps input {
+        padding: 4px 8px;
+    }
+
     @media screen and (max-width: 392px) {
         #inference_lora_model, #finetune_template {
             border-bottom-left-radius: 0;
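
The CSS above targets the new components by their elem_id values. As a hedged illustration (not the project's actual code), custom CSS in Gradio is attached via gr.Blocks(css=...), which is presumably how the output of main_page_custom_css() reaches these selectors; the snippet below is a minimal stand-in that keeps only the first rule from the commit.

# Sketch only: shows how an elem_id on a component is matched by custom CSS
# passed to gr.Blocks(css=...); names other than the elem_ids and selectors
# from the commit are hypothetical.
import gradio as gr

custom_css = """
#finetune_save_total_limit,
#finetune_save_steps,
#finetune_logging_steps {
    min-width: min(120px,100%) !important;
    padding-top: 4px;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Number(label="Logging Steps", precision=0, value=10,
              elem_id="finetune_logging_steps")
    gr.Number(label="Steps Per Save", precision=0, value=500,
              elem_id="finetune_save_steps")
    gr.Number(label="Saved Checkpoints Limit", precision=0, value=5,
              elem_id="finetune_save_total_limit")

# demo.launch()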