zetavg committed
Commit 27aa501
Parent: 36b4c98

finetune: support more params

llama_lora/lib/finetune.py CHANGED
@@ -29,10 +29,10 @@ def train(
     # training hyperparams
     micro_batch_size: int = 4,
     gradient_accumulation_steps: int = 32,
-    num_epochs: int = 3,
+    num_train_epochs: int = 3,
     learning_rate: float = 3e-4,
     cutoff_len: int = 256,
-    val_set_size: int = 2000,
+    val_set_size: int = 2000,  # TODO: use percentage
     # lora hyperparams
     lora_r: int = 8,
     lora_alpha: int = 16,
@@ -46,12 +46,16 @@ def train(
     group_by_length: bool = False,  # faster, but produces an odd training loss curve
     # either training checkpoint or final adapter
     resume_from_checkpoint: str = None,
+    save_steps: int = 200,
+    save_total_limit: int = 3,
+    logging_steps: int = 10,
     # logging
     callbacks: List[Any] = []
 ):
     if os.path.exists(output_dir):
         if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
-            raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
+            raise ValueError(
+                f"The output directory already exists and is not empty. ({output_dir})")

     device_map = "auto"
     world_size = int(os.environ.get("WORLD_SIZE", 1))
@@ -186,17 +190,17 @@ def train(
         per_device_train_batch_size=micro_batch_size,
         gradient_accumulation_steps=gradient_accumulation_steps,
         warmup_steps=100,
-        num_train_epochs=num_epochs,
+        num_train_epochs=num_train_epochs,
         learning_rate=learning_rate,
         fp16=True,
-        logging_steps=10,
+        logging_steps=logging_steps,
         optim="adamw_torch",
         evaluation_strategy="steps" if val_set_size > 0 else "no",
         save_strategy="steps",
         eval_steps=200 if val_set_size > 0 else None,
-        save_steps=200,
+        save_steps=save_steps,
         output_dir=output_dir,
-        save_total_limit=3,
+        save_total_limit=save_total_limit,
         load_best_model_at_end=True if val_set_size > 0 else False,
         ddp_find_unused_parameters=False if ddp else None,
         group_by_length=group_by_length,
@@ -213,6 +217,24 @@ def train(
         os.makedirs(output_dir)
     with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
         json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
+    with open(os.path.join(output_dir, "finetune_params.json"), 'w') as finetune_params_json_file:
+        finetune_params = {
+            'micro_batch_size': micro_batch_size,
+            'gradient_accumulation_steps': gradient_accumulation_steps,
+            'num_train_epochs': num_train_epochs,
+            'learning_rate': learning_rate,
+            'cutoff_len': cutoff_len,
+            'lora_r': lora_r,
+            'lora_alpha': lora_alpha,
+            'lora_dropout': lora_dropout,
+            'lora_target_modules': lora_target_modules,
+            'train_on_inputs': train_on_inputs,
+            'group_by_length': group_by_length,
+            'save_steps': save_steps,
+            'save_total_limit': save_total_limit,
+            'logging_steps': logging_steps,
+        }
+        json.dump(finetune_params, finetune_params_json_file, indent=2)

     model.config.use_cache = False

@@ -232,7 +254,8 @@ def train(
     print(f"Model saved to {output_dir}.")

     with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
-        trainer_log_history = "\n".join([json.dumps(line) for line in trainer.state.log_history])
+        trainer_log_history = "\n".join(
+            [json.dumps(line) for line in trainer.state.log_history])
         trainer_log_history_jsonl_file.write(trainer_log_history)

     with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
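
For context, here is a minimal sketch (not part of the commit, assumes transformers and torch are installed) of how the newly exposed save_steps, save_total_limit and logging_steps parameters end up in transformers.TrainingArguments, using the new defaults from the train() signature. The output_dir path is hypothetical.

# Sketch only: mirrors the TrainingArguments construction shown in the diff
# above, with the commit's new default values; the output_dir is hypothetical.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora_models/example",   # hypothetical path
    per_device_train_batch_size=4,        # micro_batch_size
    gradient_accumulation_steps=32,
    warmup_steps=100,
    num_train_epochs=3,                   # renamed from num_epochs
    learning_rate=3e-4,
    logging_steps=10,                     # newly exposed as a train() parameter
    optim="adamw_torch",
    save_strategy="steps",
    save_steps=200,                       # newly exposed as a train() parameter
    save_total_limit=3,                   # newly exposed as a train() parameter
    # fp16=True is also set in the commit, but requires a CUDA device
)

A Trainer built with these arguments exposes them via trainer.args.to_dict(), which is the dictionary the diff above writes to trainer_args.json alongside the new finetune_params.json.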
llama_lora/ui/finetune_ui.py CHANGED
@@ -269,6 +269,9 @@ def do_train(
     lora_dropout,
     lora_target_modules,
     model_name,
+    save_steps,
+    save_total_limit,
+    logging_steps,
     progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
 ):
     try:
@@ -276,7 +279,8 @@ def do_train(
         output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
         if os.path.exists(output_dir):
             if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
-                raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
+                raise ValueError(
+                    f"The output directory already exists and is not empty. ({output_dir})")

         if not should_training_progress_track_tqdm:
             progress(0, desc="Preparing train data...")
@@ -484,6 +488,9 @@ Train data (first 10):
             train_on_inputs,  # train_on_inputs
             False,  # group_by_length
             None,  # resume_from_checkpoint
+            save_steps,  # save_steps
+            save_total_limit,  # save_total_limit
+            logging_steps,  # logging_steps
             training_callbacks  # callbacks
         )

@@ -500,7 +507,8 @@ Train data (first 10):
         return result_message

     except Exception as e:
-        raise gr.Error(f"{e} (To dismiss this error, click the 'Abort' button)")
+        raise gr.Error(
+            f"{e} (To dismiss this error, click the 'Abort' button)")


 def do_abort_training():
@@ -661,6 +669,8 @@ def finetune_ui():
                 )

                 with gr.Row():
+                    # https://huggingface.co/docs/transformers/main/main_classes/trainer
+
                     micro_batch_size_default_value = 1

                     if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
@@ -695,7 +705,7 @@ def finetune_ui():
                     )

                     evaluate_data_percentage = gr.Slider(
-                        minimum=0, maximum=0.5, step=0.001, value=0.03,
+                        minimum=0, maximum=0.5, step=0.001, value=0,
                         label="Evaluation Data Percentage",
                         info="The percentage of data to be used for evaluation. This percentage of data will not be used for training and will be used to assess the performance of the model during the process."
                     )
@@ -726,6 +736,26 @@ def finetune_ui():
                         info="Modules to replace with LoRA."
                     )

+                    with gr.Row():
+                        logging_steps = gr.Number(
+                            label="Logging Steps",
+                            precision=0,
+                            value=10,
+                            elem_id="finetune_logging_steps"
+                        )
+                        save_steps = gr.Number(
+                            label="Steps Per Save",
+                            precision=0,
+                            value=500,
+                            elem_id="finetune_save_steps"
+                        )
+                        save_total_limit = gr.Number(
+                            label="Saved Checkpoints Limit",
+                            precision=0,
+                            value=5,
+                            elem_id="finetune_save_total_limit"
+                        )
+
                 with gr.Column():
                     model_name = gr.Textbox(
                         lines=1, label="LoRA Model Name", value=random_name,
@@ -767,7 +797,10 @@ def finetune_ui():
                     lora_alpha,
                     lora_dropout,
                     lora_target_modules,
-                    model_name
+                    model_name,
+                    save_steps,
+                    save_total_limit,
+                    logging_steps,
                 ]),
                 outputs=train_output
             )
@@ -860,6 +893,28 @@ def finetune_ui():
                     'Press to load a sample dataset of the current selected format into the textbox.',
                 });

+                tippy('#finetune_save_total_limit', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Total amount of checkpoints to preserve. Older checkpoints will be deleted.',
+                });
+                tippy('#finetune_save_steps', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Number of updates steps before two checkpoint saves.',
+                });
+                tippy('#finetune_logging_steps', {
+                    placement: 'bottom',
+                    delay: [500, 0],
+                    animation: 'scale-subtle',
+                    content:
+                        'Number of update steps between two logs.',
+                });
+
                 tippy('#finetune_model_name', {
                     placement: 'bottom',
                     delay: [500, 0],
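
The UI wiring above is spread across several hunks, so here is a stripped-down, self-contained sketch (not from the repository) of the same pattern: three gr.Number components, identified by the commit's elem_ids, passed as extra inputs to a click handler the way finetune_ui.py passes them to do_train(). The fake_train function and all names other than the elem_ids are hypothetical.

# Sketch only: illustrates the new gr.Number inputs and how they are forwarded
# through a Gradio click handler; fake_train stands in for do_train().
import gradio as gr

def fake_train(save_steps, save_total_limit, logging_steps):
    # Echo the values that would be forwarded to the training function.
    return f"save_steps={save_steps}, save_total_limit={save_total_limit}, logging_steps={logging_steps}"

with gr.Blocks() as demo:
    with gr.Row():
        logging_steps = gr.Number(label="Logging Steps", precision=0, value=10,
                                  elem_id="finetune_logging_steps")
        save_steps = gr.Number(label="Steps Per Save", precision=0, value=500,
                               elem_id="finetune_save_steps")
        save_total_limit = gr.Number(label="Saved Checkpoints Limit", precision=0, value=5,
                                     elem_id="finetune_save_total_limit")
    output = gr.Textbox(label="Result")
    train_btn = gr.Button("Train")
    train_btn.click(fn=fake_train,
                    inputs=[save_steps, save_total_limit, logging_steps],
                    outputs=output)

# demo.launch()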
llama_lora/ui/main_page.py CHANGED
@@ -432,6 +432,24 @@ def main_page_custom_css():
         flex: 2;
     }

+    #finetune_save_total_limit,
+    #finetune_save_steps,
+    #finetune_logging_steps {
+        min-width: min(120px,100%) !important;
+        padding-top: 4px;
+    }
+    #finetune_save_total_limit span,
+    #finetune_save_steps span,
+    #finetune_logging_steps span {
+        font-size: 12px;
+        margin-bottom: 5px;
+    }
+    #finetune_save_total_limit input,
+    #finetune_save_steps input,
+    #finetune_logging_steps input {
+        padding: 4px 8px;
+    }
+
     @media screen and (max-width: 392px) {
         #inference_lora_model, #finetune_template {
             border-bottom-left-radius: 0;
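
The CSS above targets the new components by their elem_id values. As a hedged illustration (not the project's actual code), custom CSS in Gradio is attached via gr.Blocks(css=...), which is presumably how the output of main_page_custom_css() reaches these selectors; the snippet below is a minimal stand-in that keeps only the first rule from the commit.

# Sketch only: shows how an elem_id on a component is matched by custom CSS
# passed to gr.Blocks(css=...); names other than the elem_ids and selectors
# from the commit are hypothetical.
import gradio as gr

custom_css = """
#finetune_save_total_limit,
#finetune_save_steps,
#finetune_logging_steps {
    min-width: min(120px,100%) !important;
    padding-top: 4px;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Number(label="Logging Steps", precision=0, value=10,
              elem_id="finetune_logging_steps")
    gr.Number(label="Steps Per Save", precision=0, value=500,
              elem_id="finetune_save_steps")
    gr.Number(label="Saved Checkpoints Limit", precision=0, value=5,
              elem_id="finetune_save_total_limit")

# demo.launch()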