winglian committed
Commit 05fffb5
1 Parent(s): 2df63ef

more logging, wandb fixes

configs/cerebras_1_3B_alpaca.yml CHANGED
@@ -23,7 +23,7 @@ lora_target_modules:
 lora_fan_in_fan_out: false
 wandb_project: pythia-1.4b-lora
 wandb_watch:
-wandb_run_name:
+wandb_run_id:
 wandb_log_model: checkpoint
 output_dir: ./lora-alpaca
 batch_size: 32
configs/llama_65B_alpaca.yml CHANGED
@@ -25,7 +25,7 @@ lora_target_modules:
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
 wandb_project: llama-65b-lora
 wandb_watch:
-wandb_run_name:
+wandb_run_id:
 wandb_log_model: checkpoint
 output_dir: ./lora-llama-alpaca
 batch_size: 128
configs/pythia_1_2B_alpaca.yml CHANGED
@@ -25,7 +25,7 @@ lora_target_modules:
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
 wandb_project: pythia-1.4b-lora
 wandb_watch:
-wandb_run_name:
+wandb_run_id:
 wandb_log_model: checkpoint
 output_dir: ./lora-alpaca
 batch_size: 48
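
The three config diffs above replace wandb_run_name with wandb_run_id. As a minimal sketch of why the id matters (the cfg literal and the run id value below are made up for illustration; the real wiring is the setup_wandb_env_vars change in scripts/finetune.py further down), pinning a fixed run id lets a restarted job attach to the same wandb run instead of spawning a new one:

import os

# Hypothetical config values; in the real script these come from the YAML above.
cfg = {"wandb_project": "pythia-1.4b-lora", "wandb_run_id": "alpaca-run-001"}

if cfg.get("wandb_project"):
    os.environ["WANDB_PROJECT"] = cfg["wandb_project"]
if cfg.get("wandb_run_id"):
    # wandb.init() reads WANDB_RUN_ID and reuses that run id, so a resumed or
    # restarted job logs into the same run rather than creating a fresh one.
    os.environ["WANDB_RUN_ID"] = cfg["wandb_run_id"]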
ds_config.json ADDED
@@ -0,0 +1,56 @@
+{
+    "bf16": {
+        "enabled": "auto"
+    },
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 5,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
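
ds_config.json is a DeepSpeed ZeRO stage-3 configuration with CPU offload for both optimizer state and parameters; most fields are left as "auto" so they can be filled in from the trainer's own arguments. The commit does not show how the file is wired in, so the sketch below is only an assumption based on the standard Hugging Face Trainer + DeepSpeed integration (requires the deepspeed package; the numeric values are placeholders):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora-alpaca",
    per_device_train_batch_size=8,   # fills train_micro_batch_size_per_gpu: "auto"
    gradient_accumulation_steps=4,   # fills gradient_accumulation_steps: "auto"
    learning_rate=3e-4,              # fills optimizer params.lr: "auto"
    bf16=True,                       # fills bf16.enabled: "auto"
    deepspeed="ds_config.json",      # the ZeRO-3 + offload config added above
)
# A Trainer built with these args would then typically be started under the
# deepspeed or accelerate launcher rather than plain `python`.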
scripts/finetune.py CHANGED
@@ -1,3 +1,4 @@
+import logging
 import math
 import os
 import random
@@ -37,6 +38,9 @@ from axolotl.prompt_tokenizers import (
 )
 from axolotl.prompters import AlpacaPrompter, GPTeacherPrompter, ShareGPTPrompter
 
+logger = logging.getLogger(__name__)
+DEFAULT_DATASET_PREPARED_PATH = "data/last_run"
+
 
 def setup_wandb_env_vars(cfg):
     if len(cfg.wandb_project) > 0:
@@ -46,6 +50,8 @@ def setup_wandb_env_vars(cfg):
             os.environ["WANDB_WATCH"] = cfg.wandb_watch
         if cfg.wandb_log_model and len(cfg.wandb_log_model) > 0:
            os.environ["WANDB_LOG_MODEL"] = cfg.wandb_log_model
+        if cfg.wandb_run_id and len(cfg.wandb_run_id) > 0:
+            os.environ["WANDB_RUN_ID"] = cfg.wandb_run_id
 
 
 def load_model(base_model, model_type, tokenizer_type, cfg, adapter="lora"):
@@ -164,8 +170,8 @@ def check_dataset_labels(dataset, tokenizer):
             )
             colored_tokens.append(colored_token)
 
-        print(" ".join(colored_tokens))
-        print("\n\n\n")
+        logger.info(" ".join(colored_tokens))
+        logger.info("\n\n\n")
 
 
 def do_inference(cfg, model, tokenizer):
@@ -247,7 +253,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         ddp_find_unused_parameters=False if cfg.ddp else None,
         group_by_length=cfg.group_by_length,
         report_to="wandb" if cfg.use_wandb else None,
-        run_name=cfg.wandb_run_name if cfg.use_wandb else None,
+        run_name=cfg.wandb_run_id if cfg.use_wandb else None,
         **training_arguments_kwargs,
     )
 
@@ -341,9 +347,9 @@ def train(
         return
 
     if cfg.dataset_prepared_path and any(Path(cfg.dataset_prepared_path).glob("*")):
-        print("Loading prepared dataset from disk...")
-        dataset = load_from_disk(cfg.datasets)
-        print("Prepared dataset loaded from disk...")
+        logger.info("Loading prepared dataset from disk...")
+        dataset = load_from_disk(cfg.dataset_prepared_path)
+        logger.info("Prepared dataset loaded from disk...")
     else:
         datasets = []
         for d in cfg.datasets:
@@ -376,11 +382,12 @@ def train(
             [_ for _ in constant_len_dataset]
         ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
 
-        print("Saving prepared dataset to disk...")
-        if cfg.dataset_prepared_path:
-            dataset.save_to_disk(cfg.dataset_prepared_path)
-        else:
-            dataset.save_to_disk("data/last_run")
+        if cfg.local_rank == 0:
+            logger.info("Saving prepared dataset to disk...")
+            if cfg.dataset_prepared_path:
+                dataset.save_to_disk(cfg.dataset_prepared_path)
+            else:
+                dataset.save_to_disk(DEFAULT_DATASET_PREPARED_PATH)
 
     train_dataset = dataset["train"]
     eval_dataset = dataset["test"]
@@ -396,9 +403,11 @@ def train(
     model.config.use_cache = False
 
     if torch.__version__ >= "2" and sys.platform != "win32":
+        logger.info("Compiling torch model")
         model = torch.compile(model)
 
     # go ahead and presave, so we have the adapter config available to inspect
+    logger.info(f"Pre-saving adapter config to {cfg.output_dir}")
     lora_config.save_pretrained(cfg.output_dir)
 
     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
@@ -407,9 +416,11 @@ def train(
         lambda signal, frame: (model.save_pretrained(cfg.output_dir), exit(0)),
     )
 
+    logger.info("Starting trainer...")
     trainer.train(resume_from_checkpoint=cfg.resume_from_checkpoint)
 
     # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
+    logger.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
     model.save_pretrained(cfg.output_dir)
 
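
scripts/finetune.py now routes its progress messages through logging.getLogger(__name__) instead of print. One caveat: Python's default handler only emits WARNING and above, so the new logger.info(...) calls stay silent unless the root logger is configured somewhere at startup. A minimal sketch of such a setup (the format string is an example, not part of this commit):

import logging

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
)

logger = logging.getLogger(__name__)
logger.info("Compiling torch model")  # now visible on stderr with a timestamp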