feature: better device mapping for large models (#918)

* fix: improved memory handling when model is bigger than existing VRAM
* feature: add lora_on_cpu flag to do LoRA loading on CPU (RAM)

  For big models that take up the entire GPU VRAM, loading the LoRA part will fail unless it is done on CPU only.

* doc: add README
* fix: enable progress bars in do_merge_lora()
* doc: mention gpu_memory_limit and lora_on_cpu in merge part of README
* Update src/axolotl/utils/models.py

  Co-authored-by: Wing Lian <wing.lian@gmail.com>

* fix: remove deletion of removed model_kwargs key
* fix: validate that gpu_memory_limit and max_memory are not both set

---------

Co-authored-by: Karl-Johan Alm <kalle@gmail.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
- README.md +8 -1
- src/axolotl/cli/__init__.py +2 -1
- src/axolotl/utils/config.py +5 -0
- src/axolotl/utils/models.py +37 -4
README.md CHANGED

@@ -550,6 +550,11 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true
 
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
 # A list of one or more datasets to finetune the model with
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -1042,12 +1047,14 @@ The following command will merge your LORA adapater with your base model. You c
 python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
 ```
 
-If you run out of CUDA memory, you can try to merge in system RAM with
+You may need to use the `gpu_memory_limit` and/or `lora_on_cpu` config options to avoid running out of memory. If you still run out of CUDA memory, you can try to merge in system RAM with
 
 ```bash
 CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
 ```
 
+although this will be very slow, and using the config options above are recommended instead.
+
 ## Common Errors 🧰
 
 See also the [FAQ's](./docs/faq.md).
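As a side note on how the new `gpu_memory_limit` option is interpreted: an integer value is treated as gigabytes and turned into a per-device cap in the accelerate-style `max_memory` format, with a roomy CPU entry so overflow layers can spill into system RAM. A minimal sketch of that conversion (the helper name `normalize_gpu_memory_limit` is hypothetical, not part of this PR):

```python
# Minimal sketch of the gpu_memory_limit normalization; names here are illustrative only.
from typing import Dict, Union


def normalize_gpu_memory_limit(
    limit: Union[int, str], num_gpus: int
) -> Dict[Union[int, str], str]:
    """Cap every GPU at `limit` (int -> "NGiB") and leave the CPU entry large."""
    limit_str = f"{limit}GiB" if isinstance(limit, int) else str(limit)
    max_memory: Dict[Union[int, str], str] = {i: limit_str for i in range(num_gpus)}
    max_memory["cpu"] = "256GiB"  # big enough that overflow layers land in system RAM
    return max_memory


print(normalize_gpu_memory_limit(20, num_gpus=2))
# -> {0: '20GiB', 1: '20GiB', 'cpu': '256GiB'}
```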
src/axolotl/cli/__init__.py CHANGED

@@ -73,7 +73,7 @@ def do_merge_lora(
     safe_serialization = cfg.save_safetensors is True
 
     LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
+    model = model.merge_and_unload(progressbar=True)
     model.to(dtype=cfg.torch_dtype)
 
     if cfg.local_rank == 0:
@@ -81,6 +81,7 @@ def do_merge_lora(
         model.save_pretrained(
             str(Path(cfg.output_dir) / "merged"),
             safe_serialization=safe_serialization,
+            progressbar=True,
         )
         tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
 
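For context, `merge_and_unload(progressbar=True)` is the PEFT `PeftModel` method that folds the LoRA deltas back into the base weights and reports per-module progress. A rough standalone sketch of the same merge-and-save flow outside axolotl (paths are placeholders, and this assumes a recent `peft` release that supports the `progressbar` argument):

```python
# Standalone merge sketch; "base-model-path" and "lora-adapter-path" are placeholders.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("base-model-path", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "lora-adapter-path")

# Fold the LoRA deltas into the base weights; progressbar=True shows merge progress.
merged = model.merge_and_unload(progressbar=True)

merged.save_pretrained("merged-out", safe_serialization=True)
AutoTokenizer.from_pretrained("base-model-path").save_pretrained("merged-out")
```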
src/axolotl/utils/config.py CHANGED

@@ -457,6 +457,11 @@ def validate_config(cfg):
             "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
         )
 
+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
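The intent of the new check: `gpu_memory_limit` already derives a `max_memory` map automatically, so letting a config set both would be ambiguous. An illustration of the rule with plain dicts standing in for the parsed YAML (not the project's actual config objects):

```python
# Illustration only: plain dicts standing in for the parsed YAML config.
def check_memory_options(cfg: dict) -> None:
    # Mirrors the new validate_config() rule: the two knobs are mutually exclusive.
    if cfg.get("max_memory") is not None and cfg.get("gpu_memory_limit") is not None:
        raise ValueError(
            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
        )


check_memory_options({"gpu_memory_limit": 20, "max_memory": None})  # passes
# check_memory_options({"gpu_memory_limit": 20, "max_memory": {0: "20GiB"}})  # raises ValueError
```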
src/axolotl/utils/models.py CHANGED

@@ -2,7 +2,7 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from typing import Any, Optional, Tuple  # noqa: F401
 
 import addict
 import bitsandbytes as bnb
@@ -288,8 +288,37 @@ def load_model(
 
     model_kwargs = {}
 
-    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
+    max_memory = cfg.max_memory
+    device_map = cfg.device_map
+
+    if cfg.gpu_memory_limit:
+        gpu_memory_limit = (
+            str(cfg.gpu_memory_limit) + "GiB"
+            if isinstance(cfg.gpu_memory_limit, int)
+            else cfg.gpu_memory_limit
+        )
+
+        max_memory = {}
+        for i in range(torch.cuda.device_count()):
+            max_memory[i] = gpu_memory_limit
+        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
+
+    if max_memory is not None:
+        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
+        from accelerate import infer_auto_device_map, init_empty_weights
+
+        with init_empty_weights():
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
+        model_canvas.tie_weights()
+        device_map = infer_auto_device_map(
+            model_canvas,
+            max_memory=max_memory,
+            dtype=cfg.torch_dtype,
+        )
+        # We can discard max_memory now as we have a device map set up for us
+        max_memory = None
+
+    model_kwargs["device_map"] = device_map
     model_kwargs["torch_dtype"] = cfg.torch_dtype
     # TODO can we put the reference model on it's own gpu? I think we have to move logits around to calculate loss
     # if cfg.rl:
@@ -426,7 +455,6 @@ def load_model(
             model_kwargs["device"] = torch.cuda.current_device()
             del model_kwargs["torch_dtype"]
             del model_kwargs["device_map"]
-            del model_kwargs["max_memory"]
 
             model = MambaLMHeadModel.from_pretrained(
                 base_model,
@@ -683,10 +711,15 @@ def load_lora(model, cfg, inference=False):
 
     if cfg.lora_model_dir:
         LOG.debug("Loading pretained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
            is_trainable=(not inference),
+            **model_kwargs,
         )
     else:
         model = get_peft_model(model, lora_config)
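To make the device-map inference above concrete: `init_empty_weights()` builds a weightless "canvas" of the model so `infer_auto_device_map()` can plan placement from the config alone, and the result is an explicit module-to-device assignment, which is why `max_memory` can be discarded afterwards. A standalone sketch of that flow, assuming a tiny checkpoint such as `sshleifer/tiny-gpt2` is reachable (the checkpoint is only an example, not part of this PR):

```python
# Standalone sketch of the device-map inference flow; the checkpoint is an arbitrary tiny example.
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("sshleifer/tiny-gpt2")

# Instantiate the architecture without allocating real weights, then plan placement.
with init_empty_weights():
    model_canvas = AutoModelForCausalLM.from_config(config)
model_canvas.tie_weights()

# Cap GPU 0 (if present) and give the CPU generous headroom; layers that do not fit
# under the GPU cap are assigned to the CPU by the planner.
max_memory = {0: "1GiB", "cpu": "256GiB"} if torch.cuda.is_available() else {"cpu": "256GiB"}
device_map = infer_auto_device_map(model_canvas, max_memory=max_memory, dtype=torch.float16)
print(device_map)  # e.g. {'': 0} when everything fits, or a per-module GPU/CPU split otherwise
```

The `lora_on_cpu` path in `load_lora` follows the same idea: passing `device_map={"": "cpu"}` to `PeftModel.from_pretrained` pins the adapter weights to system RAM so they do not compete with a base model that already fills the GPU VRAM.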