deepspeed-model-memory-usage

Running

App Files Files Community

muellerzr HF staff committed on Sep 1, 2023

Commit

e44403a

1 Parent(s): be6343c

Big refactor

Browse files

Files changed (8) hide show

Makefile +11 -0
README.md +1 -1
app.py +0 -187
pyproject.toml +16 -0
src/__init__.py +0 -0
src/app.py +74 -0
src/hub_utils.py +62 -0
src/model_utils.py +85 -0

Makefile ADDED Viewed

	@@ -0,0 +1,11 @@

+check_dirs := src
+# this target runs checks on all files
+quality:
+	black --required-version 23 --check $(check_dirs)
+	ruff $(check_dirs)
+# Format source code automatically and check if there are any problems left that need manual fixing
+style:
+	black --required-version 23 $(check_dirs)
+	ruff $(check_dirs) --fix

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: pink
 colorTo: blue
 sdk: gradio
 sdk_version: 3.40.1
-app_file: app.py
 pinned: false
 license: apache-2.0
 ---

 colorTo: blue
 sdk: gradio
 sdk_version: 3.40.1
+app_file: src/app.py
 pinned: false
 license: apache-2.0
 ---

app.py DELETED Viewed

@@ -1,187 +0,0 @@
-import os
-import re
-import webbrowser
-import pandas as pd
-import gradio as gr
-from huggingface_hub import HfApi
-from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
-from accelerate.commands.estimate import create_empty_model, check_has_model
-from accelerate.utils import convert_bytes, calculate_maximum_sizes
-from urllib.parse import urlparse
-# We need to store them as globals because gradio doesn't have a way for us to pass them in to the button
-HAS_DISCUSSION = True
-MODEL_NAME = None
-LIBRARY = None
-USER_TOKEN = None
-TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN", None)
-def translate_llama2(text):
-    "Translates llama-2 to its hf counterpart"
-    if not text.endswith("-hf"):
-        return text + "-hf"
-    return text
-def check_for_discussion(model_name:str):
-    "Checks if an automated discussion has been opened on the model by `model-sizer-bot`"
-    global TOKEN
-    api = HfApi(token=TOKEN)
-    discussions = list(api.get_repo_discussions(model_name))
-    return any(discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot" for discussion in discussions)
-def report_results():
-    "Reports the results of a memory calculation to the model's discussion page, and opens a new tab to it afterwards"
-    global MODEL_NAME, LIBRARY, TOKEN, USER_TOKEN
-    api = HfApi(token=TOKEN)
-    results, data = calculate_memory(MODEL_NAME, LIBRARY, ["fp32", "fp16", "int8", "int4"], access_token=USER_TOKEN, raw=True)
-    minimum = data[0]
-    USER_TOKEN = None
-    post = f"""# Model Memory Requirements\n
-You will need about {minimum[1]} VRAM to load this model for inference, and {minimum[3]} VRAM to train it using Adam.
-These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
-The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
-When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
-When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
-## Results:
-{results}
-"""
-    discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
-    webbrowser.open_new_tab(discussion.url)
-def extract_from_url(name:str):
-    "Checks if `name` is a URL, and if so converts it to a model name"
-    is_url = False
-    try:
-        result = urlparse(name)
-        is_url = all([result.scheme, result.netloc])
-    except:
-        is_url = False
-    # Pass through if not a URL
-    if not is_url:
-        return name
-    else:
-        path = result.path
-        return path[1:]
-def calculate_memory(model_name:str, library:str, options:list, access_token:str, raw=False):
-    "Calculates the memory usage for a model"
-    if "meta-llama" in model_name:
-        model_name = translate_llama2(model_name)
-    if library == "auto":
-        library = None
-    model_name = extract_from_url(model_name)
-    try:
-        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
-    except GatedRepoError:
-        raise gr.Error(f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. ")
-    except RepositoryNotFoundError:
-        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
-    except ValueError as e:
-        raise gr.Error(f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)")
-    except (RuntimeError, OSError) as e:
-        library = check_has_model(e)
-        if library != "unknown":
-            raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")
-        raise gr.Error(f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`")
-    except ImportError:
-        # hacky way to check if it works with `trust_remote_code=False`
-        model = create_empty_model(model_name, library_name=library, trust_remote_code=False, access_token=access_token)
-    except Exception as e:
-        raise gr.Error(f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`")
-    total_size, largest_layer = calculate_maximum_sizes(model)
-    data = []
-    title = f"Memory Usage for '{model_name}'"
-    for dtype in options:
-        dtype_total_size = total_size
-        dtype_largest_layer = largest_layer[0]
-        if dtype in ("fp16",  "bf16", "float16/bfloat16"):
-            dtype_total_size /= 2
-            dtype_largest_layer /= 2
-        elif dtype == "int8":
-            dtype_total_size /= 4
-            dtype_largest_layer /= 4
-        elif dtype == "int4":
-            dtype_total_size /= 8
-            dtype_largest_layer /= 8
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
-        dtype_largest_layer = convert_bytes(dtype_largest_layer)
-        data.append({
-            "dtype": dtype,
-            "Largest Layer or Residual Group": dtype_largest_layer,
-            "Total Size": dtype_total_size,
-            "Training using Adam": dtype_training_size
-        })
-    global HAS_DISCUSSION, MODEL_NAME, LIBRARY
-    HAS_DISCUSSION = check_for_discussion(model_name)
-    MODEL_NAME = model_name
-    LIBRARY = library
-    if raw:
-        return pd.DataFrame(data).to_markdown(index=False), data
-    results = [
-        f'## {title}',
-        gr.update(visible=True, value=pd.DataFrame(data)),
-        gr.update(visible=not HAS_DISCUSSION)
-    ]
-    return results
-with gr.Blocks() as demo:
-    with gr.Column():
-        gr.Markdown(
-            """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
-    This tool will help you calculate how much vRAM is needed to train and perform big model inference
-    on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
-    is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
-    These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
-    When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
-    More tests will be performed in the future to get a more accurate benchmark for each model.
-    Currently this tool supports all models hosted that use `transformers` and `timm`.
-    To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
-    select which framework it originates from ("auto" will try and detect it from the model metadata), and
-    what precisions you want to use."""
-        )
-        out_text = gr.Markdown()
-        out = gr.DataFrame(
-            headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
-            interactive=False,
-            visible=False,
-        )
-        with gr.Row():
-            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
-        with gr.Row():
-            library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
-            options = gr.CheckboxGroup(
-                ["float32", "float16/bfloat16", "int8", "int4"],
-                value="float32",
-                label="Model Precision",
-            )
-            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
-        with gr.Row():
-            btn = gr.Button("Calculate Memory Usage")
-            post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
-    USER_TOKEN = access_token
-    btn.click(
-        calculate_memory, inputs=[inp, library, options, access_token], outputs=[out_text, out, post_to_hub],
-    )
-    post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
-demo.launch()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,16 @@

+[tool.black]
+line-length = 119
+target-version = ['py37']
+[tool.ruff]
+# Never enforce `E501` (line length violations).
+ignore = ["E501", "E741", "W605"]
+select = ["E", "F", "I", "W"]
+line-length = 119
+# Ignore import violations in all `__init__.py` files.
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["E402", "F401", "F403", "F811"]
+[tool.ruff.isort]
+lines-after-imports = 2

src/__init__.py ADDED Viewed

File without changes

src/app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import gradio as gr
+import pandas as pd
+from .hub_utils import check_for_discussion, report_results
+from .model_utils import calculate_memory, get_model
+# The loaded model is kept in a module-level global because gradio doesn't have a way for us to pass it in to the button
+MODEL = None
+def get_results(model_name: str, library: str, options: list, access_token: str):
+    """Load a model, compute its memory table and build the three UI updates.
+
+    Args:
+        model_name: Hub model name or full Hub URL, as typed in the textbox.
+        library: Library selected in the UI; forwarded to `get_model`.
+        options: Precision labels to compute a row for; forwarded to `calculate_memory`.
+        access_token: Optional user token; forwarded to `get_model`.
+
+    Returns:
+        A list of three items, in the order expected by the `outputs` of the
+        "Calculate Memory Usage" button: the markdown title, an update that
+        shows the results table, and an update that shows the "report" button
+        only when no automated discussion exists yet.
+    """
+    # Imported here so this function is self-contained; `.hub_utils` is already a dependency of this module
+    from .hub_utils import extract_from_url
+    global MODEL
+    MODEL = get_model(model_name, library, access_token)
+    # `get_model` resolves URLs internally, but the discussions lookup needs the bare repo id,
+    # so resolve the URL here as well (a plain model name passes through unchanged)
+    has_discussion = check_for_discussion(extract_from_url(model_name))
+    title = f"## Memory usage for '{model_name}'"
+    data = calculate_memory(MODEL, options)
+    return [title, gr.update(visible=True, value=pd.DataFrame(data)), gr.update(visible=not has_discussion)]
+# Declare the page layout, wire the two buttons to their callbacks, then start the app
+with gr.Blocks() as demo:
+    with gr.Column():
+        gr.Markdown(
+            """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
+    This tool will help you calculate how much vRAM is needed to train and perform big model inference
+    on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
+    is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
+    These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
+    When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
+    More tests will be performed in the future to get a more accurate benchmark for each model.
+    Currently this tool supports all models hosted that use `transformers` and `timm`.
+    To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
+    select which framework it originates from ("auto" will try and detect it from the model metadata), and
+    what precisions you want to use."""
+        )
+        # Outputs of `get_results`: the markdown title and the results table (hidden until a result is returned)
+        out_text = gr.Markdown()
+        # NOTE(review): the header here reads "Largest Layer" while the rows built by `calculate_memory`
+        # use the key "Largest Layer or Residual Group" - confirm which label is intended
+        out = gr.DataFrame(
+            headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
+            interactive=False,
+            visible=False,
+        )
+        # Inputs passed to `get_results`: model name/URL, library, precisions and optional token
+        with gr.Row():
+            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
+        with gr.Row():
+            library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
+            options = gr.CheckboxGroup(
+                ["float32", "float16/bfloat16", "int8", "int4"],
+                value="float32",
+                label="Model Precision",
+            )
+            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
+        with gr.Row():
+            btn = gr.Button("Calculate Memory Usage")
+            # Starts hidden; `get_results` makes it visible when no automated discussion exists yet
+            post_to_hub = gr.Button(
+                value="Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False
+            )
+    # Main action: compute the table and update the title, the table and the report button
+    btn.click(
+        get_results,
+        inputs=[inp, library, options, access_token],
+        outputs=[out_text, out, post_to_hub],
+    )
+    # Post the report, then hide the button again
+    post_to_hub.click(report_results, inputs=[inp, library, access_token]).then(
+        lambda: gr.Button.update(visible=False), outputs=post_to_hub
+    )
+demo.launch()

src/hub_utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Utilities related to searching and posting on the Hub
+import os
+import webbrowser
+from urllib.parse import urlparse
+import pandas as pd
+from huggingface_hub import HfApi
+from .model_utils import calculate_memory, get_model
+def extract_from_url(name: str):
+    """Convert a model URL into a model name; pass anything else through unchanged.
+
+    Args:
+        name: Either a model name (e.g. `bert-base-cased`) or a full URL whose
+            path is the model name (e.g. `https://huggingface.co/bert-base-cased`).
+
+    Returns:
+        The URL path without its leading or trailing slashes when `name` has
+        both a scheme and a network location, otherwise `name` itself.
+    """
+    try:
+        parsed = urlparse(name)
+    except (ValueError, TypeError, AttributeError):
+        # Unparseable input is treated as a plain model name rather than an error
+        return name
+    # A URL needs both a scheme and a host; `org/model` has neither and passes through
+    if not (parsed.scheme and parsed.netloc):
+        return name
+    # Strip the leading "/" and any trailing "/" so `.../gpt2/` yields `gpt2`, not `gpt2/`
+    return parsed.path.strip("/")
+def check_for_discussion(model_name: str):
+    """Tell whether `model-sizer-bot` already opened the automated memory discussion on `model_name`.
+
+    Returns `True` when a discussion titled "[AUTOMATED] Model Memory Requirements"
+    authored by `model-sizer-bot` exists on the repo, `False` otherwise.
+    """
+    # The token comes from the environment and may be absent
+    hub_token = os.environ.get("HUGGINGFACE_API_LOGIN", None)
+    client = HfApi(token=hub_token)
+    for thread in list(client.get_repo_discussions(model_name)):
+        if thread.title == "[AUTOMATED] Model Memory Requirements" and thread.author == "model-sizer-bot":
+            return True
+    return False
+def report_results(model_name, library, access_token):
+    """Post the memory report to the model's discussion page, then open the new discussion in a browser tab.
+
+    Args:
+        model_name: Hub model name or full Hub URL.
+        library: Library selection forwarded to `get_model`.
+        access_token: Optional user token forwarded to `get_model`.
+
+    The discussion is created with the token found in the
+    `HUGGINGFACE_API_LOGIN` environment variable, not with `access_token`.
+    """
+    model = get_model(model_name, library, access_token)
+    # These labels must be keys of `DTYPE_MODIFIER`; the short forms "fp32"/"fp16" are not and raise KeyError
+    data = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
+    # Each row is a dict keyed by column title, so read the float32 row by name rather than by position
+    minimum = data[0]
+    inference_size = minimum["Largest Layer or Residual Group"]
+    training_size = minimum["Training using Adam"]
+    data = pd.DataFrame(data).to_markdown(index=False)
+    post = f"""# Model Memory Requirements\n
+You will need about {inference_size} VRAM to load this model for inference, and {training_size} VRAM to train it using Adam.
+These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
+The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
+When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
+When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
+## Results:
+{data}
+"""
+    api = HfApi(token=os.environ.get("HUGGINGFACE_API_LOGIN", None))
+    # The textbox may hold a full URL; the Hub API needs the bare repo id
+    repo_id = extract_from_url(model_name)
+    discussion = api.create_discussion(repo_id, "[AUTOMATED] Model Memory Requirements", description=post)
+    # NOTE(review): this opens the tab on the machine running the app - confirm that is the intended browser
+    webbrowser.open_new_tab(discussion.url)

src/model_utils.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Utilities related to loading in and working with models/specific models
+import gradio as gr
+import torch
+from accelerate.commands.estimate import check_has_model, create_empty_model
+from accelerate.utils import calculate_maximum_sizes, convert_bytes
+from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+from .hub_utils import extract_from_url
+# Divisor applied by `calculate_memory` to the sizes reported by `calculate_maximum_sizes`, per precision label
+DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
+def translate_llama2(text):
+    """Return `text` with a `-hf` suffix, appending it only when it is not already there."""
+    suffix = "-hf"
+    return text if text.endswith(suffix) else text + suffix
+def get_model(model_name: str, library: str, access_token: str):
+    """Finds and grabs model from the Hub, and initializes on `meta`.
+
+    Args:
+        model_name: Model name or URL. Names containing `meta-llama` get a `-hf`
+            suffix first; URLs are then reduced to their path.
+        library: Library to load with; `"auto"` is turned into `None` before the call.
+        access_token: Token forwarded to `create_empty_model`.
+
+    Returns:
+        The object returned by `create_empty_model`.
+
+    Raises:
+        gr.Error: For gated repos, missing repos, any `ValueError`, `RuntimeError`/`OSError`
+            and any other exception except `ImportError`, each with a user-facing message.
+    """
+    # The llama check runs on the raw input, i.e. before a URL is reduced to a model name
+    if "meta-llama" in model_name:
+        model_name = translate_llama2(model_name)
+    if library == "auto":
+        library = None
+    # NOTE(review): `extract_from_url` is imported from `.hub_utils`, which itself imports this module
+    # at top level - confirm the package imports cleanly despite the cycle
+    model_name = extract_from_url(model_name)
+    try:
+        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
+    except GatedRepoError:
+        raise gr.Error(
+            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. "
+        )
+    except RepositoryNotFoundError:
+        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
+    # Every ValueError is reported as missing library metadata
+    except ValueError:
+        raise gr.Error(
+            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)"
+        )
+    except (RuntimeError, OSError) as e:
+        # `check_has_model` inspects the error; "unknown" means no specific library could be blamed
+        library = check_has_model(e)
+        if library != "unknown":
+            raise gr.Error(
+                f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo."
+            )
+        raise gr.Error(
+            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
+        )
+    except ImportError:
+        # hacky way to check if it works with `trust_remote_code=False`
+        # (errors raised by this retry are not caught here and propagate as-is)
+        model = create_empty_model(
+            model_name, library_name=library, trust_remote_code=False, access_token=access_token
+        )
+    except Exception as e:
+        raise gr.Error(
+            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
+        )
+    return model
+def calculate_memory(model: torch.nn.Module, options: list):
+    """Calculates the memory usage for a model init on `meta` device.
+
+    Args:
+        model: The model to measure; passed to `calculate_maximum_sizes`.
+        options: Precision labels to report. Accepts the keys of `DTYPE_MODIFIER`
+            as well as the short spellings `fp32`, `fp16` and `bf16`.
+
+    Returns:
+        One dict per entry of `options`, in order, with the keys `dtype`,
+        `Largest Layer or Residual Group`, `Total Size` and `Training using Adam`.
+        Sizes are formatted with `convert_bytes`.
+
+    Raises:
+        ValueError: If a label is neither a `DTYPE_MODIFIER` key nor a known short spelling.
+    """
+    # Short spellings accepted before the `DTYPE_MODIFIER` table existed, mapped onto its keys
+    aliases = {"fp32": "float32", "fp16": "float16/bfloat16", "bf16": "float16/bfloat16"}
+    total_size, largest_layer = calculate_maximum_sizes(model)
+    data = []
+    for dtype in options:
+        canonical = aliases.get(dtype, dtype)
+        if canonical not in DTYPE_MODIFIER:
+            # Fail with the offending label and the valid choices instead of a bare KeyError
+            raise ValueError(f"Unknown precision `{dtype}`, expected one of {sorted(DTYPE_MODIFIER)}")
+        modifier = DTYPE_MODIFIER[canonical]
+        dtype_total_size = total_size / modifier
+        dtype_largest_layer = largest_layer[0] / modifier
+        # Training estimate is four times the model size at this precision
+        dtype_training_size = convert_bytes(dtype_total_size * 4)
+        dtype_total_size = convert_bytes(dtype_total_size)
+        dtype_largest_layer = convert_bytes(dtype_largest_layer)
+        data.append(
+            {
+                "dtype": dtype,
+                "Largest Layer or Residual Group": dtype_largest_layer,
+                "Total Size": dtype_total_size,
+                "Training using Adam": dtype_training_size,
+            }
+        )
+    return data