Add support for EXL2 4-bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes)
I'm sorry for mashing two unrelated issues together in one PR.
1. I changed the 8-bit cache checkbox to a drop-down that defaults to a 16-bit cache, with 8-bit and 4-bit as the other options. The calculation now scales by an integer bit width instead of branching on a conditional (see the first sketch after this list).
2. (Concerns lines **168-170** only.) Your calculator over-estimated memory use because it used metric gigabytes, equal to 1e9 bytes, but VRAM is measured in JEDEC Standard 100B.01 gigabytes, equal to 2^30 bytes. An RTX 4090 has 24 GB = 25.77e9 B of memory. This 7.4% difference may seem insignificant, but it matters when you are figuring out how big a model you can squeeze into your GPU. For instance, 22.5 GB equals 24.16e9 B: the first number says the model fits in 24 GB of VRAM, while the metric reading of 24.16 implies it won't (see the second sketch after this list).
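
To make the first change concrete, here is the new `kvCache` scaling run on its own. The function body mirrors the patch below; the model config is a hypothetical 7B-style example of my own, not something from this repo.

```js
// Hypothetical 7B-style config, for illustration only (not from this PR).
const example_config = {
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
}

// Same body as the patched kvCache: one multiply instead of a branch.
function kvCache(context=8192, model_config, cache_bit=16) {
    const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    const n_embd_gqa = model_config["hidden_size"] / n_gqa
    const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    const size = 2 * n_elements          // key + value elements
    return size * (cache_bit / 8)        // bytes per element scale linearly
}

console.log(kvCache(8192, example_config, 16) / 2**30) // 4 GB (JEDEC)
console.log(kvCache(8192, example_config, 8)  / 2**30) // 2 GB
console.log(kvCache(8192, example_config, 4)  / 2**30) // 1 GB
```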
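And a quick check of the unit arithmetic from the second point, runnable in any browser console:

```js
const METRIC_GB = 1e9    // what the calculator divided by before
const JEDEC_GB  = 2**30  // what VRAM is actually measured in

console.log(JEDEC_GB / METRIC_GB)   // 1.0737... -> the ~7.4% gap
console.log(24 * JEDEC_GB)          // 25769803776 -> an RTX 4090's 24 GB is 25.77e9 B

const required = 24.16e9            // example total size, in bytes
console.log(required / METRIC_GB)   // 24.16 -> reads as "won't fit in 24 GB"
console.log(required / JEDEC_GB)    // 22.50 -> actually fits
```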
index.html +17 −17

```diff
@@ -128,19 +128,16 @@
     return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
 }
 
-function kvCache(context=8192, model_config, fp8=false) {
+function kvCache(context=8192, model_config, cache_bit=16) {
     const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
     const n_embd_gqa = model_config["hidden_size"] / n_gqa
     const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
     const size = 2 * n_elements
-    if (fp8) {
-        return size
-    }
-    return size * 2
+    return size * (cache_bit / 8)
 }
 
-function contextSize(context=8192, model_config, bsz=512, fp8=false) {
-    return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, fp8) + computeBuffer(context, model_config, bsz)).toFixed(2))
+function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
+    return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
 }
 
 function modelSize(model_config, bpw=4.5) {
@@ -152,22 +149,22 @@
     const model_config = await modelConfig(document.getElementById("modelsearch").value)
     const context = parseInt(document.getElementById("contextsize").value)
     let bsz = 512
-    let fp8 = false
+    let cache_bit = 16
     let bpw = 0
     if (format === "gguf") {
         bsz = parseInt(document.getElementById("batchsize").value)
         bpw = gguf_quants[document.getElementById("quantsize").innerText]
 
     } else if (format == "exl2") {
-        fp8 = document.getElementById("fp8cache").checked
+        cache_bit = Number.parseInt(document.getElementById("kvCache").value)
         bpw = Number.parseFloat(document.getElementById("bpw").value)
     }
 
     const model_size = modelSize(model_config, bpw)
-    const context_size = contextSize(context, model_config, bsz, fp8)
-    const total_size = ((model_size + context_size) / 1e9)
-    document.getElementById("resultmodel").innerText = (model_size / 1e9).toFixed(2)
-    document.getElementById("resultcontext").innerText = (context_size / 1e9).toFixed(2)
+    const context_size = contextSize(context, model_config, bsz, cache_bit)
+    const total_size = ((model_size + context_size) / 2**30)
+    document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
+    document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
     const result_total_el = document.getElementById("resulttotal");
     result_total_el.innerText = total_size.toFixed(2)
 
@@ -401,13 +398,16 @@
     class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
 >
     <label
-        for="fp8cache"
+        for="kvCache"
         class="inline-block bg-white text-xs font-medium text-gray-900"
     >
-        8 bit cache
+        KV Cache
     </label>
-    <input id="fp8cache" name="fp8cache"
-        type="checkbox">
+    <select id="kvCache" name="kvCache">
+        <option value="16">16 bit</option>
+        <option value="8">8 bit</option>
+        <option value="4">4 bit</option>
+    </select>
 </div>
 </div>
 </div>
```