Add support for EXL2 4-bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes)
I'm sorry for mashing two unrelated issues together in one PR.
1. I changed the 8-bit cache checkbox to a drop-down that defaults to a 16-bit cache, with 8-bit and 4-bit as the other options. The calculation now scales by an integer bit width instead of branching on a conditional (see the first sketch after this list).
2. (Concerns lines **168-170** only.) Your calculator over-estimated memory use because it used metric gigabytes, equal to 1e9 bytes, but VRAM is measured in JEDEC Standard 100B.01 gigabytes, equal to 2^30 bytes. An RTX 4090 has 24 GB = 25.77e9 B of memory. This 7.4% difference may seem insignificant, but it matters when you are figuring out how big a model you can squeeze into your GPU. For instance, 22.5 GB equals 24.16e9 B: the first number says the model fits in 24 GB of VRAM, while the metric reading of 24.16 implies it won't (see the second sketch after this list).
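
To make the first change concrete, here is the new `kvCache` scaling run on its own. The function body mirrors the patch below; the model config is a hypothetical 7B-style example of my own, not something from this repo.

```js
// Hypothetical 7B-style config, for illustration only (not from this PR).
const example_config = {
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
}

// Same body as the patched kvCache: one multiply instead of a branch.
function kvCache(context=8192, model_config, cache_bit=16) {
    const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    const n_embd_gqa = model_config["hidden_size"] / n_gqa
    const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    const size = 2 * n_elements          // key + value elements
    return size * (cache_bit / 8)        // bytes per element scale linearly
}

console.log(kvCache(8192, example_config, 16) / 2**30) // 4 GB (JEDEC)
console.log(kvCache(8192, example_config, 8)  / 2**30) // 2 GB
console.log(kvCache(8192, example_config, 4)  / 2**30) // 1 GB
```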
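And a quick check of the unit arithmetic from the second point, runnable in any browser console:

```js
const METRIC_GB = 1e9    // what the calculator divided by before
const JEDEC_GB  = 2**30  // what VRAM is actually measured in

console.log(JEDEC_GB / METRIC_GB)   // 1.0737... -> the ~7.4% gap
console.log(24 * JEDEC_GB)          // 25769803776 -> an RTX 4090's 24 GB is 25.77e9 B

const required = 24.16e9            // example total size, in bytes
console.log(required / METRIC_GB)   // 24.16 -> reads as "won't fit in 24 GB"
console.log(required / JEDEC_GB)    // 22.50 -> actually fits
```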
index.html +17 −17

```diff
@@ -128,19 +128,16 @@
     return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
 }
 
-function kvCache(context=8192, model_config, fp8=false) {
+function kvCache(context=8192, model_config, cache_bit=16) {
     const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
     const n_embd_gqa = model_config["hidden_size"] / n_gqa
     const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
     const size = 2 * n_elements
-    if (fp8) {
-        return size
-    }
-    return size * 2
+    return size * (cache_bit / 8)
 }
 
-function contextSize(context=8192, model_config, bsz=512, fp8=false) {
-    return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, fp8) + computeBuffer(context, model_config, bsz)).toFixed(2))
+function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
+    return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
 }
 
 function modelSize(model_config, bpw=4.5) {
@@ -152,22 +149,22 @@
     const model_config = await modelConfig(document.getElementById("modelsearch").value)
     const context = parseInt(document.getElementById("contextsize").value)
     let bsz = 512
-    let fp8 = false
+    let cache_bit = 16
     let bpw = 0
     if (format === "gguf") {
         bsz = parseInt(document.getElementById("batchsize").value)
         bpw = gguf_quants[document.getElementById("quantsize").innerText]
 
     } else if (format == "exl2") {
-        fp8 = document.getElementById("fp8cache").checked
+        cache_bit = Number.parseInt(document.getElementById("kvCache").value)
         bpw = Number.parseFloat(document.getElementById("bpw").value)
     }
 
     const model_size = modelSize(model_config, bpw)
-    const context_size = contextSize(context, model_config, bsz, fp8)
-    const total_size = ((model_size + context_size) / 1e9)
-    document.getElementById("resultmodel").innerText = (model_size / 1e9).toFixed(2)
-    document.getElementById("resultcontext").innerText = (context_size / 1e9).toFixed(2)
+    const context_size = contextSize(context, model_config, bsz, cache_bit)
+    const total_size = ((model_size + context_size) / 2**30)
+    document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
+    document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
     const result_total_el = document.getElementById("resulttotal");
     result_total_el.innerText = total_size.toFixed(2)
 
@@ -401,13 +398,16 @@
     class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
 >
     <label
-        for="fp8cache"
+        for="kvCache"
         class="inline-block bg-white text-xs font-medium text-gray-900"
     >
-        8 bit cache
+        KV Cache
     </label>
-    <input id="fp8cache" name="fp8cache"
-        type="checkbox">
+    <select id="kvCache" name="kvCache">
+        <option value="16">16 bit</option>
+        <option value="8">8 bit</option>
+        <option value="4">4 bit</option>
+    </select>
 </div>
 </div>
 </div>
```