Model Card

High-quality quantization of GLM-4.5-Air, produced without an importance matrix (imatrix).
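To fetch the quantized shards locally, a minimal sketch using huggingface-cli (the target directory is an assumption; adjust it to your layout):

# Sketch: download both GGUF shards from the Hugging Face repo.
huggingface-cli download anikifoss/GLM-4.5-Air-HQ4_K \
    --local-dir /mnt/data/Models/anikifoss/GLM-4.5-Air-HQ4_K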

Run

ik_llama.cpp

See this detailed guide on how to set up ik_llama and how to make custom quants.
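If you have not built ik_llama.cpp yet, the steps below are a minimal sketch of the usual llama.cpp-style CMake flow; the CUDA flag is an assumption and depends on your hardware:

# Minimal build sketch for ik_llama.cpp.
git clone https://github.com/ikawrakow/ik_llama.cpp
cd ik_llama.cpp
cmake -B build -DGGML_CUDA=ON   # drop -DGGML_CUDA=ON for a CPU-only build
cmake --build build --config Release -j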

./build/bin/llama-server \
    --alias anikifoss/GLM-4.5-Air-HQ4_K \
    --model /mnt/data/Models/anikifoss/GLM-4.5-Air-HQ4_K/GLM-4.5-Air-HQ4_K-00001-of-00002.gguf \
    --no-mmap -rtr \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 116000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    -fmoe \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
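Once the server is running, it exposes an OpenAI-compatible HTTP API on the host and port above; a quick smoke test (the prompt and token limit are arbitrary):

curl http://127.0.0.1:8090/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "anikifoss/GLM-4.5-Air-HQ4_K", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 128}'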

llama.cpp

./build/bin/llama-server \
    --alias anikifoss/GLM-4.5-Air-HQ4_K \
    --model /mnt/data/Models/anikifoss/GLM-4.5-Air-HQ4_K/GLM-4.5-Air-HQ4_K-00001-of-00002.gguf \
    --no-mmap \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 \
    --ctx-size 116000 \
    -ctk f16 -ctv f16 \
    -fa \
    --override-tensor exps=CPU \
    -ngl 99 \
    --jinja \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
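The llama.cpp server exposes the same OpenAI-compatible endpoints, so the smoke test above applies here too; to simply confirm the model is being served under the expected alias:

curl http://127.0.0.1:8090/v1/models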

Quantization Recipe

Quantized with ik_llama, but the output should work with any GGUF-compatible inference framework.

#!/usr/bin/env bash

custom="
blk\.0\.ffn_down\.weight=q8_0
blk\.0\.ffn_gate\.weight=q8_0
blk\.0\.ffn_up\.weight=q8_0
blk\.46\.nextn\.eh_proj\.weight=bf16
blk\.46\.nextn\.embed_tokens\.weight=q8_0
blk\.46\.nextn\.enorm\.weight=f32
blk\.46\.nextn\.hnorm\.weight=f32
blk\.46\.nextn\.shared_head_head\.weight=q8_0
blk\.46\.nextn\.shared_head_norm\.weight=f32

blk\.[0-9]\.attn_k\.bias=f32
blk\.[0-9]\.attn_k\.weight=q8_0
blk\.[0-9]\.attn_norm\.weight=f32
blk\.[0-9]\.attn_output\.weight=q8_0
blk\.[0-9]\.attn_q\.bias=f32
blk\.[0-9]\.attn_q\.weight=q8_0
blk\.[0-9]\.attn_v\.bias=f32
blk\.[0-9]\.attn_v\.weight=q8_0
blk\.[0-9]\.post_attention_norm\.weight=f32
blk\.[1-3][0-9]\.attn_k\.bias=f32
blk\.[1-3][0-9]\.attn_k\.weight=q8_0
blk\.[1-3][0-9]\.attn_norm\.weight=f32
blk\.[1-3][0-9]\.attn_output\.weight=q8_0
blk\.[1-3][0-9]\.attn_q\.bias=f32
blk\.[1-3][0-9]\.attn_q\.weight=q8_0
blk\.[1-3][0-9]\.attn_v\.bias=f32
blk\.[1-3][0-9]\.attn_v\.weight=q8_0
blk\.[1-3][0-9]\.post_attention_norm\.weight=f32
blk\.4[0-6]\.attn_k\.bias=f32
blk\.4[0-6]\.attn_k\.weight=q8_0
blk\.4[0-6]\.attn_norm\.weight=f32
blk\.4[0-6]\.attn_output\.weight=q8_0
blk\.4[0-6]\.attn_q\.bias=f32
blk\.4[0-6]\.attn_q\.weight=q8_0
blk\.4[0-6]\.attn_v\.bias=f32
blk\.4[0-6]\.attn_v\.weight=q8_0
blk\.4[0-6]\.post_attention_norm\.weight=f32

blk\.[1-9]\.exp_probs_b\.bias=f32
blk\.[1-9]\.ffn_down_exps\.weight=q8_0
blk\.[1-9]\.ffn_down_shexp\.weight=bf16
blk\.[1-9]\.ffn_gate_exps\.weight=q4_K
blk\.[1-9]\.ffn_gate_inp\.weight=f32
blk\.[1-9]\.ffn_gate_shexp\.weight=bf16
blk\.[1-9]\.ffn_up_exps\.weight=q4_K
blk\.[1-9]\.ffn_up_shexp\.weight=bf16
blk\.[1-3][0-9]\.exp_probs_b\.bias=f32
blk\.[1-3][0-9]\.ffn_down_exps\.weight=q8_0
blk\.[1-3][0-9]\.ffn_down_shexp\.weight=bf16
blk\.[1-3][0-9]\.ffn_gate_exps\.weight=q4_K
blk\.[1-3][0-9]\.ffn_gate_inp\.weight=f32
blk\.[1-3][0-9]\.ffn_gate_shexp\.weight=bf16
blk\.[1-3][0-9]\.ffn_up_exps\.weight=q4_K
blk\.[1-3][0-9]\.ffn_up_shexp\.weight=bf16
blk\.4[0-6]\.exp_probs_b\.bias=f32
blk\.4[0-6]\.ffn_down_exps\.weight=q8_0
blk\.4[0-6]\.ffn_down_shexp\.weight=bf16
blk\.4[0-6]\.ffn_gate_exps\.weight=q4_K
blk\.4[0-6]\.ffn_gate_inp\.weight=f32
blk\.4[0-6]\.ffn_gate_shexp\.weight=bf16
blk\.4[0-6]\.ffn_up_exps\.weight=q4_K
blk\.4[0-6]\.ffn_up_shexp\.weight=bf16

output\.weight=bf16
output_norm\.weight=f32
token_embd\.weight=bf16
"

# Drop comment lines and join the remaining rules into one
# comma-separated list, the format --custom-q expects.
custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

echo "Running with: -custom-q $custom"

TARGET_MODEL="GLM-4.5-Air-HQ4_K"
mkdir -p ~/Env/models/anikifoss/$TARGET_MODEL
# Quantize the BF16 source; Q4_K is the fallback type for tensors not
# matched by a --custom-q rule, and 32 is the thread count.
./build/bin/llama-quantize \
    --custom-q "$custom" \
    /mnt/data/Models/zai-org/GLM-4.5-Air-GGUF/GLM-4.5-Air-128x9.4B-BF16-00001-of-00005.gguf \
    ~/Env/models/anikifoss/$TARGET_MODEL/$TARGET_MODEL.gguf \
    Q4_K \
    32
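To verify that the per-tensor types landed as intended, one option is the gguf-dump utility from the gguf Python package (the package and its output format are assumptions based on the llama.cpp tooling):

# Sketch: inspect tensor types in the finished quant.
pip install gguf
gguf-dump ~/Env/models/anikifoss/GLM-4.5-Air-HQ4_K/GLM-4.5-Air-HQ4_K.gguf | grep ffn_down_exps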