---
base_model:
  - Qwen/Qwen3-VL-30B-A3B-Instruct
---

How to quantize:

git clone https://github.com/vllm-project/llm-compressor.git

cd llm-compressor

pip install -e .

pip install transformers==4.57.0

export TOKENIZERS_PARALLELISM=false

cd ..
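
Optionally, confirm the install before quantizing (a quick sanity-check sketch):

import transformers
import llmcompressor  # should import cleanly after `pip install -e .`

print("transformers:", transformers.__version__)  # expected: 4.57.0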

Quantization script:

Save the following code as quant.py, then run:

python3 quant.py

import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID, 
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
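# replace_modules_for_calibration swaps the sparse MoE blocks for
# calibration-friendly versions so calibration activations reach every expert.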
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
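    # One preprocessed sample per "batch": turn every field into a tensor, and
    # cast pixel_values (when present) to bfloat16 with the leading singleton
    # dimension removed so it matches the model's expected input.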
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing
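# Each mapping pairs a "smooth" layer, whose output activations are observed
# during calibration, with the "balance" layers that consume those activations;
# AWQ folds the computed scales into the balance layers' weights so the
# rescaling preserves the layer outputs. Embeddings, norms, the MoE router
# gates, the vision tower, and lm_head stay unquantized via `ignore`.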
recipe = AWQModifier(
    ignore=[
        're:.*embed_tokens', 
        're:.*input_layernorm$', 
        're:.*mlp[.]gate$', 
        're:.*post_attention_layernorm$', 
        're:.*norm$', 
        're:model[.]visual.*',
        're:visual.*',
        'lm_head'
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ['re:.*o_proj$']
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ['re:.*down_proj$']
        }
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 4,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {}
            },
            "input_activations": None,
            "output_activations": None,
            "format": None
        }
    }
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W4A16-mse-seq"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
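
When the script finishes, SAVE_DIR contains the model in compressed-tensors format. A minimal smoke-test sketch, assuming a recent vLLM build with compressed-tensors and Qwen3-VL support (text-only prompt shown; adjust the path if you saved the checkpoint elsewhere):

from vllm import LLM, SamplingParams

# Directory written by quant.py (its SAVE_DIR); change to your local path.
MODEL_PATH = "Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-mse-seq"

llm = LLM(model=MODEL_PATH, max_model_len=8192)
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["Hello my name is"], params)
print(outputs[0].outputs[0].text)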