---
license: llama3.3
---

The original [Llama 3.3 70B Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model, quantized to 4-bit with AutoAWQ. Follow the instructions [here](https://docs.vllm.ai/en/latest/quantization/auto_awq.html).
```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'meta-llama/Llama-3.3-70B-Instruct'
quant_path = 'Llama-3.3-70B-Instruct-AWQ-4bit'

# 4-bit weights, group size 128, zero-point quantization, GEMM kernels
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path, low_cpu_mem_usage=True, use_cache=False
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```
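
Once saved, the quantized weights can be served with vLLM, per the docs linked above. A minimal sketch, assuming vLLM is installed and using the local `quant_path` directory from the script above (even at 4-bit, the 70B weights alone take roughly 35 GB of GPU memory, so a large GPU or tensor parallelism is still required):

```python
from vllm import LLM, SamplingParams

# quantization="awq" selects vLLM's AWQ kernels for the 4-bit weights
llm = LLM(model="Llama-3.3-70B-Instruct-AWQ-4bit", quantization="awq")

sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain AWQ quantization in one sentence."], sampling_params)
print(outputs[0].outputs[0].text)
```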