cicdatopea committed: Update README.md

README.md CHANGED
@@ -29,7 +29,7 @@ intel-extension-for-transformers: faster repacking, slower inference,higher accu

 intel-extension-for-pytorch: much slower repacking, faster inference, lower accuracy

-
+~~~python
 from auto_round import AutoRoundConfig ##must import for autoround format
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
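Note: the full CPU inference example that these imports belong to sits between this hunk and the next and is not shown in the diff. For orientation, a minimal sketch of how an AutoRound-format INT4 checkpoint is typically loaded and run on CPU might look like the following; the model id and generation settings here are placeholders, not values taken from the README.

```python
from auto_round import AutoRoundConfig  # must be imported so the AutoRound format is recognized
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "<quantized-deepseek-v3-int4-repo>"  # placeholder; the actual repo id is not shown in this diff

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # on CPU the INT4 compute dtype is BF16 or FP32 (see the note further down)
    device_map="cpu",
    trust_remote_code=True,
)

prompt = "Please give a brief introduction of DeepSeek company."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```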
@@ -161,7 +161,7 @@ prompt = "There is a girl who likes adventure,"
 prompt = "Please give a brief introduction of DeepSeek company."
 ##INT4:
 """DeepSeek Artificial Intelligence Co., Ltd. (referred to as "DeepSeek" or "深度求索") , founded in 2023, is a Chinese company dedicated to making AGI a reality"""
-
+~~~

 ### INT4 Inference on CUDA (not tested; it may require 8x80GB GPUs)

@@ -217,7 +217,7 @@ we have no enough resource to evaluate the model

 We discovered that the inputs and outputs of certain layers in this model are very large and can even exceed the FP16 range when tested with a few prompts. It is recommended to exclude these layers from quantization, particularly 'down_proj' in layer 60, and to run them in BF16 precision instead. However, we have not applied this in this INT4 model because, on CPU, the compute dtype for INT4 is BF16 or FP32.

-
+~~~python
 model.layers.60.mlp.experts.150.down_proj tensor(1144.) tensor(2122.9451)
 model.layers.60.mlp.experts.231.down_proj tensor(25856.) tensor(12827.9980)
 model.layers.60.mlp.shared_experts.down_proj tensor(1880.) tensor(3156.7344)
@@ -227,17 +227,14 @@ model.layers.59.mlp.experts.138.down_proj tensor(1568.) tensor(190.8769)
 model.layers.60.mlp.experts.81.down_proj tensor(7360.) tensor(10024.4531)
 model.layers.60.mlp.experts.92.down_proj tensor(116224.) tensor(55192.4180)

-
-
-
+~~~

 **1 add metadata to the bf16 model** https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16

-
+~~~python
 import safetensors
 from safetensors.torch import save_file

-
 for i in range(1, 164):
     idx_str = "0" * (5-len(str(i))) + str(i)
     safetensors_path = f"model-{idx_str}-of-000163.safetensors"
@@ -247,7 +244,7 @@ for i in range(1, 164):
 for key in f.keys():
     tensors[key] = f.get_tensor(key)
 save_file(tensors, safetensors_path, metadata={'format': 'pt'})
-
+~~~



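Note: the README does not show how the per-layer magnitudes quoted above were collected. A sketch of one way to gather such numbers, using standard PyTorch forward hooks to record the maximum absolute input and output values of every `down_proj` module, is given below; the helper name and the selection criterion are illustrative, not part of the original card.

```python
import torch

def attach_magnitude_hooks(model, stats):
    """Record max |input| and max |output| per down_proj module into `stats`."""
    handles = []
    for name, module in model.named_modules():
        if name.endswith("down_proj"):
            def hook(mod, inputs, output, name=name):
                in_max = inputs[0].detach().abs().max().item()
                out_max = output.detach().abs().max().item()
                prev_in, prev_out = stats.get(name, (0.0, 0.0))
                stats[name] = (max(prev_in, in_max), max(prev_out, out_max))
            handles.append(module.register_forward_hook(hook))
    return handles

# Usage sketch: run a few prompts, then flag layers that exceed the FP16 range.
# stats = {}
# handles = attach_magnitude_hooks(model, stats)
# model.generate(**inputs, max_new_tokens=16)
# for h in handles:
#     h.remove()
# fp16_max = torch.finfo(torch.float16).max  # 65504
# overflow = {k: v for k, v in stats.items() if max(v) > fp16_max}
```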
@@ -259,9 +256,9 @@ https://github.com/intel/auto-round/blob/deepseekv3/modeling_deepseek.py

 **3 tuning**

-
+```bash
 git clone https://github.com/intel/auto-round.git && cd auto-round && git checkout deepseekv3
-
+```

 ```bash
 python3 -m auto_round --model "/models/DeepSeek-V3-bf16/" --group_size 128 --format "auto_gptq" --iters 200 --devices 0,1,2,3,4 --nsamples 512 --batch_size 8 --seqlen 512 --low_gpu_mem_usage --output_dir "tmp_autoround" --disable_eval 2>&1 | tee -a seekv3.txt
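Note: the command above drives the tuning through the auto-round CLI. A rough Python-API sketch of the same run is shown below; it assumes the `AutoRound` class and the listed keyword arguments behave like their CLI counterparts, which may differ slightly between auto-round versions, so treat it as an illustration rather than the card's exact recipe.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_path = "/models/DeepSeek-V3-bf16/"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Mirror the CLI flags: 4-bit, group size 128, 200 tuning iterations, 512 calibration samples.
autoround = AutoRound(
    model,
    tokenizer,
    bits=4,
    group_size=128,
    iters=200,
    nsamples=512,
    seqlen=512,
    batch_size=8,
    low_gpu_mem_usage=True,
)
autoround.quantize()
autoround.save_quantized("tmp_autoround", format="auto_gptq")
```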
@@ -289,4 +286,4 @@ The license on this model does not constitute legal advice. We are not responsib

 @article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }

-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
+[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)