lrl-modelcloud committed
Commit 854b0c5 • 1 parent: dd26e3d
Update README.md
README.md CHANGED
@@ -23,7 +23,8 @@ import os
 os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'
 
 from transformers import AutoTokenizer
-from
+from gptqmodel import BACKEND, GPTQModel
+from vllm import SamplingParams
 
 model_name = "ModelCloud/gemma-2-27b-it-gptq-4bit"
 
@@ -31,13 +32,20 @@ prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-
-
-
+model = GPTQModel.from_quantized(
+    model_name,
+    backend=BACKEND.VLLM,
+)
+
 sampling_params = SamplingParams(temperature=0.95, max_tokens=128)
 
 inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
-
+
+outputs = model.generate(
+    prompts=inputs,
+    sampling_params=sampling_params,
+)
 
 print(outputs[0].outputs[0].text)
+
 ```