lrl-modelcloud committed
Commit 854b0c5
1 Parent(s): dd26e3d

Update README.md

Files changed (1)
  1. README.md +13 -5
README.md CHANGED
@@ -23,7 +23,8 @@ import os
 os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'
 
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from gptqmodel import BACKEND, GPTQModel
+from vllm import SamplingParams
 
 model_name = "ModelCloud/gemma-2-27b-it-gptq-4bit"
 
@@ -31,13 +32,20 @@ prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-llm = LLM(
-    model=model_name,
-)
+model = GPTQModel.from_quantized(
+    model_name,
+    backend=BACKEND.VLLM,
+)
+
 sampling_params = SamplingParams(temperature=0.95, max_tokens=128)
 
 inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
-outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)
+
+outputs = model.generate(
+    prompts=inputs,
+    sampling_params=sampling_params,
+)
 
 print(outputs[0].outputs[0].text)
+
 ```
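
For reference, the new side of the two hunks above assembles into roughly the following README usage snippet. The `prompt` line is only partially visible in the second hunk header, so its content is left here as a truncated placeholder; everything else is taken directly from the diff.

```python
import os

# Select FlashInfer as vLLM's attention backend, as set in the README.
os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'

from transformers import AutoTokenizer
from gptqmodel import BACKEND, GPTQModel
from vllm import SamplingParams

model_name = "ModelCloud/gemma-2-27b-it-gptq-4bit"

# Truncated in the hunk header; the full chat message lives in the README itself.
prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the ..."}]

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the GPTQ checkpoint through GPTQModel, delegating inference to vLLM.
model = GPTQModel.from_quantized(
    model_name,
    backend=BACKEND.VLLM,
)

sampling_params = SamplingParams(temperature=0.95, max_tokens=128)

inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)

outputs = model.generate(
    prompts=inputs,
    sampling_params=sampling_params,
)

print(outputs[0].outputs[0].text)
```

The effect of the commit is to replace the direct `vllm.LLM(...)` construction and `llm.generate(...)` call with `GPTQModel.from_quantized(..., backend=BACKEND.VLLM)` and `model.generate(...)`, while `SamplingParams` is still imported from vLLM.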