alvarobartt HF staff committed on
Commit 55aee82
1 Parent(s): 618f4ed

Update README.md

Files changed (1)
  1. README.md +16 -22
README.md CHANGED
@@ -49,13 +49,18 @@ import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
  
  model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.float16,
+     low_cpu_mem_usage=True,
+     device_map="auto",
+ )
+ 
  prompt = [
      {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
      {"role": "user", "content": "What's Deep Learning?"},
  ]
- 
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- 
  inputs = tokenizer.apply_chat_template(
      prompt,
      tokenize=True,
@@ -64,13 +69,6 @@ inputs = tokenizer.apply_chat_template(
      return_dict=True,
  ).to("cuda")
  
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     low_cpu_mem_usage=True,
-     device_map="auto",
- )
- 
  outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
  print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
  ```
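For reference, this is roughly how the `transformers` snippet reads once the two hunks above are applied: the tokenizer and the quantized model are loaded right after `model_id`, before the chat template is built. The two `apply_chat_template` arguments that fall between the hunks are not visible in this diff, so `add_generation_prompt=True` and `return_tensors="pt"` below are assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"

# Load the tokenizer and the GPTQ-quantized model up front (the reordering this commit introduces).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

prompt = [
    {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
    {"role": "user", "content": "What's Deep Learning?"},
]
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,  # assumed, not shown in this diff
    return_tensors="pt",         # assumed, not shown in this diff
    return_dict=True,
).to("cuda")

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

The AutoGPTQ variant in the next two hunks ends up with the same shape, with `AutoGPTQForCausalLM.from_pretrained` doing the loading instead.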
@@ -92,13 +90,18 @@ from auto_gptq import AutoGPTQForCausalLM
  from transformers import AutoModelForCausalLM, AutoTokenizer
  
  model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoGPTQForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.float16,
+     low_cpu_mem_usage=True,
+     device_map="auto",
+ )
+ 
  prompt = [
      {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
      {"role": "user", "content": "What's Deep Learning?"},
  ]
- 
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- 
  inputs = tokenizer.apply_chat_template(
      prompt,
      tokenize=True,
@@ -107,13 +110,6 @@ inputs = tokenizer.apply_chat_template(
      return_dict=True,
  ).to("cuda")
  
- model = AutoGPTQForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     low_cpu_mem_usage=True,
-     device_map="auto",
- )
- 
  outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
  print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
  ```
@@ -135,7 +131,6 @@ Then you just need to run the TGI v2.2.0 (or higher) Docker container as follows
  docker run --gpus all --shm-size 1g -ti -p 8080:80 \
      -v hf_cache:/data \
      -e MODEL_ID=hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 \
-     -e NUM_SHARD=4 \
      -e QUANTIZE=gptq \
      -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
      -e MAX_INPUT_LENGTH=4000 \
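Dropping `NUM_SHARD=4` leaves the container unsharded, presumably because the INT4-quantized 8B checkpoint fits on a single GPU. Once the container is up, one way to sanity-check it is through TGI's OpenAI-compatible Messages API on the mapped port; a minimal sketch with the `openai` Python client, where the `base_url`, the placeholder `api_key`, and the `model="tgi"` name are assumptions about this local deployment:

```python
from openai import OpenAI

# The docker command above maps host port 8080 to the container's port 80.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="-")

response = client.chat.completions.create(
    model="tgi",  # TGI serves a single model, so the name is not used for routing
    messages=[
        {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
        {"role": "user", "content": "What's Deep Learning?"},
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```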
@@ -214,7 +209,6 @@ docker run --runtime nvidia --gpus all --ipc=host -p 8000:8000 \
      vllm/vllm-openai:latest \
      --model hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 \
      --quantization gptq_marlin \
-     --tensor-parallel-size 4 \
      --max-model-len 4096
  ```
  
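Similarly, removing `--tensor-parallel-size 4` falls back to vLLM's single-GPU default. The same client sketch as above works against this server once it is running; only the port and the model name (which vLLM does check against the served model) change, again assuming the local setup shown in the command:

```python
from openai import OpenAI

# The vLLM server above listens on host port 8000 and expects the served model id.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="-")
response = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4",
    messages=[{"role": "user", "content": "What's Deep Learning?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```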