JRosenkranz committed · Commit d95c54e · Parent(s): 0dae012

Update README.md

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-license:
+license: apache-2.0
 ---
 
 ## Installation from source
@@ -33,7 +33,7 @@ Training is light-weight and can be completed in only a few days depending on ba
 
 _Note: For all samples, your environment must have access to cuda_
 
-### Production
+### Use in IBM Production TGIS
 
 *To try this out running in a production-like environment, please use the pre-built docker image:*
 
@@ -43,7 +43,7 @@ _Note: For all samples, your environment must have access to cuda_
 HF_HUB_CACHE=/hf_hub_cache
 chmod a+w $HF_HUB_CACHE
 HF_HUB_TOKEN="your huggingface hub token"
-TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.
+TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ddc56ee
 
 docker pull $TGIS_IMAGE
 
@@ -54,7 +54,7 @@ docker run --rm \
     -e TRANSFORMERS_CACHE=/models \
     $TGIS_IMAGE \
     text-generation-server download-weights \
-
+    ibm-granite/granite-7b-lab \
     --token $HF_HUB_TOKEN
 
 # optionally download the speculator model if the weights do not already exist
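For reference, the weights fetched in this hunk can also be pre-fetched without the TGIS entrypoint. A minimal sketch using `huggingface_hub` (an alternative under stated assumptions, not the documented path; the repo IDs and environment variable names are taken from the surrounding blocks):

```python
# Hypothetical alternative to `text-generation-server download-weights`:
# pre-fetch the model and speculator snapshots straight into the hub cache.
import os

from huggingface_hub import snapshot_download

cache = os.environ.get("HF_HUB_CACHE", "/hf_hub_cache")
token = os.environ.get("HF_HUB_TOKEN")

for repo_id in ("ibm-granite/granite-7b-lab", "ibm-granite/granite-7b-lab-accelerator"):
    # Downloads (or reuses) a full snapshot of the repo and returns its local path.
    path = snapshot_download(repo_id, cache_dir=cache, token=token)
    print(f"{repo_id} -> {path}")
```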
@@ -74,7 +74,7 @@ docker run -d --rm --gpus all \
     -v $HF_HUB_CACHE:/models \
     -e HF_HUB_CACHE=/models \
     -e TRANSFORMERS_CACHE=/models \
-    -e MODEL_NAME=
+    -e MODEL_NAME=ibm-granite/granite-7b-lab \
     -e SPECULATOR_NAME=ibm-granite/granite-7b-lab-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
@@ -101,6 +101,27 @@ python sample_client.py
 
 _Note: first prompt may be slower as there is a slight warmup time_
 
+### Use in Huggingface TGI
+
+#### start the server
+
+```bash
+model=ibm-granite/granite-7b-lab-accelerator
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
+```
+
+_note: for tensor parallel, add --num-shard_
+
+#### make a request
+
+```bash
+curl 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```
+
 ### Minimal Sample
 
 *To try this out with the fms-native compiled model, please execute the following:*
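The curl call added above can equally be driven from Python. A minimal sketch of a streaming client (it assumes the TGI container from the preceding block is listening on 127.0.0.1:8080 and that `requests` is installed):

```python
# Stream tokens from TGI's /generate_stream endpoint, which emits
# server-sent events: one "data: {...}" JSON line per generated token.
import json

import requests

payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 20},
}

with requests.post(
    "http://127.0.0.1:8080/generate_stream",
    json=payload,
    stream=True,
    timeout=60,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data:"):
            continue  # skip keep-alives and blank separator lines
        event = json.loads(line[len(b"data:"):])
        print(event["token"]["text"], end="", flush=True)
print()
```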
@@ -118,7 +139,7 @@ pip install transformers==4.35.0 sentencepiece numpy
 ##### batch_size=1 (compile + cudagraphs)
 
 ```bash
-MODEL_PATH=/path/to/
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
@@ -135,7 +156,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=1 (compile)
 
 ```bash
-MODEL_PATH=/path/to/
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
@@ -151,7 +172,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=4 (compile)
 
 ```bash
-MODEL_PATH=/path/to/
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
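For readers unfamiliar with what `paged_speculative_inference.py` is exercising across these batch sizes, the core loop of speculative decoding can be sketched in a few lines. This is a toy illustration with hypothetical greedy next-token callables, not the fms-extras implementation:

```python
# Toy speculative-decoding step: a cheap speculator drafts k tokens, the base
# model verifies them, and the longest agreeing prefix (plus the base model's
# first correction) is kept. All callables here are illustrative stand-ins.
from typing import Callable, List

def speculative_step(
    prefix: List[int],
    speculator: Callable[[List[int]], int],  # cheap greedy next-token guess
    base_model: Callable[[List[int]], int],  # expensive greedy next token
    k: int = 4,
) -> List[int]:
    # Speculator drafts k tokens autoregressively.
    draft: List[int] = []
    for _ in range(k):
        draft.append(speculator(prefix + draft))
    # Verify the draft. (For clarity this calls the base model per position;
    # real implementations verify all k positions in a single batched pass.)
    accepted: List[int] = []
    for tok in draft:
        expected = base_model(prefix + accepted)
        accepted.append(expected)  # the base model's token is always kept
        if tok != expected:
            break  # first disagreement ends the accepted run
    return prefix + accepted

# Demo with toy "models": both predict the next integer in the sequence.
spec = lambda seq: seq[-1] + 1
base = lambda seq: seq[-1] + 1
print(speculative_step([1, 2, 3], spec, base))  # -> [1, 2, 3, 4, 5, 6, 7]
```

When the speculator agrees with the base model, each verified draft yields several tokens for roughly the cost of one base-model pass; the batch_size variants above measure how well that amortization holds up under load.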