JRosenkranz committed
Commit d95c54e
1 Parent(s): 0dae012

Update README.md

Files changed (1):
  1. README.md +29 -8
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-license: llama2
+license: apache-2.0
 ---
 
 ## Installation from source
@@ -33,7 +33,7 @@ Training is light-weight and can be completed in only a few days depending on ba
 
 _Note: For all samples, your environment must have access to cuda_
 
-### Production Server Sample
+### Use in IBM Production TGIS
 
 *To try this out running in a production-like environment, please use the pre-built docker image:*
 
@@ -43,7 +43,7 @@ _Note: For all samples, your environment must have access to cuda_
 HF_HUB_CACHE=/hf_hub_cache
 chmod a+w $HF_HUB_CACHE
 HF_HUB_TOKEN="your huggingface hub token"
-TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ee927a4
+TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ddc56ee
 
 docker pull $TGIS_IMAGE
 
@@ -54,7 +54,7 @@ docker run --rm \
     -e TRANSFORMERS_CACHE=/models \
     $TGIS_IMAGE \
     text-generation-server download-weights \
-    instructlab/granite-7b-lab \
+    ibm-granite/granite-7b-lab \
     --token $HF_HUB_TOKEN
 
 # optionally download the speculator model if the weights do not already exist
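The `download-weights` step above goes through the TGIS CLI; if you only need to pre-populate the shared cache, the same two repos can also be fetched with `huggingface_hub` directly. A minimal sketch, assuming the cache path and token from the snippet above (`snapshot_download` is the standard hub API):

```python
# Sketch: pre-fetch the base model and speculator into the shared HF cache
# with huggingface_hub instead of the text-generation-server CLI.
from huggingface_hub import snapshot_download

for repo_id in ("ibm-granite/granite-7b-lab", "ibm-granite/granite-7b-lab-accelerator"):
    path = snapshot_download(
        repo_id,
        cache_dir="/hf_hub_cache",           # matches HF_HUB_CACHE above
        token="your huggingface hub token",  # matches HF_HUB_TOKEN above
    )
    print(f"{repo_id} cached at {path}")
```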
@@ -74,7 +74,7 @@ docker run -d --rm --gpus all \
     -v $HF_HUB_CACHE:/models \
     -e HF_HUB_CACHE=/models \
     -e TRANSFORMERS_CACHE=/models \
-    -e MODEL_NAME=instructlab/granite-7b-lab \
+    -e MODEL_NAME=ibm-granite/granite-7b-lab \
     -e SPECULATOR_NAME=ibm-granite/granite-7b-lab-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
@@ -101,6 +101,27 @@ python sample_client.py
 
 _Note: first prompt may be slower as there is a slight warmup time_
 
+### Use in Huggingface TGI
+
+#### start the server
+
+```bash
+model=ibm-granite/granite-7b-lab-accelerator
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
+```
+
+_note: for tensor parallel, add --num-shard_
+
+#### make a request
+
+```bash
+curl 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```
+
 ### Minimal Sample
 
 *To try this out with the fms-native compiled model, please execute the following:*
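The added TGI section only shows a streaming `curl` call. For a programmatic client, here is a minimal sketch against TGI's non-streaming `/generate` route (a standard TGI endpoint; host, port, and payload shape follow the `curl` example in the hunk above):

```python
# Minimal Python client for the TGI server started above.
# Assumes TGI is listening on 127.0.0.1:8080, as in the docker run command.
import requests

def generate(prompt: str, max_new_tokens: int = 20) -> str:
    """Send a non-streaming request to TGI's /generate endpoint."""
    resp = requests.post(
        "http://127.0.0.1:8080/generate",
        json={"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens}},
        timeout=60,
    )
    resp.raise_for_status()
    # Non-streaming responses have the shape {"generated_text": "..."}
    return resp.json()["generated_text"]

if __name__ == "__main__":
    print(generate("What is Deep Learning?"))
```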
@@ -118,7 +139,7 @@ pip install transformers==4.35.0 sentencepiece numpy
 ##### batch_size=1 (compile + cudagraphs)
 
 ```bash
-MODEL_PATH=/path/to/instructlab/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
@@ -135,7 +156,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=1 (compile)
 
 ```bash
-MODEL_PATH=/path/to/instructlab/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
@@ -151,7 +172,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=4 (compile)
 
 ```bash
-MODEL_PATH=/path/to/instructlab/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
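For intuition about what the speculator in these runs is doing: speculative decoding lets the small accelerator model propose a few tokens cheaply, and the base model then verifies the whole proposal in a single forward pass, keeping the prefix it agrees with. A toy greedy sketch of that accept/verify loop (illustrative only; the hypothetical `draft_next`/`verify` callables stand in for the two models, and this is not the fms-extras implementation, which uses paged attention and batched verification):

```python
# Toy greedy speculative decoding step, for intuition only.
from typing import Callable, List

def speculative_step(
    draft_next: Callable[[List[int]], int],    # speculator: next token for a sequence
    verify: Callable[[List[int]], List[int]],  # base model: greedy choice after every prefix
    tokens: List[int],
    n_spec: int = 4,
) -> List[int]:
    # 1) The cheap speculator proposes n_spec tokens autoregressively.
    proposal = list(tokens)
    for _ in range(n_spec):
        proposal.append(draft_next(proposal))

    # 2) One base-model pass scores all prefixes of the proposal at once;
    #    choices[i] is the base model's greedy token after proposal[:i+1].
    choices = verify(proposal)

    # 3) Accept proposed tokens while the base model agrees; on the first
    #    disagreement, take the base model's token instead and stop.
    accepted = list(tokens)
    for i in range(len(tokens), len(proposal)):
        base_tok = choices[i - 1]      # base model's pick for position i
        accepted.append(base_tok)      # always safe: it's what the base model wants
        if proposal[i] != base_tok:
            break                      # speculator diverged; drop the rest
    else:
        accepted.append(choices[-1])   # all accepted: one bonus token for free
    return accepted
```

Each call thus emits between 1 and n_spec + 1 tokens per base-model forward pass, which is where the speedup comes from.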
 