ibm-fms
/

llama-13b-accelerator

Inference Endpoints

Model card Files and versions Community

JRosenkranz committed on Apr 17

Commit

08facc0

•

1 Parent(s): 9fca77e

Update README.md

Files changed (1) hide show

README.md +4 -4

README.md CHANGED Viewed

@@ -37,7 +37,7 @@ docker run -d --rm --gpus all \
     -p 8033:8033 \
     -v /path/to/all/models:/models \
     -e MODEL_NAME=/models/model_weights/llama/13B-F \
-    -e SPECULATOR_NAME=/models/speculator_weights/llama/llama-13b-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
     -e DTYPE_STR=float16 \
@@ -85,7 +85,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
-    --speculator_path=/path/to/speculator_weights/llama/13B-F \
     --speculator_source=hf \
     --compile \
     --compile_mode=reduce-overhead
@@ -99,7 +99,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
-    --speculator_path=/path/to/speculator_weights/llama/13B-F \
     --speculator_source=hf \
     --compile \
 ```
@@ -112,7 +112,7 @@ python fms-extras/scripts/paged_speculative_inference.py \
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
-    --speculator_path=/path/to/speculator_weights/llama/13B-F \
     --speculator_source=hf \
     --batch_input \
     --compile \

     -p 8033:8033 \
     -v /path/to/all/models:/models \
     -e MODEL_NAME=/models/model_weights/llama/13B-F \
+    -e SPECULATOR_NAME=ibm-fms/llama-13b-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
     -e DTYPE_STR=float16 \
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
+    --speculator_path=ibm-fms/llama-13b-accelerator \
     --speculator_source=hf \
     --compile \
     --compile_mode=reduce-overhead
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
+    --speculator_path=ibm-fms/llama-13b-accelerator \
     --speculator_source=hf \
     --compile \
 ```
     --model_path=/path/to/model_weights/llama/13B-F \
     --model_source=hf \
     --tokenizer=/path/to/llama/13B-F \
+    --speculator_path=ibm-fms/llama-13b-accelerator \
     --speculator_source=hf \
     --batch_input \
     --compile \