Update README.md
Browse files
README.md
CHANGED
@@ -49,6 +49,39 @@ Only the weights and activations of the linear operators within transformers blo
49 |   This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
50 |
51 |   ```python
52 |   vllm serve neuralmagic/Llama-3.2-90B-Vision-Instruct-FP8-dynamic --enforce-eager --max-num-seqs 16 --tensor-parallel-size 4
53 |   ```
54 |
49 |   This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/latest/) backend, as shown in the example below.
50 |
51 |   ```python
52 | + from vllm import LLM, SamplingParams
53 | + from vllm.assets.image import ImageAsset
54 | +
55 | + # Initialize the LLM
56 | + model_name = "neuralmagic/Llama-3.2-90B-Vision-Instruct-FP8-dynamic"
57 | + llm = LLM(model=model_name, max_num_seqs=1, enforce_eager=True, tensor_parallel_size=4)
58 | +
59 | + # Load the image
60 | + image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
61 | +
62 | + # Create the prompt
63 | + question = "If I had to write a haiku for this one, it would be: "
64 | + prompt = f"<|image|><|begin_of_text|>{question}"
65 | +
66 | + # Set up sampling parameters
67 | + sampling_params = SamplingParams(temperature=0.2, max_tokens=30)
68 | +
69 | + # Generate the response
70 | + inputs = {
71 | +     "prompt": prompt,
72 | +     "multi_modal_data": {
73 | +         "image": image
74 | +     },
75 | + }
76 | + outputs = llm.generate(inputs, sampling_params=sampling_params)
77 | +
78 | + # Print the generated text
79 | + print(outputs[0].outputs[0].text)
80 | + ```
81 | +
82 | + vLLM also supports OpenAI-compatible serving. See the [documentation](https://docs.vllm.ai/en/latest/) for more details.
83 | +
84 | + ```
85 |   vllm serve neuralmagic/Llama-3.2-90B-Vision-Instruct-FP8-dynamic --enforce-eager --max-num-seqs 16 --tensor-parallel-size 4
86 |   ```
87 |