Add transformers.js tags and example code
#7
by
Xenova
HF staff
- opened
README.md
CHANGED
@@ -6,11 +6,13 @@ license: apache-2.0
|
|
6 |
tags:
|
7 |
- vision
|
8 |
- image-text-to-text
|
|
|
9 |
datasets:
|
10 |
- lmms-lab/LLaVA-OneVision-Data
|
11 |
pipeline_tag: image-text-to-text
|
12 |
inference: false
|
13 |
arxiv: 2408.03326
|
|
|
14 |
---
|
15 |
# LLaVA-Onevision Model Card
|
16 |
|
@@ -156,6 +158,86 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
|
156 |
).to(0)
|
157 |
```
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
# Citation
|
160 |
```
|
161 |
@misc{li2024llavaonevisioneasyvisualtask,
|
|
|
6 |
tags:
|
7 |
- vision
|
8 |
- image-text-to-text
|
9 |
+
- transformers.js
|
10 |
datasets:
|
11 |
- lmms-lab/LLaVA-OneVision-Data
|
12 |
pipeline_tag: image-text-to-text
|
13 |
inference: false
|
14 |
arxiv: 2408.03326
|
15 |
+
library_name: transformers
|
16 |
---
|
17 |
# LLaVA-Onevision Model Card
|
18 |
|
|
|
158 |
).to(0)
|
159 |
```
|
160 |
|
161 |
+
|
162 |
+
### Usage with Transformers.js
|
163 |
+
|
164 |
+
If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
|
165 |
+
```bash
|
166 |
+
npm i @huggingface/transformers
|
167 |
+
```
|
168 |
+
|
169 |
+
**Example:** Multi-round conversations with past key-value (PKV) caching
|
170 |
+
```js
|
171 |
+
import { AutoProcessor, AutoTokenizer, LlavaOnevisionForConditionalGeneration, RawImage } from '@huggingface/transformers';
|
172 |
+
|
173 |
+
// Load tokenizer, processor and model
|
174 |
+
const model_id = 'llava-hf/llava-onevision-qwen2-0.5b-ov-hf';
|
175 |
+
|
176 |
+
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
|
177 |
+
const processor = await AutoProcessor.from_pretrained(model_id);
|
178 |
+
const model = await LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, {
|
179 |
+
dtype: {
|
180 |
+
embed_tokens: 'fp16', // or 'fp32' or 'q8'
|
181 |
+
vision_encoder: 'fp16', // or 'fp32' or 'q8'
|
182 |
+
decoder_model_merged: 'q4', // or 'q8'
|
183 |
+
},
|
184 |
+
// device: 'webgpu',
|
185 |
+
});
|
186 |
+
|
187 |
+
// Prepare text inputs
|
188 |
+
const prompt = 'What does the text say?';
|
189 |
+
const messages = [
|
190 |
+
{ role: 'system', content: 'Answer the question.' },
|
191 |
+
{ role: 'user', content: `<image>\n${prompt}` }
|
192 |
+
]
|
193 |
+
const text = tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true });
|
194 |
+
const text_inputs = tokenizer(text);
|
195 |
+
|
196 |
+
// Prepare vision inputs
|
197 |
+
const url = 'https://huggingface.co/qnguyen3/nanoLLaVA/resolve/main/example_1.png';
|
198 |
+
const image = await RawImage.fromURL(url);
|
199 |
+
const vision_inputs = await processor(image);
|
200 |
+
|
201 |
+
// Generate response
|
202 |
+
const { past_key_values, sequences } = await model.generate({
|
203 |
+
...text_inputs,
|
204 |
+
...vision_inputs,
|
205 |
+
do_sample: false,
|
206 |
+
max_new_tokens: 64,
|
207 |
+
return_dict_in_generate: true,
|
208 |
+
});
|
209 |
+
|
210 |
+
// Decode output
|
211 |
+
const answer = tokenizer.decode(
|
212 |
+
sequences.slice(0, [text_inputs.input_ids.dims[1], null]),
|
213 |
+
{ skip_special_tokens: true },
|
214 |
+
);
|
215 |
+
console.log(answer);
|
216 |
+
// The text says "small but mighty" in a playful font.
|
217 |
+
|
218 |
+
const new_messages = [
|
219 |
+
...messages,
|
220 |
+
{ role: 'assistant', content: answer },
|
221 |
+
{ role: 'user', content: 'How does the text correlate to the context of the image?' }
|
222 |
+
]
|
223 |
+
const new_text = tokenizer.apply_chat_template(new_messages, { tokenize: false, add_generation_prompt: true });
|
224 |
+
const new_text_inputs = tokenizer(new_text);
|
225 |
+
|
226 |
+
// Generate another response
|
227 |
+
const output = await model.generate({
|
228 |
+
...new_text_inputs,
|
229 |
+
past_key_values,
|
230 |
+
do_sample: false,
|
231 |
+
max_new_tokens: 256,
|
232 |
+
});
|
233 |
+
const new_answer = tokenizer.decode(
|
234 |
+
output.slice(0, [new_text_inputs.input_ids.dims[1], null]),
|
235 |
+
{ skip_special_tokens: true },
|
236 |
+
);
|
237 |
+
console.log(new_answer);
|
238 |
+
// The text "small but mighty" is likely a playful or humorous reference to the image of the blue mouse with the orange dumbbell. It could be used as a motivational phrase or a playful way to express the idea that even small things can be impressive or powerful.
|
239 |
+
```
|
240 |
+
|
241 |
# Citation
|
242 |
```
|
243 |
@misc{li2024llavaonevisioneasyvisualtask,
|