---
license: apache-2.0
datasets:
- google/docci
- gokaygokay/random_instruct_docci
language:
- en
pipeline_tag: image-text-to-text
---
|
|
|
Fine-tuned version of the [moondream2](https://huggingface.co/vikhyatk/moondream2) model, trained on the [gokaygokay/random_instruct_docci](https://huggingface.co/datasets/gokaygokay/random_instruct_docci) dataset. It generates extremely detailed captions of images.
|
|
|
```bash
pip install transformers timm einops bitsandbytes accelerate flash-attn
```
|
|
|
```python
import importlib.util

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "fal-ai/moondream2-docci-instruct"
# Pin the repository commit so the custom model code pulled in by
# trust_remote_code=True cannot change underneath this example.
revision = "3ec40c7b6b5d87bc0c51edee45e21f5f29b449d8"

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is only used on the GPU; the CPU path stays in float32.
DTYPE = torch.float32 if DEVICE == "cpu" else torch.float16

model_kwargs = {
    "trust_remote_code": True,
    "torch_dtype": DTYPE,
    "device_map": {"": DEVICE},
    "revision": revision,
}
# FlashAttention-2 needs a CUDA device, half-precision weights and the
# optional `flash-attn` package; request it only when all of that is true so
# the example still runs on CPU-only machines or without flash-attn installed.
if DEVICE == "cuda" and importlib.util.find_spec("flash_attn") is not None:
    model_kwargs["attn_implementation"] = "flash_attention_2"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    revision=revision,
)
moondream = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
moondream.eval()

image_path = "<your_image_path>"
# convert() returns a new image, so the file handle can be closed right away.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")

md_answer = moondream.answer_question(
    moondream.encode_image(image),
    "what is this picture about",
    tokenizer=tokenizer,
)

print(md_answer)
```