nanoVLM-460M-ft-distill

Native NanoVLM export converted from patrickamadeus/dt-distill-full-460m-4000-student.

This repository holds a checkpoint in the native format of the PrefixVLM codebase's models.nanovlm.VisionLanguageModel. It is not a Transformers AutoModel checkpoint: load it from a checkout of the PrefixVLM codebase, which defines models.nanovlm.VisionLanguageModel.

Setup

git clone https://github.com/patrickamadeus/PrefixVLM.git
cd PrefixVLM
pip install -r requirements.txt

Load

from models.nanovlm import VisionLanguageModel

model = VisionLanguageModel.from_pretrained("patrickamadeus/nanovlm-460m-ft-distill")
model.eval()

CLI Inference From PrefixVLM

python generate_nanovlm.py \
  --checkpoint patrickamadeus/nanovlm-460m-ft-distill \
  --image ./assets/cat.png \
  --prompt "What is in the image?" \
  --greedy \
  --max_new_tokens 64

Self-Contained Python Inference

import math
import torch
from einops import rearrange
from PIL import Image
from torchvision.transforms.functional import InterpolationMode, resize, to_tensor
from transformers import AutoTokenizer

from models.nanovlm import VisionLanguageModel


def load_tokenizer(cfg):
    """Build the tokenizer described by a model config.

    Args:
        cfg: Config object exposing ``lm_tokenizer`` (the identifier handed to
            ``AutoTokenizer.from_pretrained``), ``vlm_extra_tokens``
            (forwarded as ``extra_special_tokens``) and ``lm_chat_template``
            (forwarded as ``chat_template``).

    Returns:
        The loaded tokenizer, with its pad token set to its EOS token.
    """
    source = cfg.lm_tokenizer
    options = {
        "use_fast": True,
        "extra_special_tokens": cfg.vlm_extra_tokens,
        "chat_template": cfg.lm_chat_template,
    }
    tok = AutoTokenizer.from_pretrained(source, **options)
    # Padding reuses the end-of-sequence token.
    tok.pad_token = tok.eos_token
    return tok


def resize_to_patch_grid(image, patch_size, max_side_len, resize_to_max_side_len=False):
    """Resize an image so its sides are snapped to the ``patch_size`` grid.

    The longer side becomes ``max_side_len`` when ``resize_to_max_side_len``
    is true; otherwise it is rounded up to a multiple of ``patch_size`` and
    capped at ``max_side_len``. The shorter side is scaled by the same ratio,
    rounded up to a multiple of ``patch_size`` and never made smaller than
    ``patch_size``.

    Args:
        image: Image whose ``size`` attribute unpacks to ``(width, height)``.
        patch_size: Grid step that the sides are rounded up to.
        max_side_len: Upper bound (or forced value) for the longer side.
        resize_to_max_side_len: Force the longer side to ``max_side_len``.

    Returns:
        The result of ``resize`` with bicubic interpolation at the computed
        ``[height, width]``.
    """
    width, height = image.size
    landscape = width >= height
    longer, shorter = (width, height) if landscape else (height, width)

    if resize_to_max_side_len:
        new_long = max_side_len
    else:
        snapped_long = math.ceil(longer / patch_size) * patch_size
        new_long = min(max_side_len, snapped_long)

    # Scale the short side by the same factor, then snap it up to the grid.
    ratio = new_long / longer
    snapped_short = math.ceil(shorter * ratio / patch_size) * patch_size
    new_short = max(patch_size, snapped_short)

    if landscape:
        out_height, out_width = new_short, new_long
    else:
        out_height, out_width = new_long, new_short
    return resize(image, [out_height, out_width], interpolation=InterpolationMode.BICUBIC)


def split_global_and_tiles(image_tensor, tile_size):
    """Cut an image tensor into square tiles, optionally prefixed by a global view.

    A 3-D input gets a leading batch axis added before splitting. When the
    image covers more than one tile, the whole image resized to
    ``tile_size`` x ``tile_size`` is placed in front of the tiles.

    Args:
        image_tensor: 3-D or 4-D tensor whose last two axes are height and
            width.
        tile_size: Side length of each square tile.

    Returns:
        ``(tiles, (n_h, n_w))`` where ``n_h`` and ``n_w`` are the number of
        tiles along height and width.

    Raises:
        ValueError: If height or width is not divisible by ``tile_size``.
    """
    batched = image_tensor.unsqueeze(0) if image_tensor.ndim == 3 else image_tensor
    _, _, height, width = batched.shape
    if height % tile_size != 0 or width % tile_size != 0:
        raise ValueError(f"image size {(height, width)} is not divisible by {tile_size}")

    grid = (height // tile_size, width // tile_size)
    pieces = rearrange(
        batched,
        "b c (nh ph) (nw pw) -> (b nh nw) c ph pw",
        ph=tile_size,
        pw=tile_size,
    )
    if grid != (1, 1):
        # More than one tile: put a resized copy of the full image first.
        overview = resize(batched, [tile_size, tile_size])
        pieces = torch.cat([overview, pieces], dim=0)
    return pieces, grid


def build_image_string(tokenizer, grid, image_token_length):
    """Assemble the placeholder text that stands in for an image in the prompt.

    If the tokenizer has a ``global_image_token`` attribute, the text starts
    with that token followed by ``image_token_length`` copies of
    ``tokenizer.image_token``; for a 1x1 grid nothing else is added. Otherwise
    (or for larger grids) each tile contributes its ``r{row}c{col}`` token
    (1-based, row-major) followed by the same run of image tokens.

    Args:
        tokenizer: Object exposing ``image_token``, the ``r{row}c{col}``
            attributes for the grid, and optionally ``global_image_token``.
        grid: ``(n_h, n_w)`` tile counts.
        image_token_length: Number of image tokens emitted per image slot.

    Returns:
        The concatenated placeholder string.
    """
    rows, cols = grid
    parts = []

    if hasattr(tokenizer, "global_image_token"):
        parts.append(tokenizer.global_image_token)
        parts.append(tokenizer.image_token * image_token_length)
        if rows == 1 and cols == 1:
            # A single tile is already represented by the global slot.
            return "".join(parts)

    for row_no in range(1, rows + 1):
        for col_no in range(1, cols + 1):
            parts.append(getattr(tokenizer, f"r{row_no}c{col_no}"))
            parts.append(tokenizer.image_token * image_token_length)
    return "".join(parts)


def build_inputs(model, image_path, prompt, device):
    """Turn an image file and a text prompt into inputs for generation.

    Loads the tokenizer from ``model.cfg``, opens the image as RGB, resizes
    and tiles it, builds the image placeholder text, and runs the chat
    template over a single user message made of that text plus ``prompt``.

    Args:
        model: Object whose ``cfg`` provides ``vit_img_size``,
            ``max_img_size``, ``resize_to_max_side_len`` and
            ``mp_image_token_length`` (plus the fields ``load_tokenizer``
            reads).
        image_path: Path passed to ``Image.open``.
        prompt: User text appended after the image placeholder text.
        device: Device on which the token tensor is created.

    Returns:
        ``(tokenizer, input_ids, attention_mask, images)`` where
        ``input_ids`` has a leading axis of size 1, ``attention_mask`` is an
        all-ones tensor shaped like ``input_ids``, and ``images`` is a
        one-element list holding the tile tensor.
    """
    config = model.cfg
    tok = load_tokenizer(config)

    rgb_image = Image.open(image_path).convert("RGB")
    resized = resize_to_patch_grid(
        rgb_image,
        patch_size=config.vit_img_size,
        max_side_len=config.max_img_size,
        resize_to_max_side_len=config.resize_to_max_side_len,
    )
    tiles, grid = split_global_and_tiles(to_tensor(resized), config.vit_img_size)

    if not hasattr(tok, "global_image_token"):
        # No global-image token: drop the leading tile when the stack holds
        # exactly one more tile than the grid accounts for.
        if tiles.size(0) - 1 == grid[0] * grid[1]:
            tiles = tiles[1:]

    placeholder_text = build_image_string(tok, grid, config.mp_image_token_length)
    chat = [{"role": "user", "content": placeholder_text + prompt}]
    token_ids = tok.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)
    input_ids = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    return tok, input_ids, torch.ones_like(input_ids), [tiles]


# Identifier handed to VisionLanguageModel.from_pretrained below.
model_id = "patrickamadeus/nanovlm-460m-ft-distill"
# Use CUDA when torch reports it as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint and move the model to the selected device.
model = VisionLanguageModel.from_pretrained(model_id).to(device)
# NOTE(review): presumably behaves like torch.nn.Module.eval() — confirm in models.nanovlm.
model.eval()

# Tokenize the prompt and preprocess the image; `images` is a one-element
# list holding the tile tensor for this image.
tokenizer, input_ids, attention_mask, images = build_inputs(
    model,
    "./assets/cat.png",
    "What is in the image?",
    torch.device(device),
)

# Generate inside torch.inference_mode() so no autograd state is recorded.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids=input_ids,
        images=images,
        attention_mask=attention_mask,
        max_new_tokens=64,
        greedy=True,
    )

# Decode with special tokens stripped and print the first decoded sequence.
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])

Downloads last month: 39

Safetensors
Model size: 0.5B params
Tensor type: F32

Inference Providers (NEW)
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support