nanoVLM-460M-ft-distill
Native NanoVLM export converted from patrickamadeus/dt-distill-full-460m-4000-student.
This repository stores a native checkpoint for the PrefixVLM codebase's models.nanovlm.VisionLanguageModel class.
It is not a Transformers AutoModel checkpoint, so it cannot be loaded with AutoModel.from_pretrained. Use it from a checkout of the
PrefixVLM codebase, which defines models.nanovlm.VisionLanguageModel.
Setup
# Fetch the PrefixVLM codebase, which defines models.nanovlm.VisionLanguageModel.
git clone https://github.com/patrickamadeus/PrefixVLM.git
cd PrefixVLM
# Install the dependencies listed in the repository's requirements file.
pip install -r requirements.txt
Load
# Requires the PrefixVLM checkout to be importable (e.g. run from inside it).
from models.nanovlm import VisionLanguageModel
# Load the exported checkpoint by its repository id.
model = VisionLanguageModel.from_pretrained("patrickamadeus/nanovlm-460m-ft-distill")
# Put the model in evaluation mode before running inference.
model.eval()
CLI Inference From PrefixVLM
# Run from inside the PrefixVLM checkout; asks one question about one image
# with greedy decoding and at most 64 newly generated tokens.
python generate_nanovlm.py \
--checkpoint patrickamadeus/nanovlm-460m-ft-distill \
--image ./assets/cat.png \
--prompt "What is in the image?" \
--greedy \
--max_new_tokens 64
Self-Contained Python Inference
import math
import torch
from einops import rearrange
from PIL import Image
from torchvision.transforms.functional import InterpolationMode, resize, to_tensor
from transformers import AutoTokenizer
from models.nanovlm import VisionLanguageModel
def load_tokenizer(cfg):
    """Build the tokenizer described by the model configuration.

    Args:
        cfg: Model configuration. This function reads ``lm_tokenizer``
            (tokenizer name or path), ``vlm_extra_tokens`` (passed as
            ``extra_special_tokens``) and ``lm_chat_template`` from it.

    Returns:
        The loaded fast tokenizer, with its pad token set to its EOS token.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        cfg.lm_tokenizer,
        use_fast=True,
        extra_special_tokens=cfg.vlm_extra_tokens,
        chat_template=cfg.lm_chat_template,
    )
    # Padding reuses EOS; this overrides any pad token the tokenizer ships with.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
def resize_to_patch_grid(image, patch_size, max_side_len, resize_to_max_side_len=False):
    """Resize an image so that its sides line up with the patch grid.

    The long side becomes ``max_side_len`` when ``resize_to_max_side_len`` is
    true; otherwise it is rounded up to the next multiple of ``patch_size``
    and capped at ``max_side_len``. The short side is scaled by the same
    factor, rounded up to a multiple of ``patch_size``, and is never smaller
    than one patch.

    NOTE(review): the long side ends up a multiple of ``patch_size`` only if
    ``max_side_len`` is one as well -- confirm against the model config.

    Args:
        image: Image exposing ``.size`` as ``(width, height)``.
        patch_size: Side length of one square patch, in pixels.
        max_side_len: Upper bound (or forced value) for the long side.
        resize_to_max_side_len: Force the long side to ``max_side_len``
            instead of only capping it.

    Returns:
        The image resized to the computed size with bicubic interpolation.
    """
    width, height = image.size
    # Ties (square images) are treated as landscape, as in the size swap below.
    landscape = width >= height
    long_side, short_side = (width, height) if landscape else (height, width)

    if resize_to_max_side_len:
        target_long = max_side_len
    else:
        target_long = min(
            max_side_len,
            math.ceil(long_side / patch_size) * patch_size,
        )

    scale = target_long / long_side
    target_short = max(patch_size, math.ceil(short_side * scale / patch_size) * patch_size)

    # ``resize`` expects [height, width], so map long/short back to that order.
    new_height, new_width = (
        (target_short, target_long) if landscape else (target_long, target_short)
    )
    return resize(image, [new_height, new_width], interpolation=InterpolationMode.BICUBIC)
def split_global_and_tiles(image_tensor, tile_size):
    """Cut an image tensor into square tiles, plus a global view if needed.

    Args:
        image_tensor: Tensor of shape ``(C, H, W)`` or ``(B, C, H, W)``; a
            3-D input gets a leading batch dimension added.
        tile_size: Side length of each square tile. ``H`` and ``W`` must both
            be multiples of it.

    Returns:
        A ``(tensor, (n_h, n_w))`` pair, where ``(n_h, n_w)`` is the number of
        tile rows and columns. For a 1x1 grid the tensor holds just the tiles.
        Otherwise it holds the whole input resized to
        ``tile_size x tile_size`` followed by the tiles in row-major order,
        concatenated along dimension 0.

    Raises:
        ValueError: If the height or width is not divisible by ``tile_size``.
    """
    if image_tensor.ndim == 3:
        image_tensor = image_tensor.unsqueeze(0)

    _, _, height, width = image_tensor.shape
    if height % tile_size or width % tile_size:
        raise ValueError(f"image size {(height, width)} is not divisible by {tile_size}")

    n_h, n_w = height // tile_size, width // tile_size
    tiles = rearrange(
        image_tensor,
        "b c (nh ph) (nw pw) -> (b nh nw) c ph pw",
        ph=tile_size,
        pw=tile_size,
    )

    # A single tile already shows the whole image, so no global view is added.
    if (n_h, n_w) == (1, 1):
        return tiles, (n_h, n_w)

    global_tile = resize(image_tensor, [tile_size, tile_size])
    return torch.cat([global_tile, tiles], dim=0), (n_h, n_w)
def build_image_string(tokenizer, grid, image_token_length):
    """Build the placeholder text that stands in for one image in the prompt.

    If the tokenizer defines ``global_image_token``, the text starts with that
    token followed by ``image_token_length`` image tokens; for a 1x1 grid that
    is the whole string. Otherwise (or for larger grids) every tile adds its
    position token ``r{row}c{col}`` (1-based, row-major) followed by
    ``image_token_length`` image tokens.

    Args:
        tokenizer: Object exposing ``image_token``, the ``r{row}c{col}``
            position tokens, and optionally ``global_image_token``.
        grid: ``(n_h, n_w)`` tile rows and columns.
        image_token_length: Number of image tokens emitted per image view.

    Returns:
        The concatenated placeholder string.
    """
    n_h, n_w = grid
    image_block = tokenizer.image_token * image_token_length
    pieces = []

    if hasattr(tokenizer, "global_image_token"):
        pieces.append(tokenizer.global_image_token)
        pieces.append(image_block)
        # A single tile is already covered by the global view.
        if (n_h, n_w) == (1, 1):
            return "".join(pieces)

    for row in range(n_h):
        for col in range(n_w):
            pieces.append(getattr(tokenizer, f"r{row + 1}c{col + 1}"))
            pieces.append(image_block)
    return "".join(pieces)
def build_inputs(model, image_path, prompt, device):
    """Turn one image and one text prompt into inputs for ``model.generate``.

    Args:
        model: Loaded model; its ``cfg`` supplies the tokenizer settings,
            ``vit_img_size``, ``max_img_size``, ``resize_to_max_side_len``
            and ``mp_image_token_length``.
        image_path: Path of the image file to open.
        prompt: User question appended after the image placeholder text.
        device: Device on which ``input_ids`` is created.

    Returns:
        ``(tokenizer, input_ids, attention_mask, images)`` where ``input_ids``
        has a leading batch dimension of 1, ``attention_mask`` is all ones
        with the same shape, and ``images`` is a one-element list holding the
        stacked image tiles.
    """
    cfg = model.cfg
    tokenizer = load_tokenizer(cfg)

    image = Image.open(image_path).convert("RGB")
    image = resize_to_patch_grid(
        image,
        patch_size=cfg.vit_img_size,
        max_side_len=cfg.max_img_size,
        resize_to_max_side_len=cfg.resize_to_max_side_len,
    )
    image_tensor, grid = split_global_and_tiles(to_tensor(image), cfg.vit_img_size)

    # Without a global image token there is no placeholder for the global
    # view, so drop it when it was prepended (one more entry than grid tiles).
    if not hasattr(tokenizer, "global_image_token") and grid[0] * grid[1] == image_tensor.size(0) - 1:
        image_tensor = image_tensor[1:]

    image_text = build_image_string(tokenizer, grid, cfg.mp_image_token_length)
    messages = [{"role": "user", "content": image_text + prompt}]
    prompt_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)

    input_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)
    return tokenizer, input_ids, attention_mask, [image_tensor]
# Load the checkpoint onto the GPU when one is available, otherwise the CPU.
model_id = "patrickamadeus/nanovlm-460m-ft-distill"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionLanguageModel.from_pretrained(model_id).to(device)
model.eval()

# Tokenize the prompt and tile the example image.
tokenizer, input_ids, attention_mask, images = build_inputs(
    model,
    "./assets/cat.png",
    "What is in the image?",
    torch.device(device),
)

# Greedy decoding of at most 64 new tokens, with autograd tracking disabled.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids=input_ids,
        images=images,
        attention_mask=attention_mask,
        max_new_tokens=64,
        greedy=True,
    )

print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
- Downloads last month
- 39
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support