|
"""Single-image visual question answering with the Imp-v0-3b model.

Loads the model and tokenizer from a local checkpoint, preprocesses one
image, asks a hard-coded question about it, and prints the decoded answer.
"""

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint directory (used for both model and tokenizer).
MODEL_PATH = "../Imp-v0-3b"


def main() -> None:
    """Load the model, run one VQA query, and print the generated answer."""
    # Tensors created without an explicit device (e.g. the tokenizer's
    # input_ids below) land on the GPU automatically.
    torch.set_default_device("cuda")

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,  # half precision is sufficient for inference
        device_map="auto",
        trust_remote_code=True)  # checkpoint ships custom modeling code
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

    # The <image> placeholder marks where the image features are inserted
    # into the prompt by the model's custom generate() implementation.
    text = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat are the colors of the bus in the image? ASSISTANT:"
    image = Image.open("images/bus.jpg")

    input_ids = tokenizer(text, return_tensors='pt').input_ids
    # image_preprocess is provided by the checkpoint's custom code
    # (available because trust_remote_code=True above).
    image_tensor = model.image_preprocess(image)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=100,
        images=image_tensor,
        use_cache=True)[0]
    # Slice off the prompt tokens so only the newly generated answer is decoded.
    print(tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip())


if __name__ == "__main__":
    main()
|
|