"""Caption a single image with a vision-to-sequence chat model.

The script loads the processor and model named by ``model_name_or_path``,
builds a system + user chat prompt around ``./image.jpg``, generates up to
500 new tokens and prints the assistant's reply.
"""

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

model_name_or_path = "Minthy/ToriiGate-v0.2"

# Device the model weights and every input tensor are moved to.
DEVICE = "cuda:0"

processor = AutoProcessor.from_pretrained(model_name_or_path)
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
).to(DEVICE)

image = load_image('./image.jpg')

# NOTE(review): the spelling in the two prompt strings below ("structuted",
# "hask") is left untouched on purpose -- the text is sent to the model
# verbatim, so "fixing" it may change the model's output. Confirm against
# the model card before correcting.
user_prompt = "Describe the picture in structuted json-like format."

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
        ]
    },
    {
        "role": "user",
        "content": [
            # Placeholder entry; the actual pixels are passed to the processor
            # call below through ``images=[image]``.
            {"type": "image"},
            {"type": "text", "text": user_prompt}
        ]
    }
]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# The decoded text still contains the prompt, so keep only what follows the
# first "Assistant: " marker. ``partition`` splits on the first occurrence
# only, so a caption that itself contains "Assistant: " is not truncated, and
# a missing marker falls back to the full decoded text instead of raising
# IndexError.
_, marker, reply = generated_texts[0].partition('Assistant: ')
caption = reply if marker else generated_texts[0]

print(caption)