import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Model identifier handed to from_pretrained; may be replaced with a local path.
model_name_or_path = "Minthy/ToriiGate-v0.2"

# Device the model and its inputs are moved to -- change to your device.
DEVICE = "cuda:0"

processor = AutoProcessor.from_pretrained(model_name_or_path)

# Load the weights in bfloat16 first, then move the model onto DEVICE.
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",  # if installed
)
model = model.to(DEVICE)

# --- Instruction sent to the model together with the picture ---
# Trained options (exactly one should be active).
# NOTE(review): the wording, including the "structuted" spelling, is kept
# verbatim; presumably it matches the prompts used in training -- confirm
# before correcting it.
user_prompt = "Describe the picture in structuted json-like format."
# user_prompt = "Give a long and detailed description of the picture."
# user_prompt = "Describe the picture briefly."
# Any other question or instruction can be used instead, e.g.:
# user_prompt = "What color is the ribbon in the character's hair?"
# ...

# Optional: append booru tags as extra reference for the model.
# tags = '1girl, standing, looking at viewer, ...'
# user_prompt += ' Also here are booru tags for better understanding of the picture, you can use them as reference.'
# user_prompt += f' <tags>\n{tags}\n</tags>'

# --- Picture to caption ---
image = load_image("./image.jpg")  # path to your picture

# System turn. The original author flags it as important, so keep it in the
# conversation. The text is reproduced verbatim (including "hask").
system_message = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask.",
        },
    ],
}

# User turn: an image placeholder followed by the textual instruction.
user_message = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": user_prompt},
    ],
}

messages = [system_message, user_message]

# Render the conversation to a prompt string that ends with the generation
# prompt, then let the processor combine it with the image.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
encoded = processor(text=prompt, images=[image], return_tensors="pt")

# Move every returned tensor onto the same device as the model.
inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

# Generate up to 500 new tokens for the prepared prompt.
generated_ids = model.generate(**inputs, max_new_tokens=500)

# The returned sequences start with the prompt tokens. Drop them before
# decoding rather than splitting the decoded text on 'Assistant: ': the split
# raises IndexError when the marker is absent and picks the wrong segment when
# the user prompt, tags or the caption itself contain that marker.
prompt_length = inputs["input_ids"].shape[1]
new_token_ids = generated_ids[:, prompt_length:]
generated_texts = processor.batch_decode(new_token_ids, skip_special_tokens=True)

# Only one prompt was submitted, so the first entry is the caption; strip the
# whitespace that separates it from the generation prompt.
caption = generated_texts[0].strip()

print(caption)