vision_pipeline / model.py
NohTow's picture
Remove extra forward pass
eb62d2f verified
import spaces
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Module-level setup: pick a device, then load the processor and the model once
# so that the encode_* functions below can reuse them.

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pixel bounds forwarded to the processor: 1x and 2560x a 28 * 28 tile.
min_pixels = 28 * 28
max_pixels = 28 * 28 * 2560

_CHECKPOINT = "MrLight/dse-qwen2-2b-mrl-v1"
_ON_GPU = device == "cuda"

processor = AutoProcessor.from_pretrained(
    _CHECKPOINT, min_pixels=min_pixels, max_pixels=max_pixels
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    # flash_attn is required but is a pain to install on spaces, so it is only
    # requested on CUDA; the CPU path uses the eager implementation.
    attn_implementation="flash_attention_2" if _ON_GPU else "eager",
    # bfloat16 on CUDA, full precision on CPU.
    torch_dtype=torch.bfloat16 if _ON_GPU else torch.float32,
)
model = model.to(device)
model = model.eval()

# Pad on the left: get_embedding reads position -1 of every sequence.
processor.tokenizer.padding_side = "left"
model.padding_side = "left"
def get_embedding(last_hidden_state: torch.Tensor, dimension: int):
    """Turn a hidden-state tensor into unit-length embedding rows.

    Args:
        last_hidden_state: Tensor indexed as (batch, position, feature);
            only the entry at position -1 of each batch item is used.
        dimension: Number of leading features to keep before normalizing.

    Returns:
        A float32 NumPy array with one L2-normalized row per batch item.
    """
    # Keep only the final position of every sequence.
    final_position = last_hidden_state[:, -1]
    # Truncate to the leading `dimension` features, then rescale to unit L2 norm.
    truncated = final_position[:, :dimension]
    unit_rows = torch.nn.functional.normalize(truncated, p=2, dim=-1)
    # Upcast and move to host memory so the result can be exposed as NumPy.
    as_float32 = unit_rows.float()
    return as_float32.cpu().numpy()
@spaces.GPU
def encode_queries(queries: list, *, dimension: int = 1536):
    """Embed text queries with the module-level processor and model.

    Args:
        queries: A list of query strings, or a single string (treated as a
            one-element list).
        dimension: Number of leading embedding features to keep. Defaults to
            1536; a smaller value (e.g. 512) trades quality for efficiency.
            Use the same value when encoding the images to compare against.

    Returns:
        A float32 NumPy array with one L2-normalized row per query. An empty
        input yields an array of shape ``(0, dimension)``.

    Raises:
        ValueError: If ``dimension`` is not a positive integer.
    """
    if dimension < 1:
        raise ValueError(f"dimension must be a positive integer, got {dimension}")

    # A bare string is one query, not an iterable of characters.
    queries = [queries] if isinstance(queries, str) else list(queries)
    if not queries:
        # Nothing to encode: skip the processor and the model entirely.
        return torch.empty((0, dimension), dtype=torch.float32).numpy()

    query_messages = [
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": Image.new("RGB", (28, 28)),
                        "resized_height": 1,
                        "resized_width": 1,
                    },  # need a dummy image here for an easier process.
                    {"type": "text", "text": f"Query: {query}"},
                ],
            }
        ]
        for query in queries
    ]
    # The end-of-text marker is appended to every prompt; with left padding it
    # is the position that get_embedding reads.
    query_texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        + "<|endoftext|>"
        for msg in query_messages
    ]
    query_image_inputs, query_video_inputs = process_vision_info(query_messages)
    query_inputs = processor(
        text=query_texts,
        images=query_image_inputs,
        videos=query_video_inputs,
        padding="longest",
        return_tensors="pt",
    ).to(device)
    query_inputs = model.prepare_inputs_for_generation(**query_inputs, use_cache=False)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        output = model(**query_inputs, return_dict=True, output_hidden_states=True)
    return get_embedding(output.hidden_states[-1], dimension)
@spaces.GPU
def encode_images(images: list, *, dimension: int = 1536):
    """Embed images with the module-level processor and model.

    Args:
        images: A list of PIL images, or a single ``Image.Image`` (treated as
            a one-element list).
        dimension: Number of leading embedding features to keep. Defaults to
            1536; a smaller value (e.g. 512) trades quality for efficiency.
            Use the same value when encoding the queries to compare against.

    Returns:
        A float32 NumPy array with one L2-normalized row per image. An empty
        input yields an array of shape ``(0, dimension)``.

    Raises:
        ValueError: If ``dimension`` is not a positive integer.
    """
    if dimension < 1:
        raise ValueError(f"dimension must be a positive integer, got {dimension}")

    # A single image is wrapped so the rest of the function sees a list.
    images = [images] if isinstance(images, Image.Image) else list(images)
    if not images:
        # Nothing to encode: skip the processor and the model entirely.
        return torch.empty((0, dimension), dtype=torch.float32).numpy()

    doc_messages = [
        [
            {
                "role": "user",
                "content": [
                    # To trade image size for efficiency, add 'resized_height'
                    # and 'resized_width' (e.g. 680) to this dict.
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
        ]
        for image in images
    ]
    # The end-of-text marker is appended to every prompt; with left padding it
    # is the position that get_embedding reads.
    doc_texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        + "<|endoftext|>"
        for msg in doc_messages
    ]
    doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
    doc_inputs = processor(
        text=doc_texts,
        images=doc_image_inputs,
        videos=doc_video_inputs,
        padding="longest",
        return_tensors="pt",
    ).to(device)
    doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, use_cache=False)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
    return get_embedding(output.hidden_states[-1], dimension)