import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import spaces

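# Each entry maps a display name to (Hugging Face model id, input resolution, model family).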
MODELS = {
    "CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
    "CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
    "CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
    "CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
    "SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
    "SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
    "SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
}

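# Load every model and processor once at startup so switching models in the UI is instant.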
models = {}
processors = {}

for model_name, (model_path, _, model_type) in MODELS.items():
    if model_type == "clip":
        models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = CLIPProcessor.from_pretrained(model_path)
    elif model_type == "siglip":
        models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = AutoProcessor.from_pretrained(model_path)

@spaces.GPU
def calculate_score(image, text, model_name):
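    # Split the semicolon-separated description string into individual, non-empty labels.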
    labels = [label.strip() for label in text.split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    model = models[model_name]
    processor = processors[model_name]
    model_type = MODELS[model_name][2]

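    # SigLIP was trained with fixed-length padding; "max_length" also matches CLIP's 77-token context.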
    inputs = processor(text=labels, images=[image], return_tensors="pt", padding="max_length")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

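    # A single forward pass yields pooled image and text embeddings for either model family.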
    with torch.no_grad():
        outputs = model(**inputs)
        # CLIP and SigLIP outputs expose the embeddings under the same attribute names,
        # so no branching is needed here.
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

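    # L2-normalize both sets of embeddings so the dot products below are cosine similarities.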
    image_embeds = F.normalize(image_embeds, p=2, dim=1)
    text_embeds = F.normalize(text_embeds, p=2, dim=1)

    if model_type == "clip":
        # CLIP score: cosine similarity of each label with the image, clamped to [0, 1].
        similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
        similarities = torch.clamp(similarities, min=0, max=1)
    elif model_type == "siglip":
        # SigLIP score: scale and shift the cosine similarity with the model's learned
        # logit_scale and logit_bias, then apply a sigmoid to get an independent
        # probability per label, mirroring SigLIP's pairwise sigmoid objective.
        logits = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
        logits = logits * model.logit_scale.exp() + model.logit_bias
        similarities = torch.sigmoid(logits)

    similarities = similarities.cpu().numpy()

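    # Map each label to its score; gr.Label renders the dict as a ranked list of confidences.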
    return {label: float(score) for label, score in zip(labels, similarities)}

with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
    gr.Markdown(
        "Calculate the score between the given image and each text description "
        "(cosine similarity for CLIP, sigmoid probability for SigLIP) "
        "using different CLIP and SigLIP model variants."
    )

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
        )

    def process_inputs(image, text, model_name):
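        # Skip scoring until both an image and at least one description are provided.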
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

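    # Recompute the scores whenever the image, the descriptions, or the selected model changes.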
    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "CLIP ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()