# CLIPScore / app.py
# taesiri's picture
# Update app.py
# 7fc3bdc verified
import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import spaces
# Registry of selectable checkpoints, keyed by the label shown in the UI.
# Each value is a (hub_repo_id, image_size, model_family) tuple; the family
# string ("clip" or "siglip") selects the loading and scoring path.
MODELS = {
    label: (repo_id, image_size, family)
    for label, repo_id, image_size, family in (
        ("CLIP ViT-B/32", "openai/clip-vit-base-patch32", 224, "clip"),
        ("CLIP ViT-B/16", "openai/clip-vit-base-patch16", 224, "clip"),
        ("CLIP ViT-L/14", "openai/clip-vit-large-patch14", 224, "clip"),
        ("CLIP ViT-L/14@336px", "openai/clip-vit-large-patch14-336", 336, "clip"),
        ("SigLIP Large/16-256", "google/siglip-large-patch16-256", 256, "siglip"),
        ("SigLIP Base/16-384", "google/siglip-base-patch16-384", 384, "siglip"),
        ("SigLIP Large/16-384", "google/siglip-large-patch16-384", 384, "siglip"),
    )
}
# Load every checkpoint listed in MODELS once, at module import, and keep the
# model (moved to CUDA) and its processor in dicts keyed by the UI label.
models = {}
processors = {}
for model_name, (model_path, _, model_type) in MODELS.items():
    if model_type == "clip":
        model_cls, processor_cls = CLIPModel, CLIPProcessor
    elif model_type == "siglip":
        model_cls, processor_cls = AutoModel, AutoProcessor
    else:
        # Unrecognized families are skipped, so they get no entry in either dict.
        continue
    models[model_name] = model_cls.from_pretrained(model_path).to("cuda")
    processors[model_name] = processor_cls.from_pretrained(model_path)
@spaces.GPU
def calculate_score(image, text, model_name):
    """Score how well each semicolon-separated description matches ``image``.

    Args:
        image: Image handed to the processor as ``images=[image]`` (the UI in
            this file supplies a PIL image).
        text: Descriptions separated by ``;``. Surrounding whitespace is
            stripped and empty entries are dropped.
        model_name: Key into ``MODELS`` / ``models`` / ``processors``.

    Returns:
        A dict mapping each description to a float score. For "clip" models
        this is the image/text cosine similarity clamped to [0, 1]; for
        "siglip" models it is the sigmoid of the model's pairwise logits.
        An empty dict is returned when ``text`` holds no non-empty description.

    Raises:
        KeyError: If ``model_name`` is not a registered model.
        ValueError: If the registered model family is neither "clip" nor
            "siglip".
    """
    labels = [part.strip() for part in text.split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    model = models[model_name]
    processor = processors[model_name]
    model_type = MODELS[model_name][2]

    # truncation=True keeps over-long descriptions within the text encoder's
    # maximum sequence length instead of failing inside the forward pass.
    inputs = processor(
        text=labels,
        images=[image],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    )
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    if model_type == "clip":
        # Cosine similarity between L2-normalised embeddings; negative values
        # are clamped to 0 so every score lies in [0, 1].
        image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
        text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)
        similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
        similarities = torch.clamp(similarities, min=0, max=1)
    elif model_type == "siglip":
        # logits_per_text has shape (num_texts, num_images) and, per the
        # transformers SigLIP output, already includes the learned logit
        # scale and bias. Sigmoid of the bare cosine similarity would pin
        # every score to roughly [0.27, 0.73].
        similarities = torch.sigmoid(outputs.logits_per_text).squeeze(1)
    else:
        raise ValueError(f"Unsupported model type: {model_type!r}")

    scores = similarities.cpu().tolist()
    return {label: float(score) for label, score in zip(labels, scores)}
# Gradio UI: an image input and a label output in one row, the description
# textbox and model selector in another, with scoring re-run whenever the
# image or model changes or the text is submitted.
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
    gr.Markdown(
        "Calculate the score (cosine similarity) between the given image and text descriptions using different CLIP and SigLIP model variants"
    )

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS), label="Model", value="CLIP ViT-B/16"
        )

    def process_inputs(image, text, model_name):
        """Run ``calculate_score`` once both an image and some text exist.

        Returns ``None`` when the image is missing or the text is ``None``,
        empty, or whitespace-only; otherwise returns the result of
        ``calculate_score(image, text, model_name)``.
        """
        # `not text` also covers a None textbox value, which would otherwise
        # raise AttributeError on .strip().
        if image is None or not text or not text.strip():
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

    # Same handler for all three triggers, registered in the original order.
    for register in (image_input.change, text_input.submit, model_dropdown.change):
        register(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "CLIP ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()