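"""Caption game demo: a human (Player 1) and BLIP (Player 2) each caption the target image,
CLIP scores both captions against the image to decide the winner, and each round is logged
to a Hugging Face dataset through Gradio's flagging callback."""
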
import os

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor, CLIPModel, CLIPProcessor


device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIP scores image-text similarity; any CLIP checkpoint on the Hugging Face Hub works here.
clip_model_id = "openai/clip-vit-base-patch16"
clipprocessor = CLIPProcessor.from_pretrained(clip_model_id)
clipmodel = CLIPModel.from_pretrained(clip_model_id).to(device)

# BLIP generates the machine ("Player 2") caption.
blip_model_id = "Salesforce/blip-image-captioning-base"
blipprocessor = BlipProcessor.from_pretrained(blip_model_id)
blipmodel = BlipForConditionalGeneration.from_pretrained(blip_model_id).to(device)

im_dir = os.path.join(os.getcwd(), "images")

def evaluate_caption(image: Image.Image, caption: str):
    # Generate the machine ("Player 2") caption with BLIP.
    blip_input = blipprocessor(image, return_tensors="pt").to(device)
    out = blipmodel.generate(**blip_input, max_new_tokens=50)
    blip_caption = blipprocessor.decode(out[0], skip_special_tokens=True)

    # Score both captions against the image with CLIP; logits_per_image has shape (1, 2).
    inputs = clipprocessor(text=[caption, blip_caption], images=image, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        similarity_score = clipmodel(**inputs).logits_per_image

    # Softmax over the two captions gives each one's relative image-text similarity.
    score = similarity_score.softmax(dim=1).cpu().numpy()
    print(score)
    if score[0][0] > score[0][1]:
        winner = "The first (human) caption wins"
    else:
        winner = "The second (machine) caption wins"

    return blip_caption, winner
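# Quick local sanity check, as a sketch: the caption string is hypothetical and the sample
# image is assumed to exist in the images/ directory.
#   img = Image.open(os.path.join(im_dir, "n01677366_12918.JPEG"))
#   machine_caption, winner = evaluate_caption(img, "a lizard on a rock")
#   print(machine_caption, winner)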


# The dataset saver needs a Hugging Face write token; read it from the environment rather than hardcoding it.
callback = gr.HuggingFaceDatasetSaver(os.environ.get("HF_TOKEN"), "gradioTest")
with gr.Blocks() as demo:
    # The target image is fixed for this demo; its filename is logged with every round.
    im_path_str = "n01677366_12918.JPEG"
    im_path = gr.Textbox(label="Image fname", value=im_path_str, interactive=False, visible=False)

    with gr.Column():
        im = gr.Image(
            label="Target Image",
            interactive=False,
            type="pil",
            value=os.path.join(im_dir, im_path_str),
            height=500,
        )
        caps = gr.Textbox(label="Player 1 Caption")
        submit_btn = gr.Button("Submit!!")

    with gr.Column():
        out1 = gr.Textbox(label="Player 2 (Machine) Caption", interactive=False)
        out2 = gr.Textbox(label="Winner", interactive=False)

    # Log each completed round (human caption, machine caption, winner, image filename)
    # to the dataset's "flagged_data_points" folder.
    callback.setup([caps, out1, out2, im_path], "flagged_data_points")
    submit_btn.click(
        fn=evaluate_caption,
        inputs=[im, caps],
        outputs=[out1, out2],
        api_name="test",
    ).success(
        lambda *args: callback.flag(args),
        [caps, out1, out2, im_path],
        None,
        preprocess=False,
    )

demo.launch(share=True)