File size: 2,416 Bytes
9b070b0
 
 
fed60af
 
9b070b0
fed60af
 
 
 
 
 
 
 
 
a46f695
 
fed60af
 
 
 
 
 
 
 
 
 
 
 
9b070b0
 
 
fed60af
9b070b0
fed60af
 
 
a46f695
fed60af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b070b0
 
 
 
 
119fb19
9b070b0
 
 
 
db27411
b191987
119fb19
67921c3
119fb19
 
9b070b0
db27411
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys 

import gradio as gr
import jax
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import AutoTokenizer
import torch
from torchvision.io import ImageReadMode, read_image


# Fetch the "flax-community/medclip" snapshot via huggingface_hub and add the
# returned local path to sys.path so modules inside it become importable.
LOCAL_PATH = snapshot_download("flax-community/medclip")
sys.path.append(LOCAL_PATH)

# NOTE: these imports are presumably resolved from the snapshot directory added
# to sys.path above, which is why they sit below the regular import block.
# Keep them after the sys.path.append call.
from src.modeling_medclip import FlaxMedCLIP
from run_medclip import Transform

def prepare_image(image_path, model):
    """Load an image file and turn it into a single-item batch for the model.

    Args:
        image_path: Path of the image file, read as RGB.
        model: Model whose ``config.vision_config.image_size`` determines the
            size handed to ``Transform``.

    Returns:
        NumPy array holding one preprocessed image, with axis 1 of the stacked
        tensor moved to the last position (channel-first to channel-last,
        assuming ``Transform`` yields channel-first tensors).
    """
    rgb_tensor = read_image(image_path, mode=ImageReadMode.RGB)

    # NOTE(review): the transform is built and scripted on every call.
    target_size = model.config.vision_config.image_size
    scripted_transform = torch.jit.script(Transform(target_size))

    # Stack the single image to add the leading batch axis, then reorder axes.
    batch = torch.stack([scripted_transform(rgb_tensor)])
    channels_last = batch.permute(0, 2, 3, 1)
    return channels_last.numpy()

def prepare_text(text, tokenizer):
    """Tokenize ``text`` with ``tokenizer``, requesting NumPy output.

    Args:
        text: Caption text passed to the tokenizer unchanged.
        tokenizer: Callable accepting the text and a ``return_tensors`` keyword.

    Returns:
        Whatever ``tokenizer`` returns when called with ``return_tensors="np"``.
    """
    encoding = tokenizer(text, return_tensors="np")
    return encoding

def save_file_to_disk(uplaoded_file):
    """Write an uploaded image array to its own temporary JPEG file.

    Every call creates a uniquely named file rather than reusing one fixed
    path, so overlapping requests cannot overwrite each other's image between
    the moment it is saved and the moment it is read back.

    Args:
        uplaoded_file: Image data accepted by ``PIL.Image.fromarray``
            (presumably the NumPy array supplied by the Gradio image input;
            confirm against the installed Gradio version).

    Returns:
        Path of the JPEG file that was written. The file is not deleted here:
        the caller owns it and should remove it once it is no longer needed.

    Raises:
        Any exception raised by ``Image.fromarray`` or ``Image.save``. If
        saving fails, the partially written file is removed before re-raising.
    """
    import os
    import tempfile

    # Convert first so invalid input fails before any file is created.
    im = Image.fromarray(uplaoded_file)

    fd, temp_file = tempfile.mkstemp(prefix="image-", suffix=".jpeg")
    try:
        # Writing through the descriptor mkstemp opened avoids reopening the
        # path; the format is given explicitly because a file object has no
        # extension for PIL to infer it from.
        with os.fdopen(fd, "wb") as handle:
            im.save(handle, format="JPEG")
    except Exception:
        os.remove(temp_file)
        raise
    return temp_file

def load_tokenizer_and_model():
    """Load the text tokenizer and the MedCLIP model.

    Returns:
        A ``(tokenizer, model)`` tuple: the tokenizer loaded from
        ``"allenai/scibert_scivocab_uncased"`` and the ``FlaxMedCLIP`` model
        loaded from ``LOCAL_PATH``.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased"),
        FlaxMedCLIP.from_pretrained(LOCAL_PATH),
    )

def run_inference(image_path, text, model, tokenizer):
    """Score how well ``text`` matches the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to score.
        text: Caption to compare against the image.
        model: Model called with the token ids, pixel values and attention
            mask; its output must expose ``"logits_per_image"``.
        tokenizer: Tokenizer forwarded to ``prepare_text``.

    Returns:
        Element ``[0][0]`` of the sigmoid of the model's per-image logits.
    """
    image_batch = prepare_image(image_path, model)
    encoded_text = prepare_text(text, tokenizer)

    outputs = model(
        encoded_text["input_ids"],
        image_batch,
        attention_mask=encoded_text["attention_mask"],
        train=False,
        return_dict=True,
    )

    # Squash the raw logits into (0, 1) and keep the first image / first text entry.
    probabilities = jax.nn.sigmoid(outputs["logits_per_image"])
    return probabilities[0][0]

# Load the tokenizer and model once at import time; score_image_caption_pair
# reads these module-level names on every call instead of reloading them.
tokenizer, model = load_tokenizer_and_model()

def score_image_caption_pair(uploaded_file, text_input):
    """Gradio callback: score an uploaded image against a caption.

    The upload is written to disk with ``save_file_to_disk``, scored with
    ``run_inference`` using the module-level ``model`` and ``tokenizer``, and
    the temporary image file is deleted afterwards so uploads do not linger
    on disk once the request has been served.

    Args:
        uploaded_file: Image data as delivered by the Gradio image input.
        text_input: Caption text entered by the user.

    Returns:
        ``{"Score": score}`` where ``score`` is the inference result converted
        with ``.tolist()``.
    """
    import contextlib
    import os

    local_image_path = save_file_to_disk(uploaded_file)
    try:
        score = run_inference(
            local_image_path, text_input, model, tokenizer).tolist()
    finally:
        # The file is only needed while inference runs; remove it even when
        # inference raises. A file that is already gone is not an error.
        with contextlib.suppress(FileNotFoundError):
            os.remove(local_image_path)
    return {"Score": score}


# Image input widget configured with a 299x299 shape.
image = gr.inputs.Image(shape=(299, 299))

# Wire the scoring callback to an image + text form with a label output,
# with flagging and screenshots turned off.
iface = gr.Interface(
    fn=score_image_caption_pair,
    inputs=[image, "text"],
    outputs=["label"],
    allow_flagging=False,
    allow_screenshot=False,
    title="Your personal TA",
    description="""
    The purpose of this demo is to help medical students measure their diagnostic capabilities in purely academic settings.
    Under no circumstances should it be used to make a self-diagnosis or confront a real doctor.
    """,
)

# Start serving the demo.
iface.launch()