File size: 4,054 Bytes
834334c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7243c0f
834334c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbb97d0
834334c
 
 
 
 
 
 
 
 
bbb97d0
 
 
 
 
 
834334c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d791456
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os

# Install the local transformers dev wheel (path is relative to the working
# directory). This has to run before `import transformers` further down so the
# import resolves to the freshly installed build.
# `sys.executable -m pip` installs into the interpreter that is running this
# script, and passing an argument list avoids spawning a shell.
import subprocess
import sys

_pip_install = subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "./transformers-4.47.0.dev0-py3-none-any.whl",
    ],
    check=False,
)
if _pip_install.returncode != 0:
    # Stay best-effort (start-up is not aborted), but surface the failure
    # instead of silently dropping pip's exit status.
    print(
        "pip install of the bundled transformers wheel failed "
        f"with exit code {_pip_install.returncode}",
        file=sys.stderr,
    )

import gradio as gr
import PIL.Image
import transformers
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import string
import functools
import re
import numpy as np
import spaces


# Hub repositories used by this demo: the model weights are loaded from
# `adapter_id`, the processor from `model_id`.
# NOTE(review): the name `adapter_id` suggests a LoRA adapter repo, yet it is
# passed straight to `from_pretrained` — confirm the repo layout supports that.
adapter_id = "merve/paligemma2-3b-vqav2"
model_id = "google/paligemma2-3b-pt-448"

# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the weights, switch to eval mode, then move the model to `device`.
model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id)
model = model.eval()
model = model.to(device)

processor = PaliGemmaProcessor.from_pretrained(model_id)

###### Transformers Inference
@spaces.GPU
def infer(
    text: str,
    image: PIL.Image.Image,
    max_new_tokens: int
) -> str:
    """Answer a question about an image with the module-level model.

    Args:
        text: The user's question. It is prefixed with ``"answer en "`` before
            being handed to the processor; ``None`` is treated as an empty
            question.
        image: The image the question refers to.
        max_new_tokens: Forwarded to ``model.generate`` as the limit on newly
            generated tokens.

    Returns:
        The decoded continuation produced by the model (prompt excluded), with
        leading newlines removed.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Fail with a readable message in the UI instead of an opaque
        # processor error.
        raise gr.Error("Please provide an image before submitting.")

    prompt = "answer en " + (text or "")
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # Drop the prompt at the token level rather than slicing the decoded string
    # by len(prompt): the character-count approach silently mis-trims whenever
    # the decoded prompt is not character-identical to the input text.
    new_token_ids = generated_ids[:, prompt_len:]
    result = processor.batch_decode(new_token_ids, skip_special_tokens=True)
    return result[0].lstrip("\n")


######## Demo

# Markdown shown at the top of the demo page. Both "Blogpost" links point at
# the PaliGemma 2 blog post so the header and the body agree.
INTRO_TEXT = """## PaliGemma 2 demo\n\n
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) 
| [Blogpost](https://huggingface.co/blog/paligemma2) 
| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb) 
|\n\n
PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and 
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343) 
vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile 
model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question 
answering, text reading, object detection and object segmentation.
\n\n
This space includes a model LoRA fine-tuned by the team at Hugging Face on VQAv2, inferred using transformers.
See the [Blogpost](https://huggingface.co/blog/paligemma2), the project  
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
"""


# Build the demo page: intro text, one column of inputs plus the answer box,
# the submit wiring, and a set of clickable examples.
# NOTE(review): `css` receives a file name here; on Gradio versions that take
# CSS files through a separate `css_paths` argument this string would be
# treated as raw CSS — confirm that style.css is actually applied.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)

    # Components are declared in the order they should appear on the page.
    with gr.Column():
        image = gr.Image(type="pil", label="Input Image", height=400)
        question = gr.Text(label="Question")
        tokens = gr.Slider(
            minimum=20,
            maximum=160,
            step=10,
            value=80,
            label="Max New Tokens",
            info="Set to larger for longer generation.",
        )
        caption_btn = gr.Button(value="Submit")
        text_output = gr.Text(label="Text Output")

    # Input order mirrors infer's positional parameters:
    # (text, image, max_new_tokens).
    caption_inputs = [question, image, tokens]
    caption_outputs = [text_output]
    caption_btn.click(fn=infer, inputs=caption_inputs, outputs=caption_outputs)

    # Each example row follows the same (question, image path, token budget)
    # layout as `caption_inputs`.
    examples = [
        ["What is the graphic about?", "./howto.jpg", 60],
        ["What is the password", "./password.jpg", 20],
        ["Who is in this image?", "./examples_bowie.jpg", 80],
    ]
    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
    gr.Examples(examples=examples, inputs=caption_inputs)
#########

if __name__ == "__main__":
    # Enable the request queue (max_size=10) first, then launch the app on the
    # object that queue() returns.
    queued_demo = demo.queue(max_size=10)
    queued_demo.launch(debug=True, ssr_mode=False)