File size: 4,800 Bytes
f49524d
 
 
 
 
 
 
 
 
 
 
 
 
 
10904a2
 
 
f49524d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10904a2
f49524d
10904a2
f49524d
 
 
 
 
 
556b8df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f49524d
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import datetime
import json
import base64
from PIL import Image
import gradio as gr
import hashlib
import requests
from utils import build_logger
import io

LOGDIR = "log"
logger = build_logger("otter", LOGDIR)

# no_change_btn = gr.Button.update()
# enable_btn = gr.Button.update(interactive=True)
# disable_btn = gr.Button.update(interactive=False)


def decode_image(encoded_image: str) -> Image:
    decoded_bytes = base64.b64decode(encoded_image.encode("utf-8"))
    buffer = io.BytesIO(decoded_bytes)
    image = Image.open(buffer)
    return image


def encode_image(image: Image.Image, format: str = "PNG") -> str:
    with io.BytesIO() as buffer:
        image.save(buffer, format=format)
        encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return encoded_image


def get_conv_log_filename():
    t = datetime.datetime.now()
    name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
    return name


def get_conv_image_dir():
    name = os.path.join(LOGDIR, "images")
    os.makedirs(name, exist_ok=True)
    return name


def get_image_name(image, image_dir=None):
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    image_bytes = buffer.getvalue()
    md5 = hashlib.md5(image_bytes).hexdigest()

    if image_dir is not None:
        image_name = os.path.join(image_dir, md5 + ".png")
    else:
        image_name = md5 + ".png"

    return image_name


def resize_image(image, max_size):
    width, height = image.size
    aspect_ratio = float(width) / float(height)

    if width > height:
        new_width = max_size
        new_height = int(new_width / aspect_ratio)
    else:
        new_height = max_size
        new_width = int(new_height * aspect_ratio)

    resized_image = image.resize((new_width, new_height))
    return resized_image

def http_bot(image_input, text_input, request: gr.Request):
    logger.info(f"http_bot. ip: {request.client.host}")
    print(f"Prompt request: {text_input}")

    base64_image_str = encode_image(image_input)    

    payload = {
        "content": [
            {
                "prompt": text_input,
                "image": base64_image_str,
            }
        ],
        "token": "sk-OtterHD",
    }

    print(
        "request: ",
        {
            "prompt": text_input,
            "image": base64_image_str[:10],
        },
    )

    url = "http://10.128.0.40:8890/app/otter"
    headers = {"Content-Type": "application/json"}

    response = requests.post(url, headers=headers, data=json.dumps(payload))
    results = response.json()
    print("response: ", {"result": results["result"]})
    return results["result"]

title = """
# OTTER-HD: A High-Resolution Multi-modality Model
[[Otter Codebase]](https://github.com/Luodian/Otter) [[Paper]]() [[Checkpoints & Benchmarks]](https://huggingface.co/Otter-AI) 
         
"""

css = """
  #mkd {
    height: 1000px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

if __name__ == "__main__":
    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        dialog_state = gr.State()
        input_state = gr.State()
        with gr.Tab("Ask a Question"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    image_input = gr.Image(label="Upload a High-Res Image", type="pil")
                with gr.Column(scale=1):
                    vqa_output = gr.Textbox(label="Output")
            text_input = gr.Textbox(label="Ask a Question")

            vqa_btn = gr.Button("Send It")

            gr.Examples(
                [
                    [
                        "./assets/IMG_00095.png",
                        "How many camels are inside this image?",
                    ],
                    [
                        "./assets/IMG_00095.png",
                        "How many people are inside this image?",
                    ],
                    [
                        "./assets/IMG_00012.png",
                        "How many apples are there?",
                    ],
                    # ["./assets/./IMG_00012.png", "How many apples are there? Count them row by row."],
                    [
                        "./assets/IMG_00080.png",
                        "What is this and where is it from?",
                    ],
                    [
                        "./assets/IMG_00094.png",
                        "What's important on this website?",
                    ],
                ],
                inputs=[image_input, text_input],
                outputs=[vqa_output],
                fn=http_bot,
                label="Click on any Examples below👇",
            )
        vqa_btn.click(fn=http_bot, inputs=[image_input, text_input], outputs=vqa_output)

    demo.launch()