File size: 6,523 Bytes
353fa54
c3a1897
 
 
 
 
 
eb902b3
 
 
 
 
 
 
b25eb4e
 
 
 
 
 
 
eb902b3
 
 
 
d756d59
 
44a0c32
eb902b3
9b4b3ea
eb902b3
 
9b4b3ea
eb902b3
 
 
 
 
353fa54
c3a1897
 
 
 
 
353fa54
c3a1897
 
 
 
 
09db30b
 
 
 
b25eb4e
44a0c32
 
 
 
 
c3a1897
 
44a0c32
c3a1897
 
44a0c32
 
c3a1897
 
44a0c32
 
 
 
 
 
 
7712f1b
 
44a0c32
 
 
c3a1897
 
 
44a0c32
 
 
 
 
 
 
 
 
 
c3a1897
 
eb902b3
c3a1897
 
 
b25eb4e
44a0c32
c3a1897
647ce12
 
 
 
 
 
c3a1897
 
647ce12
 
c3a1897
80f89b9
8381241
80f89b9
 
c3a1897
 
40adb4f
eb902b3
 
 
b25eb4e
eb902b3
 
c3a1897
 
8381241
647ce12
5d6f4ba
8381241
 
455b8f2
 
7712f1b
c3a1897
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
from models.image_text_transformation import ImageTextTransformation
import argparse
import torch

# Command-line configuration for the demo. Every option has a default, so the
# script also starts when it is launched with no arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')

# Feature switches. They default to True, so the store_true flags on their own
# can only re-assert the default. Each one is paired with a --no_* flag that
# stores False into the same dest, which makes it possible to turn a feature
# off from the command line.
parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
parser.add_argument('--no_image_caption', action='store_false', dest='image_caption', help='Disable image captioning (enabled by default)')
parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
parser.add_argument('--no_dense_caption', action='store_false', dest='dense_caption', help='Disable dense captioning (enabled by default)')
parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
parser.add_argument('--no_semantic_segment', action='store_false', dest='semantic_segment', help='Disable semantic segmentation (enabled by default)')

# Model selection.
parser.add_argument('--sam_arch', choices=['vit_b', 'vit_l', 'vit_h'], dest='sam_arch', default='vit_b', help='vit_b is the default model (fast but not accurate), vit_l and vit_h are larger models')
parser.add_argument('--captioner_base_model', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip', help='blip2 requires 15G GPU memory, blip requires 6G GPU memory')
parser.add_argument('--region_classify_model', choices=['ssa', 'edit_anything'], dest='region_classify_model', default='edit_anything', help='Select the region classification model: edit anything is ten times faster than ssa, but less accurate.')

# Per-model device placement. The option is spelled --contolnet_device (sic);
# the name is kept because args.contolnet_device is assigned under that
# spelling later in this file.
parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended. Make sue this model and image_caption model on same device.')
parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, <6G GPU is not recommended>')

# Parsed once at import time from the process's command line.
args = parser.parse_args()

# Use the GPU whenever PyTorch reports one; otherwise run everything on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# These assignments unconditionally replace the --*_device values parsed from
# the command line. Image captioning is pinned to the CPU in both cases; the
# other three models follow the detected device.
args.image_caption_device = "cpu"
args.dense_caption_device = device
args.semantic_segment_device = device
args.contolnet_device = device  # (sic) matches the --contolnet_device option

def pil_image_to_base64(image):
    """Encode an image as base64 text of its JPEG serialization.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG bytes of ``image`` encoded as a base64 ``str``, without any
        ``data:`` URI prefix.
    """
    # Modes Pillow's JPEG writer can encode directly. Anything else (RGBA, LA,
    # P, ...) makes ``save(..., format="JPEG")`` raise OSError, so those are
    # converted to RGB first; any alpha channel is dropped by the conversion.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()

def add_logo():
    """Return the contents of ``examples/logo.png`` as base64 text.

    The path is relative, so it is resolved against the current working
    directory of the process.

    Returns:
        The raw file bytes, base64-encoded and decoded to ``str``.
    """
    with open("examples/logo.png", "rb") as logo_file:
        raw_bytes = logo_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

def process_image(image_src, options=None, processor=None):
    """Run the image->text pipeline (optionally text->image) and render HTML.

    Args:
        image_src: The input image, passed unchanged to
            ``processor.image_to_text``.
        options: Collection of selected option labels. ``"Semantic Segment"``
            sets ``processor.args.semantic_segment`` for this call (it is
            reset to False when the label is absent) and
            ``"Image Generation"`` adds the text->image step. ``None`` is
            treated as no options selected.
        processor: Object exposing ``args``, ``image_to_text`` and
            ``text_to_image``. Required, even though the signature gives it a
            ``None`` default.

    Returns:
        An HTML fragment with the image caption, dense caption, region
        semantics and generated paragraph, followed by the generated image
        as an inline base64 ``<img>`` when image generation was requested.

    Raises:
        ValueError: If ``processor`` is ``None``.
    """
    # Imported locally so this function does not depend on the file header.
    from html import escape

    if processor is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("process_image() requires a processor")

    print(options)
    if options is None:
        options = []
    processor.args.semantic_segment = "Semantic Segment" in options
    image_generation_status = "Image Generation" in options

    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)

    # The texts come from models and may contain '<', '>' or '&'. Escape them
    # so they are displayed literally and cannot break or inject markup.
    caption_html = escape(str(image_caption))
    dense_html = escape(str(dense_caption))
    region_html = escape(str(region_semantic))
    paragraph_html = escape(str(gen_text))

    # Combine the outputs into a single HTML output
    custom_output = f'''
    <h2>Image->Text:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Image Caption</h3>
            <p>{caption_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Dense Caption</h3>
            <p>{dense_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Region Semantic</h3>
            <p>{region_html}</p>
        </div>
    </div>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>GPT4 Reasoning:</h3>
            <p>{paragraph_html}</p>
        </div>
    </div>
    '''
    if image_generation_status:
        # The generator receives the raw (unescaped) paragraph; the result is
        # base64 text, which needs no HTML escaping.
        gen_image = processor.text_to_image(gen_text)
        gen_image_str = pil_image_to_base64(gen_image)
        custom_output += f'''
        <h2>Text->Image:</h2>
        <div style="display: flex; flex-wrap: wrap;">
            <div style="flex: 1;">
                <h3>Generated Image</h3>
                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
            </div>
        </div>
        '''
    return custom_output

# Build the pipeline object once, at import time, from the parsed arguments
# (after the device overrides applied above).
processor = ImageTextTransformation(args)

# Gradio input widgets. In the code visible here only image_input is handed to
# gr.Interface; the two checkboxes are created but not referenced again.
image_input = gr.inputs.Image(
    label="Input Image",
    type='filepath',
)
semantic_segment_checkbox = gr.inputs.Checkbox(
    default=False,
    label="Semantic Segment",
)
image_generation_checkbox = gr.inputs.Checkbox(
    default=False,
    label="Image Generation",
)


# Markdown badges (visitor counter and "Duplicate this Space") that are
# prepended to the interface description.
extra_title = (
    r'![vistors](https://visitor-badge.glitch.me/badge?page_id=fingerrec.Image2Paragraph)' + '\n'
    + r'[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-md-dark.svg)](https://huggingface.co/spaces/Awiny/Image2Paragraph?duplicate=true)' + '\n\n'
)

# Title: the logo embedded as an inline data URI, followed by the app name.
# add_logo() reads examples/logo.png, so the URI declares image/png to match
# the embedded bytes (it previously claimed image/jpeg).
logo_base64 = add_logo()
title_with_logo = \
    f'<img src="data:image/png;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

# Example inputs offered in the UI: one row per example, one value per input
# component that the example fills in.
examples = [
    ["examples/test_4.jpg"],
]

# Option labels offered to the user; process_image() tests for exactly these
# two strings.
options_input = gr.CheckboxGroup(
    choices=["Image Generation", "Semantic Segment"],
    label="Options",
)
html_output = gr.outputs.HTML()

# Description shown with the interface: the badge markdown followed by the
# project blurb.
description_text = extra_title + """
    Image.txt. This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
    \n Github: https://github.com/showlab/Image2Paragraph
    \n Twitter: https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ
    \n For online demo, we use smallest model to speed up. For better result, look for github for details.
    \n Ttext2image model is controlnet, which used canny edge as reference.
    \n To speed up, we generate image with small size 384, run the code local for high-quality sample.
    """

# Assemble the Gradio app. The lambda forwards the two UI inputs to
# process_image() together with the module-level processor.
interface = gr.Interface(
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[image_input, options_input],
    outputs=html_output,
    title=title_with_logo,
    examples=examples,
    description=description_text,
)

# Start serving the app.
interface.launch()