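"""Better Florence-2 Playground: a Gradio Space demo for Microsoft's Florence-2 vision-language model."""
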
from typing import Tuple, Optional
import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter
from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS, \
    pre_process_region_task_input, post_process_region_output
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
    DENSE_REGION_CAPTION_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥
<div>
<a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
</a>
<a href="https://blog.roboflow.com/florence-2/">
<img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
</a>
<a href="https://arxiv.org/abs/2311.06242">
<img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
</a>
<a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
<img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
</a>
</div>
Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
across tasks such as captioning, object detection, grounding, and segmentation.
The model takes images and task prompts as input, generating the desired results in
text format. It uses a DaViT vision encoder to convert images into visual token
embeddings. These are then concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""

EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]
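
# ZeroGPU Spaces attach a GPU only while a @spaces.GPU-decorated function is
# running, so the device is pinned to "cuda" rather than detected at import
# time, when CUDA may not yet be visible.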
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)
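

# process() is the single entry point for every task: it returns an
# (image, text) pair in which only the element matching the selected task's
# output type is populated; the other is None.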
@spaces.GPU
def process(
    checkpoint_dropdown,
    task_dropdown,
    image_input,
    image_prompter_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]
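    # Region-prompt tasks take the boxes drawn in the ImagePrompter widget, run
    # the model once per drawn region, and merge the per-region detections.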
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []
        image_input = image_prompter_input["image"]
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    # Unknown task names fall through with empty outputs.
    return None, None
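

# UI: model and task dropdowns on top, image (or image-prompt) input plus submit
# button on the left, image and caption outputs on the right, clickable examples
# underneath.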
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence 2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )
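
    # Show only the input and output widgets that the selected task needs.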
    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]

    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
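
    # Run inference on click and route the (image, text) result to the two outputs.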
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)
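
# To run this demo outside the Space (a rough sketch; the exact dependency list
# lives in the Space's requirements.txt, which is not shown here), something like:
#   pip install gradio spaces torch supervision pillow gradio-image-prompter
#   python app.py
# The repo's local utils/ package must also sit next to this file.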