from typing import Tuple, Optional
import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter
from utils.annotate import annotate_with_boxes
from utils.models import (
    load_models,
    run_inference,
    CHECKPOINTS,
    pre_process_region_task_input,
    post_process_region_output,
)
from utils.tasks import (
    TASK_NAMES,
    TASKS,
    OBJECT_DETECTION_TASK_NAME,
    CAPTION_TASK_NAME,
    DETAILED_CAPTION_TASK_NAME,
    MORE_DETAILED_CAPTION_TASK_NAME,
    OCR_WITH_REGION_TASK_NAME,
    OCR_TASK_NAME,
    IMAGE_INPUT_TASK_NAMES,
    IMAGE_PROMPTER_INPUT_TASK_NAMES,
    IMAGE_OUTPUT_TASK_NAMES,
    TEXTBOX_OUTPUT_TASK_NAMES,
    IMAGE_TO_IMAGE_TASK_NAMES,
    IMAGE_TO_TEXT_TASK_NAMES,
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES,
    REGION_PROPOSAL_TASK_NAME,
    DENSE_REGION_CAPTION_TASK_NAME,
)

MARKDOWN = """
# Better Florence-2 Playground 🔥
<div>
<a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
</a>
<a href="https://blog.roboflow.com/florence-2/">
<img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
</a>
<a href="https://arxiv.org/abs/2311.06242">
<img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
</a>
<a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
<img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
</a>
</div>
Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
across tasks such as captioning, object detection, grounding, and segmentation.
The model takes images and task prompts as input, generating the desired results in
text format. It uses a DaViT vision encoder to convert images into visual token
embeddings. These are then concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""
EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]

# The Space runs on GPU hardware (see the @spaces.GPU decorator below), so a
# CUDA device is assumed rather than falling back to CPU.
DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)
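

# Models and processors are loaded once at startup; the @spaces.GPU decorator
# below requests GPU time for the duration of each call.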
@spaces.GPU
def process(
    checkpoint_dropdown: str,
    task_dropdown: str,
    image_input: Optional[Image.Image],
    image_prompter_input: Optional[dict]
) -> Tuple[Optional[Image.Image], Optional[str]]:
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
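    # Route by task family: annotated-image output, plain-text output, or
    # region prompts drawn on the image.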
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response,
            resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []
        image_input = image_prompter_input["image"]
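        # Each box drawn in the ImagePrompter becomes its own region prompt;
        # the per-prompt detections are merged into a single result below.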
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response,
                resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    # Defensive fallback for any task that matches none of the groups above.
    return None, None


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence-2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
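    # Clicking an example row runs `process` immediately (run_on_click=True).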
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )
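
    # Show or hide the input/output components so they match what the
    # selected task consumes and produces.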
    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]
    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )
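
# show_error surfaces exceptions in the UI; max_threads=1 limits concurrent
# requests so the single loaded model is not shared across threads.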
demo.launch(debug=False, show_error=True, max_threads=1)